From 87e82a1cc1db6a29a824d025d1b16ed9ad6d4236 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 6 Nov 2025 00:18:53 -0600 Subject: [PATCH 01/15] refactored AttributeType utilities --- .../amber/core/tuple/AttributeTypeUtils.scala | 126 +++++++++ .../org/apache/amber/core/tuple/Schema.scala | 4 + .../org/apache/amber/core/tuple/Tuple.scala | 24 ++ .../aggregate/AggregationOperation.scala | 118 +------- .../sortPartitions/SortPartitionsOpExec.scala | 23 +- .../sort/StableMergeSortOpExecSpec.scala | 262 ++++++++---------- 6 files changed, 295 insertions(+), 262 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index e4fdcb4611d..6fe03c4969b 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -387,6 +387,132 @@ object AttributeTypeUtils extends Serializable { } } + /** Three-way compare for the given attribute type. + * Returns < 0 if left < right, > 0 if left > right, 0 if equal. + * Null semantics: null < non-null (both null => 0). + */ + @throws[UnsupportedOperationException] + def compare(left: Any, right: Any, attrType: AttributeType): Int = (left, right) match { + case (null, null) => 0 + case (null, _) => -1 + case (_, null) => 1 + case _ => + attrType match { + case AttributeType.INTEGER => + java.lang.Integer.compare( + left.asInstanceOf[Number].intValue(), + right.asInstanceOf[Number].intValue() + ) + case AttributeType.LONG => + java.lang.Long.compare( + left.asInstanceOf[Number].longValue(), + right.asInstanceOf[Number].longValue() + ) + case AttributeType.DOUBLE => + java.lang.Double.compare( + left.asInstanceOf[Number].doubleValue(), + right.asInstanceOf[Number].doubleValue() + ) // handles ±Inf/NaN per JDK + case AttributeType.BOOLEAN => + java.lang.Boolean.compare( + left.asInstanceOf[Boolean], + right.asInstanceOf[Boolean] + ) + case AttributeType.TIMESTAMP => + java.lang.Long.compare( + left.asInstanceOf[Timestamp].getTime, + right.asInstanceOf[Timestamp].getTime + ) + case AttributeType.STRING => + left.toString.compareTo(right.toString) + case AttributeType.BINARY => + java.util.Arrays.compareUnsigned( + left.asInstanceOf[Array[Byte]], + right.asInstanceOf[Array[Byte]] + ) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for compare: $attrType" + ) + } + } + + /** Type-aware addition (null is identity). 
*/ + @throws[UnsupportedOperationException] + def add(left: Object, right: Object, attrType: AttributeType): Object = { + if (left == null && right == null) return zeroValue(attrType) + if (left == null) return right + if (right == null) return left + + attrType match { + case AttributeType.INTEGER => + java.lang.Integer.valueOf( + left.asInstanceOf[Number].intValue() + right.asInstanceOf[Number].intValue() + ) + case AttributeType.LONG => + java.lang.Long.valueOf( + left.asInstanceOf[Number].longValue() + right.asInstanceOf[Number].longValue() + ) + case AttributeType.DOUBLE => + java.lang.Double.valueOf( + left.asInstanceOf[Number].doubleValue() + right.asInstanceOf[Number].doubleValue() + ) + case AttributeType.TIMESTAMP => + new Timestamp( + left.asInstanceOf[Timestamp].getTime + right.asInstanceOf[Timestamp].getTime + ) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for addition: $attrType" + ) + } + } + + /** Additive identity for supported numeric/timestamp types. + * For BINARY an empty array is returned as a benign identity value. + */ + @throws[UnsupportedOperationException] + def zeroValue(attrType: AttributeType): Object = attrType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(0) + case AttributeType.LONG => java.lang.Long.valueOf(0L) + case AttributeType.DOUBLE => java.lang.Double.valueOf(0.0d) + case AttributeType.TIMESTAMP => new Timestamp(0L) + case AttributeType.BINARY => Array.emptyByteArray + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for zero value: $attrType" + ) + } + + /** Maximum sentinel. */ + @throws[UnsupportedOperationException] + def maxValue(attrType: AttributeType): Object = attrType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MAX_VALUE) + case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MAX_VALUE) + case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MAX_VALUE) + case AttributeType.TIMESTAMP => new Timestamp(java.lang.Long.MAX_VALUE) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for max value: $attrType" + ) + } + + /** Minimum sentinel (note Double.MIN_VALUE is > 0). + * For BINARY under lexicographic order, the empty array is the global minimum. + */ + @throws[UnsupportedOperationException] + def minValue(attrType: AttributeType): Object = attrType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MIN_VALUE) + case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MIN_VALUE) + case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MIN_VALUE) + case AttributeType.TIMESTAMP => new Timestamp(0L) + case AttributeType.BINARY => Array.emptyByteArray + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for min value: $attrType" + ) + } + class AttributeTypeException(msg: String, cause: Throwable = null) extends IllegalArgumentException(msg, cause) {} } diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala index 0bdf84a9eba..5e207209578 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala @@ -191,6 +191,10 @@ case class Schema @JsonCreator() ( object Schema { + /** Build a Schema with (name, type) pairs, in order, rejecting duplicates. 
*/ + def of(attrs: (String, AttributeType)*): Schema = + attrs.foldLeft(Schema()) { case (acc, (name, tpe)) => acc.add(name, tpe) } + /** * Creates a Schema instance from a raw map representation. * Each entry in the map contains an attribute name and its type as strings. diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala index 14025e0f1f3..aa547bd701d 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala @@ -112,6 +112,30 @@ case class Tuple @JsonCreator() ( object Tuple { + /** Build a Tuple from (name -> value) pairs, coercing values to the schema types. */ + def of(schema: Schema, values: (String, Any)*): Tuple = { + val nameToValue: Map[String, Any] = values.toMap + val coercedFields: Array[Any] = + schema.getAttributes + .map { attribute => + val rawValue: Any = nameToValue.getOrElse(attribute.getName, null) + AttributeTypeUtils.parseField(rawValue, attribute.getType, force = true) + } + .toArray + Tuple(schema, coercedFields) + } + + /** Build a Tuple without coercion. + * Uses the builder’s runtime type checks; values must already match the schema’s field classes. + * Missing attributes (or unknown attribute names) will cause an error. + */ + def ofStrict(schema: Schema, values: (String, Any)*): Tuple = + values.foldLeft(Tuple.builder(schema)) { + case (builder, (attrName, value)) => + builder.add(schema.getAttribute(attrName), value) + }.build() + + /** * Validates that the provided attributes match the provided fields in type and order. * diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala index 8818d831e1c..7feafc80cf7 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala @@ -21,11 +21,9 @@ package org.apache.amber.operator.aggregate import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} -import org.apache.amber.core.tuple.AttributeTypeUtils.parseTimestamp -import org.apache.amber.core.tuple.{Attribute, AttributeType, Tuple} +import org.apache.amber.core.tuple.{Attribute, AttributeType, AttributeTypeUtils, Tuple} import org.apache.amber.operator.metadata.annotations.AutofillAttributeName -import java.sql.Timestamp import javax.validation.constraints.NotNull case class AveragePartialObj(sum: Double, count: Double) extends Serializable {} @@ -130,12 +128,12 @@ class AggregationOperation { ) } new DistributedAggregation[Object]( - () => zero(attributeType), + () => AttributeTypeUtils.zeroValue(attributeType), (partial, tuple) => { val value = tuple.getField[Object](attribute) - add(partial, value, attributeType) + AttributeTypeUtils.add(partial, value, attributeType) }, - (partial1, partial2) => add(partial1, partial2, attributeType), + (partial1, partial2) => AttributeTypeUtils.add(partial1, partial2, attributeType), partial => partial ) } @@ -190,15 +188,15 @@ class AggregationOperation { ) } new DistributedAggregation[Object]( - () => maxValue(attributeType), + () => 
AttributeTypeUtils.maxValue(attributeType), (partial, tuple) => { val value = tuple.getField[Object](attribute) - val comp = compare(value, partial, attributeType) + val comp = AttributeTypeUtils.compare(value, partial, attributeType) if (value != null && comp < 0) value else partial }, (partial1, partial2) => - if (compare(partial1, partial2, attributeType) < 0) partial1 else partial2, - partial => if (partial == maxValue(attributeType)) null else partial + if (AttributeTypeUtils.compare(partial1, partial2, attributeType) < 0) partial1 else partial2, + partial => if (partial == AttributeTypeUtils.maxValue(attributeType)) null else partial ) } @@ -214,15 +212,15 @@ class AggregationOperation { ) } new DistributedAggregation[Object]( - () => minValue(attributeType), + () => AttributeTypeUtils.minValue(attributeType), (partial, tuple) => { val value = tuple.getField[Object](attribute) - val comp = compare(value, partial, attributeType) + val comp = AttributeTypeUtils.compare(value, partial, attributeType) if (value != null && comp > 0) value else partial }, (partial1, partial2) => - if (compare(partial1, partial2, attributeType) > 0) partial1 else partial2, - partial => if (partial == maxValue(attributeType)) null else partial + if (AttributeTypeUtils.compare(partial1, partial2, attributeType) > 0) partial1 else partial2, + partial => if (partial == AttributeTypeUtils.maxValue(attributeType)) null else partial ) } @@ -232,7 +230,7 @@ class AggregationOperation { return None if (tuple.getSchema.getAttribute(attribute).getType == AttributeType.TIMESTAMP) - Option(parseTimestamp(value.toString).getTime.toDouble) + Option(AttributeTypeUtils.parseTimestamp(value.toString).getTime.toDouble) else Option(value.toString.toDouble) } @@ -254,94 +252,4 @@ class AggregationOperation { } ) } - - // return a.compare(b), - // < 0 if a < b, - // > 0 if a > b, - // 0 if a = b - private def compare(a: Object, b: Object, attributeType: AttributeType): Int = { - if (a == null && b == null) { - return 0 - } else if (a == null) { - return -1 - } else if (b == null) { - return 1 - } - attributeType match { - case AttributeType.INTEGER => a.asInstanceOf[Integer].compareTo(b.asInstanceOf[Integer]) - case AttributeType.DOUBLE => - a.asInstanceOf[java.lang.Double].compareTo(b.asInstanceOf[java.lang.Double]) - case AttributeType.LONG => - a.asInstanceOf[java.lang.Long].compareTo(b.asInstanceOf[java.lang.Long]) - case AttributeType.TIMESTAMP => - a.asInstanceOf[Timestamp].getTime.compareTo(b.asInstanceOf[Timestamp].getTime) - case _ => - throw new UnsupportedOperationException( - "Unsupported attribute type for comparison: " + attributeType - ) - } - } - - private def add(a: Object, b: Object, attributeType: AttributeType): Object = { - if (a == null && b == null) { - return zero(attributeType) - } else if (a == null) { - return b - } else if (b == null) { - return a - } - attributeType match { - case AttributeType.INTEGER => - Integer.valueOf(a.asInstanceOf[Integer] + b.asInstanceOf[Integer]) - case AttributeType.DOUBLE => - java.lang.Double.valueOf( - a.asInstanceOf[java.lang.Double] + b.asInstanceOf[java.lang.Double] - ) - case AttributeType.LONG => - java.lang.Long.valueOf(a.asInstanceOf[java.lang.Long] + b.asInstanceOf[java.lang.Long]) - case AttributeType.TIMESTAMP => - new Timestamp(a.asInstanceOf[Timestamp].getTime + b.asInstanceOf[Timestamp].getTime) - case _ => - throw new UnsupportedOperationException( - "Unsupported attribute type for addition: " + attributeType - ) - } - } - - private def zero(attributeType: 
AttributeType): Object = - attributeType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(0) - case AttributeType.DOUBLE => java.lang.Double.valueOf(0) - case AttributeType.LONG => java.lang.Long.valueOf(0) - case AttributeType.TIMESTAMP => new Timestamp(0) - case _ => - throw new UnsupportedOperationException( - "Unsupported attribute type for zero value: " + attributeType - ) - } - - private def maxValue(attributeType: AttributeType): Object = - attributeType match { - case AttributeType.INTEGER => Integer.MAX_VALUE.asInstanceOf[Object] - case AttributeType.DOUBLE => java.lang.Double.MAX_VALUE.asInstanceOf[Object] - case AttributeType.LONG => java.lang.Long.MAX_VALUE.asInstanceOf[Object] - case AttributeType.TIMESTAMP => new Timestamp(java.lang.Long.MAX_VALUE) - case _ => - throw new UnsupportedOperationException( - "Unsupported attribute type for max value: " + attributeType - ) - } - - private def minValue(attributeType: AttributeType): Object = - attributeType match { - case AttributeType.INTEGER => Integer.MIN_VALUE.asInstanceOf[Object] - case AttributeType.DOUBLE => java.lang.Double.MIN_VALUE.asInstanceOf[Object] - case AttributeType.LONG => java.lang.Long.MIN_VALUE.asInstanceOf[Object] - case AttributeType.TIMESTAMP => new Timestamp(0) - case _ => - throw new UnsupportedOperationException( - "Unsupported attribute type for min value: " + attributeType - ) - } - } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala index ac6a9da59ce..0ec5bce5184 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala @@ -20,7 +20,7 @@ package org.apache.amber.operator.sortPartitions import org.apache.amber.core.executor.OperatorExecutor -import org.apache.amber.core.tuple.{AttributeType, Tuple, TupleLike} +import org.apache.amber.core.tuple.{AttributeType, AttributeTypeUtils, Tuple, TupleLike} import org.apache.amber.util.JSONUtils.objectMapper import scala.collection.mutable.ArrayBuffer @@ -47,18 +47,11 @@ class SortPartitionsOpExec(descString: String) extends OperatorExecutor { override def onFinish(port: Int): Iterator[TupleLike] = sortTuples() - private def compareTuples(t1: Tuple, t2: Tuple): Boolean = { - val attributeType = t1.getSchema.getAttribute(desc.sortAttributeName).getType - val attributeIndex = t1.getSchema.getIndex(desc.sortAttributeName) - attributeType match { - case AttributeType.LONG => - t1.getField[Long](attributeIndex) < t2.getField[Long](attributeIndex) - case AttributeType.INTEGER => - t1.getField[Int](attributeIndex) < t2.getField[Int](attributeIndex) - case AttributeType.DOUBLE => - t1.getField[Double](attributeIndex) < t2.getField[Double](attributeIndex) - case _ => - true // unsupported type - } - } + private def compareTuples(t1: Tuple, t2: Tuple): Boolean = + AttributeTypeUtils.compare( + t1.getField[Any](t1.getSchema.getIndex(desc.sortAttributeName)), + t2.getField[Any](t2.getSchema.getIndex(desc.sortAttributeName)), + t1.getSchema.getAttribute(desc.sortAttributeName).getType + ) < 0 + } diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala 
b/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala index ecb38cfff4f..c0d237795c0 100644 --- a/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala @@ -51,28 +51,6 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // Helpers // =========================================================================== - /** Build a Schema with (name, type) pairs, in-order. */ - private def schemaOf(attributes: (String, AttributeType)*): Schema = { - attributes.foldLeft(Schema()) { - case (acc, (name, attrType)) => acc.add(new Attribute(name, attrType)) - } - } - - /** - * Construct a Tuple for the provided schema. - * - * @param values map-like varargs: "colName" -> value. Must provide every column. - * @throws NoSuchElementException if a provided key is not in the schema. - */ - private def tupleOf(schema: Schema, values: (String, Any)*): Tuple = { - val valueMap = values.toMap - val builder = Tuple.builder(schema) - schema.getAttributeNames.asJava.forEach { name => - builder.add(schema.getAttribute(name), valueMap(name)) - } - builder.build() - } - /** Convenience builder for a single sort key with direction (ASC by default). */ private def sortKey( attribute: String, @@ -128,13 +106,13 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== "StableMergeSortOpExec" should "sort integers ascending and preserve duplicate order" in { - val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List( - tupleOf(schema, "value" -> 3, "label" -> "a"), - tupleOf(schema, "value" -> 1, "label" -> "first-1"), - tupleOf(schema, "value" -> 2, "label" -> "b"), - tupleOf(schema, "value" -> 1, "label" -> "first-2"), - tupleOf(schema, "value" -> 3, "label" -> "c") + Tuple.of(schema, "value" -> 3, "label" -> "a"), + Tuple.of(schema, "value" -> 1, "label" -> "first-1"), + Tuple.of(schema, "value" -> 2, "label" -> "b"), + Tuple.of(schema, "value" -> 1, "label" -> "first-2"), + Tuple.of(schema, "value" -> 3, "label" -> "c") ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.map(_.getField[Int]("value")) == List(1, 1, 2, 3, 3)) @@ -144,12 +122,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort integers descending while preserving stability" in { - val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List( - tupleOf(schema, "value" -> 2, "label" -> "first"), - tupleOf(schema, "value" -> 2, "label" -> "second"), - tupleOf(schema, "value" -> 1, "label" -> "third"), - tupleOf(schema, "value" -> 3, "label" -> "fourth") + Tuple.of(schema, "value" -> 2, "label" -> "first"), + Tuple.of(schema, "value" -> 2, "label" -> "second"), + Tuple.of(schema, "value" -> 1, "label" -> "third"), + Tuple.of(schema, "value" -> 3, "label" -> "fourth") ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value", SortPreference.DESC)) @@ -161,12 +139,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "handle string ordering (case-sensitive)" in { - 
val schema = schemaOf("name" -> AttributeType.STRING) + val schema = Schema.of("name" -> AttributeType.STRING) val tuples = List( - tupleOf(schema, "name" -> "apple"), - tupleOf(schema, "name" -> "Banana"), - tupleOf(schema, "name" -> "banana"), - tupleOf(schema, "name" -> "APPLE") + Tuple.of(schema, "name" -> "apple"), + Tuple.of(schema, "name" -> "Banana"), + Tuple.of(schema, "name" -> "banana"), + Tuple.of(schema, "name" -> "APPLE") ) val sorted = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("name", SortPreference.ASC)) @@ -175,35 +153,35 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "order ASCII strings by Java compareTo (punctuation < digits < uppercase < lowercase)" in { - val schema = schemaOf("str" -> AttributeType.STRING) - val tuples = List("a", "A", "0", "~", "!").map(s => tupleOf(schema, "str" -> s)) + val schema = Schema.of("str" -> AttributeType.STRING) + val tuples = List("a", "A", "0", "~", "!").map(s => Tuple.of(schema, "str" -> s)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("str")) } assert(result.map(_.getField[String]("str")) == List("!", "0", "A", "a", "~")) } it should "sort negatives and zeros correctly" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) - val tuples = List(0, -1, -10, 5, -3, 2).map(v => tupleOf(schema, "value" -> v)) + val schema = Schema.of("value" -> AttributeType.INTEGER) + val tuples = List(0, -1, -10, 5, -3, 2).map(v => Tuple.of(schema, "value" -> v)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.map(_.getField[Int]("value")) == List(-10, -3, -1, 0, 2, 5)) } it should "sort LONG values ascending" in { - val schema = schemaOf("id" -> AttributeType.LONG) - val tuples = List(5L, 1L, 3L, 9L, 0L).map(v => tupleOf(schema, "id" -> v)) + val schema = Schema.of("id" -> AttributeType.LONG) + val tuples = List(5L, 1L, 3L, 9L, 0L).map(v => Tuple.of(schema, "id" -> v)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("id")) } assert(result.map(_.getField[Long]("id")) == List(0L, 1L, 3L, 5L, 9L)) } it should "sort TIMESTAMP ascending" in { - val schema = schemaOf("timestamp" -> AttributeType.TIMESTAMP) + val schema = Schema.of("timestamp" -> AttributeType.TIMESTAMP) val base = Timestamp.valueOf("2022-01-01 00:00:00") val tuples = List( new Timestamp(base.getTime + 4000), new Timestamp(base.getTime + 1000), new Timestamp(base.getTime + 3000), new Timestamp(base.getTime + 2000) - ).map(ts => tupleOf(schema, "timestamp" -> ts)) + ).map(ts => Tuple.of(schema, "timestamp" -> ts)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("timestamp", SortPreference.ASC)) } @@ -212,14 +190,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort TIMESTAMP descending" in { - val schema = schemaOf("timestamp" -> AttributeType.TIMESTAMP) + val schema = Schema.of("timestamp" -> AttributeType.TIMESTAMP) val base = Timestamp.valueOf("2023-01-01 00:00:00") val tuples = List( new Timestamp(base.getTime + 3000), base, new Timestamp(base.getTime + 1000), new Timestamp(base.getTime + 2000) - ).map(ts => tupleOf(schema, "timestamp" -> ts)) + ).map(ts => Tuple.of(schema, "timestamp" -> ts)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("timestamp", SortPreference.DESC)) } @@ -228,15 +206,15 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "treat numeric strings as strings (lexicographic 
ordering)" in { - val schema = schemaOf("str" -> AttributeType.STRING) - val tuples = List("2", "10", "1", "11", "20").map(s => tupleOf(schema, "str" -> s)) + val schema = Schema.of("str" -> AttributeType.STRING) + val tuples = List("2", "10", "1", "11", "20").map(s => Tuple.of(schema, "str" -> s)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("str")) } assert(result.map(_.getField[String]("str")) == List("1", "10", "11", "2", "20")) } it should "sort BOOLEAN ascending (false < true) and descending" in { - val schema = schemaOf("bool" -> AttributeType.BOOLEAN) - val tuples = List(true, false, true, false).map(v => tupleOf(schema, "bool" -> v)) + val schema = Schema.of("bool" -> AttributeType.BOOLEAN) + val tuples = List(true, false, true, false).map(v => Tuple.of(schema, "bool" -> v)) val asc = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("bool", SortPreference.ASC)) } @@ -248,7 +226,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort BINARY ascending (unsigned lexicographic) incl. empty and high-bit bytes" in { - val schema = schemaOf("bin" -> AttributeType.BINARY) + val schema = Schema.of("bin" -> AttributeType.BINARY) val bytesEmpty = Array[Byte]() // [] val bytes00 = Array(0x00.toByte) // [00] @@ -259,7 +237,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { val bytesFF = Array(0xff.toByte) // [FF] (-1) val inputTuples = List(bytes80, bytes0000, bytesEmpty, bytesFF, bytes0001, bytes00, bytes7F) - .map(arr => tupleOf(schema, "bin" -> arr)) + .map(arr => Tuple.of(schema, "bin" -> arr)) val sorted = runStableMergeSort(schema, inputTuples) { _.keys = sortKeysBuffer(sortKey("bin")) } @@ -276,10 +254,10 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "sort DOUBLE values including -0.0, 0.0, infinities and NaN" in { - val schema = schemaOf("x" -> AttributeType.DOUBLE) + val schema = Schema.of("x" -> AttributeType.DOUBLE) val tuples = List(Double.NaN, Double.PositiveInfinity, 1.5, -0.0, 0.0, -3.2, Double.NegativeInfinity) - .map(v => tupleOf(schema, "x" -> v)) + .map(v => Tuple.of(schema, "x" -> v)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("x")) } @@ -294,14 +272,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "place NaN before null when sorting DOUBLE ascending (nulls last policy)" in { - val schema = schemaOf("x" -> AttributeType.DOUBLE) + val schema = Schema.of("x" -> AttributeType.DOUBLE) val tuples = List( - tupleOf(schema, "x" -> null), - tupleOf(schema, "x" -> Double.NaN), - tupleOf(schema, "x" -> Double.NegativeInfinity), - tupleOf(schema, "x" -> 1.0), - tupleOf(schema, "x" -> Double.PositiveInfinity), - tupleOf(schema, "x" -> null) + Tuple.of(schema, "x" -> null), + Tuple.of(schema, "x" -> Double.NaN), + Tuple.of(schema, "x" -> Double.NegativeInfinity), + Tuple.of(schema, "x" -> 1.0), + Tuple.of(schema, "x" -> Double.PositiveInfinity), + Tuple.of(schema, "x" -> null) ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("x")) } val values = result.map(_.getField[java.lang.Double]("x")) @@ -314,12 +292,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "place nulls last regardless of ascending or descending" in { - val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> 
AttributeType.STRING) val tuples = List( - tupleOf(schema, "value" -> null, "label" -> "null-1"), - tupleOf(schema, "value" -> 5, "label" -> "five"), - tupleOf(schema, "value" -> null, "label" -> "null-2"), - tupleOf(schema, "value" -> 3, "label" -> "three") + Tuple.of(schema, "value" -> null, "label" -> "null-1"), + Tuple.of(schema, "value" -> 5, "label" -> "five"), + Tuple.of(schema, "value" -> null, "label" -> "null-2"), + Tuple.of(schema, "value" -> 3, "label" -> "three") ) val asc = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value", SortPreference.ASC)) @@ -333,20 +311,20 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "order NaN highest on secondary DESC but still place nulls last" in { - val schema = schemaOf( + val schema = Schema.of( "group" -> AttributeType.STRING, "score" -> AttributeType.DOUBLE, "label" -> AttributeType.STRING ) val tuples = List( - tupleOf(schema, "group" -> "A", "score" -> java.lang.Double.NaN, "label" -> "nan"), - tupleOf(schema, "group" -> "A", "score" -> Double.PositiveInfinity, "label" -> "pinf"), - tupleOf(schema, "group" -> "A", "score" -> 1.0, "label" -> "one"), - tupleOf(schema, "group" -> "A", "score" -> 0.0, "label" -> "zero"), - tupleOf(schema, "group" -> "A", "score" -> -1.0, "label" -> "neg"), - tupleOf(schema, "group" -> "A", "score" -> Double.NegativeInfinity, "label" -> "ninf"), - tupleOf(schema, "group" -> "A", "score" -> null, "label" -> "null-1"), - tupleOf(schema, "group" -> "A", "score" -> null, "label" -> "null-2") + Tuple.of(schema, "group" -> "A", "score" -> java.lang.Double.NaN, "label" -> "nan"), + Tuple.of(schema, "group" -> "A", "score" -> Double.PositiveInfinity, "label" -> "pinf"), + Tuple.of(schema, "group" -> "A", "score" -> 1.0, "label" -> "one"), + Tuple.of(schema, "group" -> "A", "score" -> 0.0, "label" -> "zero"), + Tuple.of(schema, "group" -> "A", "score" -> -1.0, "label" -> "neg"), + Tuple.of(schema, "group" -> "A", "score" -> Double.NegativeInfinity, "label" -> "ninf"), + Tuple.of(schema, "group" -> "A", "score" -> null, "label" -> "null-1"), + Tuple.of(schema, "group" -> "A", "score" -> null, "label" -> "null-2") ) val result = runStableMergeSort(schema, tuples) { desc => desc.keys = @@ -359,20 +337,20 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort BINARY descending with nulls last and preserve stability for equal byte arrays" in { - val schema = schemaOf("bin" -> AttributeType.BINARY, "id" -> AttributeType.STRING) + val schema = Schema.of("bin" -> AttributeType.BINARY, "id" -> AttributeType.STRING) val key00 = Array(0x00.toByte) val keyFF = Array(0xff.toByte) val inputTuples = List( - tupleOf(schema, "bin" -> keyFF, "id" -> "ff-1"), - tupleOf(schema, "bin" -> key00, "id" -> "00-1"), - tupleOf( + Tuple.of(schema, "bin" -> keyFF, "id" -> "ff-1"), + Tuple.of(schema, "bin" -> key00, "id" -> "00-1"), + Tuple.of( schema, "bin" -> key00, "id" -> "00-2" ), // equal to previous; stability should keep order - tupleOf(schema, "bin" -> null, "id" -> "null-1") + Tuple.of(schema, "bin" -> null, "id" -> "null-1") ) val sorted = runStableMergeSort(schema, inputTuples) { @@ -387,7 +365,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "support multi-key sorting with mixed attribute types" in { - val schema = schemaOf( + val schema = Schema.of( "dept" -> AttributeType.STRING, "score" -> AttributeType.DOUBLE, "name" -> AttributeType.STRING, @@ -395,29 +373,29 @@ 
class StableMergeSortOpExecSpec extends AnyFlatSpec { ) val base = new Timestamp(Timestamp.valueOf("2020-01-01 00:00:00").getTime) val tuples = List( - tupleOf(schema, "dept" -> "Sales", "score" -> 9.5, "name" -> "Alice", "hired" -> base), - tupleOf( + Tuple.of(schema, "dept" -> "Sales", "score" -> 9.5, "name" -> "Alice", "hired" -> base), + Tuple.of( schema, "dept" -> "Sales", "score" -> 9.5, "name" -> "Bob", "hired" -> new Timestamp(base.getTime + 1000) ), - tupleOf( + Tuple.of( schema, "dept" -> "Sales", "score" -> 8.0, "name" -> "Carol", "hired" -> new Timestamp(base.getTime + 2000) ), - tupleOf( + Tuple.of( schema, "dept" -> "Engineering", "score" -> 9.5, "name" -> "Dave", "hired" -> new Timestamp(base.getTime + 3000) ), - tupleOf( + Tuple.of( schema, "dept" -> null, "score" -> 9.5, @@ -436,7 +414,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "handle multi-key with descending primary and ascending secondary" in { - val schema = schemaOf( + val schema = Schema.of( "major" -> AttributeType.INTEGER, "minor" -> AttributeType.INTEGER, "idx" -> AttributeType.INTEGER @@ -449,7 +427,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { (1, 1, 4), (3, 0, 5), (3, 2, 6) - ).map { case (ma, mi, i) => tupleOf(schema, "major" -> ma, "minor" -> mi, "idx" -> i) } + ).map { case (ma, mi, i) => Tuple.of(schema, "major" -> ma, "minor" -> mi, "idx" -> i) } val result = runStableMergeSort(schema, tuples) { desc => desc.keys = sortKeysBuffer(sortKey("major", SortPreference.DESC), sortKey("minor", SortPreference.ASC)) @@ -463,7 +441,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "use the third key as a tiebreaker (ASC, ASC, then DESC)" in { - val schema = schemaOf( + val schema = Schema.of( "keyA" -> AttributeType.INTEGER, "keyB" -> AttributeType.INTEGER, "keyC" -> AttributeType.INTEGER, @@ -475,7 +453,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { (1, 1, 2, "x2"), (1, 0, 9, "y9") ).map { - case (a, b, c, id) => tupleOf(schema, "keyA" -> a, "keyB" -> b, "keyC" -> c, "id" -> id) + case (a, b, c, id) => Tuple.of(schema, "keyA" -> a, "keyB" -> b, "keyC" -> c, "id" -> id) } val result = runStableMergeSort(schema, tuples) { _.keys = @@ -485,7 +463,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "place nulls last across multiple keys (primary ASC, secondary DESC)" in { - val schema = schemaOf("keyA" -> AttributeType.STRING, "keyB" -> AttributeType.INTEGER) + val schema = Schema.of("keyA" -> AttributeType.STRING, "keyB" -> AttributeType.INTEGER) val tuples = List( ("x", 2), (null, 1), @@ -493,7 +471,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { (null, 5), ("a", 9), ("a", 2) - ).map { case (s, i) => tupleOf(schema, "keyA" -> s, "keyB" -> i) } + ).map { case (s, i) => Tuple.of(schema, "keyA" -> s, "keyB" -> i) } val result = runStableMergeSort(schema, tuples) { desc => desc.keys = sortKeysBuffer(sortKey("keyA", SortPreference.ASC), sortKey("keyB", SortPreference.DESC)) @@ -503,16 +481,16 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "when primary keys are both null, fall back to secondary ASC (nulls still after non-nulls)" in { - val schema = schemaOf( + val schema = Schema.of( "keyA" -> AttributeType.STRING, "keyB" -> AttributeType.INTEGER, "id" -> AttributeType.STRING ) val tuples = List( - tupleOf(schema, "keyA" -> "A", "keyB" -> 2, "id" -> "non-null-a"), - tupleOf(schema, "keyA" -> null, "keyB" -> 5, "id" -> "null-a-5"), - tupleOf(schema, "keyA" -> null, "keyB" -> 1, "id" -> "null-a-1"), - 
tupleOf(schema, "keyA" -> "B", "keyB" -> 9, "id" -> "non-null-b") + Tuple.of(schema, "keyA" -> "A", "keyB" -> 2, "id" -> "non-null-a"), + Tuple.of(schema, "keyA" -> null, "keyB" -> 5, "id" -> "null-a-5"), + Tuple.of(schema, "keyA" -> null, "keyB" -> 1, "id" -> "null-a-1"), + Tuple.of(schema, "keyA" -> "B", "keyB" -> 9, "id" -> "non-null-b") ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("keyA"), sortKey("keyB")) @@ -524,7 +502,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "use INTEGER secondary key to break ties when primary BINARY keys are equal" in { - val schema = schemaOf( + val schema = Schema.of( "bin" -> AttributeType.BINARY, "score" -> AttributeType.INTEGER, "label" -> AttributeType.STRING @@ -534,9 +512,9 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { val key01 = Array(0x01.toByte) val inputTuples = List( - tupleOf(schema, "bin" -> key01, "score" -> 1, "label" -> "01-score1"), - tupleOf(schema, "bin" -> key00, "score" -> 9, "label" -> "00-score9"), - tupleOf(schema, "bin" -> key01, "score" -> 2, "label" -> "01-score2") + Tuple.of(schema, "bin" -> key01, "score" -> 1, "label" -> "01-score1"), + Tuple.of(schema, "bin" -> key00, "score" -> 9, "label" -> "00-score9"), + Tuple.of(schema, "bin" -> key01, "score" -> 2, "label" -> "01-score2") ) val sorted = runStableMergeSort(schema, inputTuples) { desc => @@ -554,8 +532,8 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "preserve original order among tuples with equal keys" in { - val schema = schemaOf("key" -> AttributeType.INTEGER, "index" -> AttributeType.INTEGER) - val tuples = (0 until 100).map(i => tupleOf(schema, "key" -> (i % 5), "index" -> i)) + val schema = Schema.of("key" -> AttributeType.INTEGER, "index" -> AttributeType.INTEGER) + val tuples = (0 until 100).map(i => Tuple.of(schema, "key" -> (i % 5), "index" -> i)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("key")) } val grouped = result.groupBy(_.getField[Int]("key")).values grouped.foreach { group => @@ -565,9 +543,9 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "act as a stable pass-through when keys are empty" in { - val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List(3, 1, 4, 1, 5, 9).zipWithIndex - .map { case (v, i) => tupleOf(schema, "value" -> v, "label" -> s"row-$i") } + .map { case (v, i) => Tuple.of(schema, "value" -> v, "label" -> s"row-$i") } val result = runStableMergeSort(schema, tuples) { desc => desc.keys = ListBuffer.empty[SortCriteriaUnit] } @@ -578,8 +556,8 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "buffer tuples until onFinish is called" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) - val tuple = tupleOf(schema, "value" -> 2) + val schema = Schema.of("value" -> AttributeType.INTEGER) + val tuple = Tuple.of(schema, "value" -> 2) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() @@ -591,22 +569,22 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "return empty for empty input" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = Schema.of("value" -> AttributeType.INTEGER) val 
result = runStableMergeSort(schema, Seq.empty) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.isEmpty) } it should "handle single element input" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) - val result = runStableMergeSort(schema, Seq(tupleOf(schema, "value" -> 42))) { + val schema = Schema.of("value" -> AttributeType.INTEGER) + val result = runStableMergeSort(schema, Seq(Tuple.of(schema, "value" -> 42))) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.map(_.getField[Int]("value")) == List(42)) } it should "sort large inputs efficiently (sanity on boundaries)" in { - val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) - val tuples = (50000 to 1 by -1).map(i => tupleOf(schema, "value" -> i, "label" -> s"row-$i")) + val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val tuples = (50000 to 1 by -1).map(i => Tuple.of(schema, "value" -> i, "label" -> s"row-$i")) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.head.getField[Int]("value") == 1) assert(result(1).getField[Int]("value") == 2) @@ -618,14 +596,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "merge incrementally: bucket sizes match binary decomposition after each push" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = Schema.of("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() val totalCount = 64 for (index <- (totalCount - 1) to 0 by -1) { - exec.processTuple(tupleOf(schema, "value" -> index), 0) + exec.processTuple(Tuple.of(schema, "value" -> index), 0) val sizes = getBucketSizes(exec).sorted assert(sizes == binaryDecomposition(totalCount - index)) } @@ -634,7 +612,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "maintain bucket-stack invariant (no adjacent equal sizes) after each insertion" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = Schema.of("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() @@ -642,7 +620,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { val totalCount = 200 val stream = (0 until totalCount by 2) ++ (1 until totalCount by 2) stream.foreach { index => - exec.processTuple(tupleOf(schema, "value" -> (totalCount - 1 - index)), 0) + exec.processTuple(Tuple.of(schema, "value" -> (totalCount - 1 - index)), 0) val sizes = getBucketSizes(exec) sizes.sliding(2).foreach { pair => if (pair.length == 2) assert(pair.head != pair.last) @@ -653,12 +631,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "form expected bucket sizes at milestones (1,2,3,4,7,8,15,16)" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = Schema.of("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() - val inputSequence = (100 to 1 by -1).map(i => tupleOf(schema, "value" -> i)) + val inputSequence = (100 to 1 by -1).map(i => Tuple.of(schema, "value" -> i)) val milestones = 
Set(1, 2, 3, 4, 7, 8, 15, 16) var pushed = 0 inputSequence.foreach { t => @@ -677,20 +655,20 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== "mergeSortedBuckets" should "be stable: left bucket wins on equal keys" in { - val schema = schemaOf("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) + val schema = Schema.of("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("key")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() // Seed to resolve schema/keys once. - exec.processTuple(tupleOf(schema, "key" -> 0, "id" -> "seed"), 0) + exec.processTuple(Tuple.of(schema, "key" -> 0, "id" -> "seed"), 0) val left = ArrayBuffer( - tupleOf(schema, "key" -> 1, "id" -> "L1"), - tupleOf(schema, "key" -> 2, "id" -> "L2") + Tuple.of(schema, "key" -> 1, "id" -> "L1"), + Tuple.of(schema, "key" -> 2, "id" -> "L2") ) val right = ArrayBuffer( - tupleOf(schema, "key" -> 1, "id" -> "R1"), - tupleOf(schema, "key" -> 3, "id" -> "R3") + Tuple.of(schema, "key" -> 1, "id" -> "R1"), + Tuple.of(schema, "key" -> 3, "id" -> "R3") ) val merged = exec.mergeSortedBuckets(left, right) @@ -700,15 +678,15 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } "mergeSortedBuckets" should "handle empty left bucket" in { - val schema = schemaOf("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) + val schema = Schema.of("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("key")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - exec.processTuple(tupleOf(schema, "key" -> 0, "id" -> "seed"), 0) // seed keys + exec.processTuple(Tuple.of(schema, "key" -> 0, "id" -> "seed"), 0) // seed keys val left = ArrayBuffer.empty[Tuple] val right = ArrayBuffer( - tupleOf(schema, "key" -> 1, "id" -> "r1"), - tupleOf(schema, "key" -> 2, "id" -> "r2") + Tuple.of(schema, "key" -> 1, "id" -> "r1"), + Tuple.of(schema, "key" -> 2, "id" -> "r2") ) val merged = exec.mergeSortedBuckets(left, right) assert(merged.map(_.getField[String]("id")).toList == List("r1", "r2")) @@ -716,14 +694,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } "mergeSortedBuckets" should "handle empty right bucket" in { - val schema = schemaOf("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) + val schema = Schema.of("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("key")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - exec.processTuple(tupleOf(schema, "key" -> 0, "id" -> "seed"), 0) + exec.processTuple(Tuple.of(schema, "key" -> 0, "id" -> "seed"), 0) val left = ArrayBuffer( - tupleOf(schema, "key" -> 1, "id" -> "l1"), - tupleOf(schema, "key" -> 2, "id" -> "l2") + Tuple.of(schema, "key" -> 1, "id" -> "l1"), + Tuple.of(schema, "key" -> 2, "id" -> "l2") ) val right = ArrayBuffer.empty[Tuple] val merged = exec.mergeSortedBuckets(left, right) @@ -736,16 +714,16 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== "pushBucketAndCombine" should "merge two size-2 buckets into size-4 on push (with existing size-1 seed)" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = 
Schema.of("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() // seed to compile keys -> results in one size-1 bucket in the stack - exec.processTuple(tupleOf(schema, "value" -> 0), 0) + exec.processTuple(Tuple.of(schema, "value" -> 0), 0) // two pre-sorted buckets of size 2 - val bucket1 = ArrayBuffer(tupleOf(schema, "value" -> 1), tupleOf(schema, "value" -> 3)) - val bucket2 = ArrayBuffer(tupleOf(schema, "value" -> 2), tupleOf(schema, "value" -> 4)) + val bucket1 = ArrayBuffer(Tuple.of(schema, "value" -> 1), Tuple.of(schema, "value" -> 3)) + val bucket2 = ArrayBuffer(Tuple.of(schema, "value" -> 2), Tuple.of(schema, "value" -> 4)) exec.pushBucketAndCombine(bucket1) // sizes now [1,2] exec.pushBucketAndCombine(bucket2) // equal top [2,2] => merged to 4; sizes [1,4] @@ -756,10 +734,10 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "return the same sorted output if onFinish is called twice in a row" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = Schema.of("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - List(3, 1, 2).foreach(i => exec.processTuple(tupleOf(schema, "value" -> i), 0)) + List(3, 1, 2).foreach(i => exec.processTuple(Tuple.of(schema, "value" -> i), 0)) val first = exec.onFinish(0).map(_.asInstanceOf[Tuple]).toList.map(_.getField[Int]("value")) val second = exec.onFinish(0).map(_.asInstanceOf[Tuple]).toList.map(_.getField[Int]("value")) @@ -769,10 +747,10 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "have processTuple always return empty iterators until finish" in { - val schema = schemaOf("value" -> AttributeType.INTEGER) + val schema = Schema.of("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - val immediates = (10 to 1 by -1).map(i => exec.processTuple(tupleOf(schema, "value" -> i), 0)) + val immediates = (10 to 1 by -1).map(i => exec.processTuple(Tuple.of(schema, "value" -> i), 0)) assert(immediates.forall(_.isEmpty)) val out = exec.onFinish(0).map(_.asInstanceOf[Tuple]).toList.map(_.getField[Int]("value")) assert(out == (1 to 10).toList) From ea3518f6a338bbb400c355611aa59d9139b6f1f9 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 6 Nov 2025 00:52:00 -0600 Subject: [PATCH 02/15] added formatting --- .../amber/core/tuple/AttributeTypeUtils.scala | 170 +++++++++--------- .../org/apache/amber/core/tuple/Tuple.scala | 27 ++- .../aggregate/AggregationOperation.scala | 6 +- .../sortPartitions/SortPartitionsOpExec.scala | 2 +- .../sort/StableMergeSortOpExecSpec.scala | 3 +- 5 files changed, 106 insertions(+), 102 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index 6fe03c4969b..6db2f34c7fd 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -388,54 +388,55 @@ object AttributeTypeUtils extends Serializable { } /** 
Three-way compare for the given attribute type. - * Returns < 0 if left < right, > 0 if left > right, 0 if equal. - * Null semantics: null < non-null (both null => 0). - */ + * Returns < 0 if left < right, > 0 if left > right, 0 if equal. + * Null semantics: null < non-null (both null => 0). + */ @throws[UnsupportedOperationException] - def compare(left: Any, right: Any, attrType: AttributeType): Int = (left, right) match { - case (null, null) => 0 - case (null, _) => -1 - case (_, null) => 1 - case _ => - attrType match { - case AttributeType.INTEGER => - java.lang.Integer.compare( - left.asInstanceOf[Number].intValue(), - right.asInstanceOf[Number].intValue() - ) - case AttributeType.LONG => - java.lang.Long.compare( - left.asInstanceOf[Number].longValue(), - right.asInstanceOf[Number].longValue() - ) - case AttributeType.DOUBLE => - java.lang.Double.compare( - left.asInstanceOf[Number].doubleValue(), - right.asInstanceOf[Number].doubleValue() - ) // handles ±Inf/NaN per JDK - case AttributeType.BOOLEAN => - java.lang.Boolean.compare( - left.asInstanceOf[Boolean], - right.asInstanceOf[Boolean] - ) - case AttributeType.TIMESTAMP => - java.lang.Long.compare( - left.asInstanceOf[Timestamp].getTime, - right.asInstanceOf[Timestamp].getTime - ) - case AttributeType.STRING => - left.toString.compareTo(right.toString) - case AttributeType.BINARY => - java.util.Arrays.compareUnsigned( - left.asInstanceOf[Array[Byte]], - right.asInstanceOf[Array[Byte]] - ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for compare: $attrType" - ) - } - } + def compare(left: Any, right: Any, attrType: AttributeType): Int = + (left, right) match { + case (null, null) => 0 + case (null, _) => -1 + case (_, null) => 1 + case _ => + attrType match { + case AttributeType.INTEGER => + java.lang.Integer.compare( + left.asInstanceOf[Number].intValue(), + right.asInstanceOf[Number].intValue() + ) + case AttributeType.LONG => + java.lang.Long.compare( + left.asInstanceOf[Number].longValue(), + right.asInstanceOf[Number].longValue() + ) + case AttributeType.DOUBLE => + java.lang.Double.compare( + left.asInstanceOf[Number].doubleValue(), + right.asInstanceOf[Number].doubleValue() + ) // handles ±Inf/NaN per JDK + case AttributeType.BOOLEAN => + java.lang.Boolean.compare( + left.asInstanceOf[Boolean], + right.asInstanceOf[Boolean] + ) + case AttributeType.TIMESTAMP => + java.lang.Long.compare( + left.asInstanceOf[Timestamp].getTime, + right.asInstanceOf[Timestamp].getTime + ) + case AttributeType.STRING => + left.toString.compareTo(right.toString) + case AttributeType.BINARY => + java.util.Arrays.compareUnsigned( + left.asInstanceOf[Array[Byte]], + right.asInstanceOf[Array[Byte]] + ) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for compare: $attrType" + ) + } + } /** Type-aware addition (null is identity). */ @throws[UnsupportedOperationException] @@ -469,49 +470,52 @@ object AttributeTypeUtils extends Serializable { } /** Additive identity for supported numeric/timestamp types. - * For BINARY an empty array is returned as a benign identity value. - */ + * For BINARY an empty array is returned as a benign identity value. 
+ */ @throws[UnsupportedOperationException] - def zeroValue(attrType: AttributeType): Object = attrType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(0) - case AttributeType.LONG => java.lang.Long.valueOf(0L) - case AttributeType.DOUBLE => java.lang.Double.valueOf(0.0d) - case AttributeType.TIMESTAMP => new Timestamp(0L) - case AttributeType.BINARY => Array.emptyByteArray - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for zero value: $attrType" - ) - } + def zeroValue(attrType: AttributeType): Object = + attrType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(0) + case AttributeType.LONG => java.lang.Long.valueOf(0L) + case AttributeType.DOUBLE => java.lang.Double.valueOf(0.0d) + case AttributeType.TIMESTAMP => new Timestamp(0L) + case AttributeType.BINARY => Array.emptyByteArray + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for zero value: $attrType" + ) + } /** Maximum sentinel. */ @throws[UnsupportedOperationException] - def maxValue(attrType: AttributeType): Object = attrType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MAX_VALUE) - case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MAX_VALUE) - case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MAX_VALUE) - case AttributeType.TIMESTAMP => new Timestamp(java.lang.Long.MAX_VALUE) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for max value: $attrType" - ) - } + def maxValue(attrType: AttributeType): Object = + attrType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MAX_VALUE) + case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MAX_VALUE) + case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MAX_VALUE) + case AttributeType.TIMESTAMP => new Timestamp(java.lang.Long.MAX_VALUE) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for max value: $attrType" + ) + } /** Minimum sentinel (note Double.MIN_VALUE is > 0). - * For BINARY under lexicographic order, the empty array is the global minimum. - */ + * For BINARY under lexicographic order, the empty array is the global minimum. 
+ */ @throws[UnsupportedOperationException] - def minValue(attrType: AttributeType): Object = attrType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MIN_VALUE) - case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MIN_VALUE) - case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MIN_VALUE) - case AttributeType.TIMESTAMP => new Timestamp(0L) - case AttributeType.BINARY => Array.emptyByteArray - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for min value: $attrType" - ) - } + def minValue(attrType: AttributeType): Object = + attrType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MIN_VALUE) + case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MIN_VALUE) + case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MIN_VALUE) + case AttributeType.TIMESTAMP => new Timestamp(0L) + case AttributeType.BINARY => Array.emptyByteArray + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for min value: $attrType" + ) + } class AttributeTypeException(msg: String, cause: Throwable = null) extends IllegalArgumentException(msg, cause) {} diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala index aa547bd701d..21e98283cd2 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala @@ -116,25 +116,24 @@ object Tuple { def of(schema: Schema, values: (String, Any)*): Tuple = { val nameToValue: Map[String, Any] = values.toMap val coercedFields: Array[Any] = - schema.getAttributes - .map { attribute => - val rawValue: Any = nameToValue.getOrElse(attribute.getName, null) - AttributeTypeUtils.parseField(rawValue, attribute.getType, force = true) - } - .toArray + schema.getAttributes.map { attribute => + val rawValue: Any = nameToValue.getOrElse(attribute.getName, null) + AttributeTypeUtils.parseField(rawValue, attribute.getType, force = true) + }.toArray Tuple(schema, coercedFields) } /** Build a Tuple without coercion. - * Uses the builder’s runtime type checks; values must already match the schema’s field classes. - * Missing attributes (or unknown attribute names) will cause an error. - */ + * Uses the builder’s runtime type checks; values must already match the schema’s field classes. + * Missing attributes (or unknown attribute names) will cause an error. + */ def ofStrict(schema: Schema, values: (String, Any)*): Tuple = - values.foldLeft(Tuple.builder(schema)) { - case (builder, (attrName, value)) => - builder.add(schema.getAttribute(attrName), value) - }.build() - + values + .foldLeft(Tuple.builder(schema)) { + case (builder, (attrName, value)) => + builder.add(schema.getAttribute(attrName), value) + } + .build() /** * Validates that the provided attributes match the provided fields in type and order. 
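A brief usage sketch of the Schema.of and Tuple.of helpers reformatted above, assuming they behave as their doc comments describe; the object name and the attribute names/values ("name", "age", "alice") are illustrative and not taken from the patch.

    import org.apache.amber.core.tuple.{AttributeType, Schema, Tuple}

    object TupleBuilderSketch extends App {
      // Schema.of adds the attributes in declaration order and rejects duplicate names.
      val schema: Schema = Schema.of(
        "name" -> AttributeType.STRING,
        "age"  -> AttributeType.INTEGER
      )

      // Tuple.of coerces each supplied value to its declared type via parseField(..., force = true)
      // and fills attributes that are not supplied with null.
      val row: Tuple = Tuple.of(schema, "name" -> "alice", "age" -> 42)
      assert(row.getField[String]("name") == "alice")
      assert(row.getField[Int]("age") == 42)
    }

Tuple.of also passes explicit nulls through to the field array, which is what the null-ordering tests above rely on.
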
diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala index 7feafc80cf7..931163e9ed3 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala @@ -195,7 +195,8 @@ class AggregationOperation { if (value != null && comp < 0) value else partial }, (partial1, partial2) => - if (AttributeTypeUtils.compare(partial1, partial2, attributeType) < 0) partial1 else partial2, + if (AttributeTypeUtils.compare(partial1, partial2, attributeType) < 0) partial1 + else partial2, partial => if (partial == AttributeTypeUtils.maxValue(attributeType)) null else partial ) } @@ -219,7 +220,8 @@ class AggregationOperation { if (value != null && comp > 0) value else partial }, (partial1, partial2) => - if (AttributeTypeUtils.compare(partial1, partial2, attributeType) > 0) partial1 else partial2, + if (AttributeTypeUtils.compare(partial1, partial2, attributeType) > 0) partial1 + else partial2, partial => if (partial == AttributeTypeUtils.maxValue(attributeType)) null else partial ) } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala index 0ec5bce5184..52d2c735bb6 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala @@ -20,7 +20,7 @@ package org.apache.amber.operator.sortPartitions import org.apache.amber.core.executor.OperatorExecutor -import org.apache.amber.core.tuple.{AttributeType, AttributeTypeUtils, Tuple, TupleLike} +import org.apache.amber.core.tuple.{AttributeTypeUtils, Tuple, TupleLike} import org.apache.amber.util.JSONUtils.objectMapper import scala.collection.mutable.ArrayBuffer diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala index c0d237795c0..e0fc940ec65 100644 --- a/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala @@ -19,13 +19,12 @@ package org.apache.amber.operator.sort -import org.apache.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.amber.core.tuple.{AttributeType, Schema, Tuple} import org.apache.amber.util.JSONUtils.objectMapper import org.scalatest.flatspec.AnyFlatSpec import java.sql.Timestamp import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.jdk.CollectionConverters.IterableHasAsJava /** * Integration and internal-behavior tests for [[StableMergeSortOpExec]]. 
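A minimal usage sketch, not part of any patch in this series, showing the intended semantics of the AttributeTypeUtils helpers introduced in PATCH 01 (compare, add, zeroValue, maxValue, minValue). The object name and the sample values below are illustrative assumptions; only the utility methods and AttributeType come from the patch itself.

import java.sql.Timestamp
import org.apache.amber.core.tuple.{AttributeType, AttributeTypeUtils}

object AttributeTypeUtilsUsageSketch {
  def main(args: Array[String]): Unit = {
    // Three-way compare: null sorts before non-null, and two nulls compare equal.
    assert(AttributeTypeUtils.compare(null, 5, AttributeType.INTEGER) < 0)
    assert(AttributeTypeUtils.compare(7L, 3L, AttributeType.LONG) > 0)

    // Addition treats null as the identity element.
    assert(AttributeTypeUtils.add(null, Integer.valueOf(4), AttributeType.INTEGER) == 4)
    assert(AttributeTypeUtils.add(Integer.valueOf(2), Integer.valueOf(3), AttributeType.INTEGER) == 5)

    // zeroValue is the additive identity; maxValue/minValue are aggregation sentinels.
    assert(AttributeTypeUtils.zeroValue(AttributeType.LONG) == 0L)
    assert(AttributeTypeUtils.maxValue(AttributeType.INTEGER) == Integer.MAX_VALUE)
    // As the scaladoc notes, Double.MIN_VALUE is the smallest positive double, not -Inf.
    assert(AttributeTypeUtils.minValue(AttributeType.DOUBLE) == java.lang.Double.MIN_VALUE)

    // TIMESTAMP sentinels: minValue is the epoch, maxValue wraps Long.MAX_VALUE millis.
    val epoch = new Timestamp(0L)
    val tsMax = AttributeTypeUtils.maxValue(AttributeType.TIMESTAMP)
    assert(AttributeTypeUtils.compare(epoch, tsMax, AttributeType.TIMESTAMP) < 0)
  }
}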
From 617b5b7cd6aa430edf91c8cd630da151c430c7c3 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 13 Nov 2025 21:51:33 -0600 Subject: [PATCH 03/15] deleted ofStrict, unified match case use --- .../amber/core/tuple/AttributeTypeUtils.scala | 55 ++++++++++--------- .../org/apache/amber/core/tuple/Tuple.scala | 12 ---- .../service/resource/DatasetResource.scala | 46 +++++++++++++++- 3 files changed, 72 insertions(+), 41 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index 6db2f34c7fd..382846fd3d6 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -440,34 +440,35 @@ object AttributeTypeUtils extends Serializable { /** Type-aware addition (null is identity). */ @throws[UnsupportedOperationException] - def add(left: Object, right: Object, attrType: AttributeType): Object = { - if (left == null && right == null) return zeroValue(attrType) - if (left == null) return right - if (right == null) return left - - attrType match { - case AttributeType.INTEGER => - java.lang.Integer.valueOf( - left.asInstanceOf[Number].intValue() + right.asInstanceOf[Number].intValue() - ) - case AttributeType.LONG => - java.lang.Long.valueOf( - left.asInstanceOf[Number].longValue() + right.asInstanceOf[Number].longValue() - ) - case AttributeType.DOUBLE => - java.lang.Double.valueOf( - left.asInstanceOf[Number].doubleValue() + right.asInstanceOf[Number].doubleValue() - ) - case AttributeType.TIMESTAMP => - new Timestamp( - left.asInstanceOf[Timestamp].getTime + right.asInstanceOf[Timestamp].getTime - ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for addition: $attrType" - ) + def add(left: Object, right: Object, attrType: AttributeType): Object = + (left, right) match { + case (null, null) => zeroValue(attrType) + case (null, r) => r + case (l, null) => l + case (l, r) => + attrType match { + case AttributeType.INTEGER => + java.lang.Integer.valueOf( + l.asInstanceOf[Number].intValue() + r.asInstanceOf[Number].intValue() + ) + case AttributeType.LONG => + java.lang.Long.valueOf( + l.asInstanceOf[Number].longValue() + r.asInstanceOf[Number].longValue() + ) + case AttributeType.DOUBLE => + java.lang.Double.valueOf( + l.asInstanceOf[Number].doubleValue() + r.asInstanceOf[Number].doubleValue() + ) + case AttributeType.TIMESTAMP => + new Timestamp( + l.asInstanceOf[Timestamp].getTime + r.asInstanceOf[Timestamp].getTime + ) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for addition: $attrType" + ) + } } - } /** Additive identity for supported numeric/timestamp types. * For BINARY an empty array is returned as a benign identity value. diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala index 21e98283cd2..f9db43d1d1e 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala @@ -123,18 +123,6 @@ object Tuple { Tuple(schema, coercedFields) } - /** Build a Tuple without coercion. - * Uses the builder’s runtime type checks; values must already match the schema’s field classes. 
- * Missing attributes (or unknown attribute names) will cause an error. - */ - def ofStrict(schema: Schema, values: (String, Any)*): Tuple = - values - .foldLeft(Tuple.builder(schema)) { - case (builder, (attrName, value)) => - builder.add(schema.getAttribute(attrName), value) - } - .build() - /** * Validates that the provided attributes match the provided fields in type and order. * diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 75e2f6e601e..f49588546ac 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -28,6 +28,7 @@ import org.apache.amber.core.storage.model.OnDataset import org.apache.amber.core.storage.util.LakeFSStorageClient import org.apache.amber.core.storage.{DocumentFactory, FileResolver} import org.apache.texera.auth.SessionUser +import org.apache.texera.config.DefaultsConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.SqlServer.withTransaction import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum @@ -53,6 +54,7 @@ import org.apache.texera.service.util.S3StorageClient.{ MAXIMUM_NUM_OF_MULTIPART_S3_PARTS, MINIMUM_NUM_OF_MULTIPART_S3_PART } +import org.jooq.impl.DSL import org.jooq.{DSLContext, EnumType} import java.io.{InputStream, OutputStream} @@ -178,6 +180,26 @@ class DatasetResource { private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 + private val SingleFileUploadMaxSizeKey = "single_file_upload_max_size_mib" + + def singleFileUploadMaxSizeMib: Int = { + val valueOpt = Option( + context + .select(DSL.field("value", classOf[String])) + .from(DSL.table("site_settings")) + .where(DSL.field("key", classOf[String]).eq(SingleFileUploadMaxSizeKey)) + .fetchOne(0, classOf[String]) + ) + + valueOpt + .flatMap(v => scala.util.Try(v.toInt).toOption) + .getOrElse(DefaultsConfig.allDefaults(SingleFileUploadMaxSizeKey).toInt) + } + + /** Maximum allowed single-file upload size in bytes (MiB → bytes). 
*/ + private def maxSingleFileUploadBytes: Long = + singleFileUploadMaxSizeMib.toLong * 1024L * 1024L + /** * Helper function to get the dataset from DB with additional information including user access privilege and owner email */ @@ -401,7 +423,6 @@ class DatasetResource { e ) } - // delete the directory on S3 if ( S3StorageClient.directoryExists(StorageConfig.lakefsBucketName, dataset.getRepositoryName) @@ -504,6 +525,7 @@ class DatasetResource { var buffered = 0 var partNumber = 1 val completedParts = ListBuffer[(Int, String)]() + var totalBytesRead = 0L @inline def flush(): Unit = { if (buffered == 0) return @@ -519,6 +541,13 @@ class DatasetResource { var read = fileStream.read(buf, buffered, buf.length - buffered) while (read != -1) { buffered += read + totalBytesRead += read + if (totalBytesRead > maxSingleFileUploadBytes) { + throw new WebApplicationException( + s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } if (buffered == buf.length) flush() // buffer full read = fileStream.read(buf, buffered, buf.length - buffered) } @@ -737,7 +766,20 @@ class DatasetResource { partsList, physicalAddress ) - + val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) + if (sizeBytes > maxSingleFileUploadBytes) { + // Roll back staged object to previous committed state (or remove if new). + try { + LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) + } catch { + case _: Exception => // best-effort cleanup + } + throw new WebApplicationException( + s"File exceeds maximum allowed size of " + + s"${singleFileUploadMaxSizeMib} MiB. Upload has been rolled back.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } Response .ok( Map( From dfbff43570b8825d89ae5184844abfc1dd457d07 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 13 Nov 2025 21:54:43 -0600 Subject: [PATCH 04/15] Revert "deleted ofStrict, unified match case use" This reverts commit 617b5b7cd6aa430edf91c8cd630da151c430c7c3. --- .../amber/core/tuple/AttributeTypeUtils.scala | 55 +++++++++---------- .../org/apache/amber/core/tuple/Tuple.scala | 12 ++++ .../service/resource/DatasetResource.scala | 46 +--------------- 3 files changed, 41 insertions(+), 72 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index 382846fd3d6..6db2f34c7fd 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -440,35 +440,34 @@ object AttributeTypeUtils extends Serializable { /** Type-aware addition (null is identity). 
*/ @throws[UnsupportedOperationException] - def add(left: Object, right: Object, attrType: AttributeType): Object = - (left, right) match { - case (null, null) => zeroValue(attrType) - case (null, r) => r - case (l, null) => l - case (l, r) => - attrType match { - case AttributeType.INTEGER => - java.lang.Integer.valueOf( - l.asInstanceOf[Number].intValue() + r.asInstanceOf[Number].intValue() - ) - case AttributeType.LONG => - java.lang.Long.valueOf( - l.asInstanceOf[Number].longValue() + r.asInstanceOf[Number].longValue() - ) - case AttributeType.DOUBLE => - java.lang.Double.valueOf( - l.asInstanceOf[Number].doubleValue() + r.asInstanceOf[Number].doubleValue() - ) - case AttributeType.TIMESTAMP => - new Timestamp( - l.asInstanceOf[Timestamp].getTime + r.asInstanceOf[Timestamp].getTime - ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for addition: $attrType" - ) - } + def add(left: Object, right: Object, attrType: AttributeType): Object = { + if (left == null && right == null) return zeroValue(attrType) + if (left == null) return right + if (right == null) return left + + attrType match { + case AttributeType.INTEGER => + java.lang.Integer.valueOf( + left.asInstanceOf[Number].intValue() + right.asInstanceOf[Number].intValue() + ) + case AttributeType.LONG => + java.lang.Long.valueOf( + left.asInstanceOf[Number].longValue() + right.asInstanceOf[Number].longValue() + ) + case AttributeType.DOUBLE => + java.lang.Double.valueOf( + left.asInstanceOf[Number].doubleValue() + right.asInstanceOf[Number].doubleValue() + ) + case AttributeType.TIMESTAMP => + new Timestamp( + left.asInstanceOf[Timestamp].getTime + right.asInstanceOf[Timestamp].getTime + ) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for addition: $attrType" + ) } + } /** Additive identity for supported numeric/timestamp types. * For BINARY an empty array is returned as a benign identity value. diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala index f9db43d1d1e..21e98283cd2 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala @@ -123,6 +123,18 @@ object Tuple { Tuple(schema, coercedFields) } + /** Build a Tuple without coercion. + * Uses the builder’s runtime type checks; values must already match the schema’s field classes. + * Missing attributes (or unknown attribute names) will cause an error. + */ + def ofStrict(schema: Schema, values: (String, Any)*): Tuple = + values + .foldLeft(Tuple.builder(schema)) { + case (builder, (attrName, value)) => + builder.add(schema.getAttribute(attrName), value) + } + .build() + /** * Validates that the provided attributes match the provided fields in type and order. 
* diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index f49588546ac..75e2f6e601e 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -28,7 +28,6 @@ import org.apache.amber.core.storage.model.OnDataset import org.apache.amber.core.storage.util.LakeFSStorageClient import org.apache.amber.core.storage.{DocumentFactory, FileResolver} import org.apache.texera.auth.SessionUser -import org.apache.texera.config.DefaultsConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.SqlServer.withTransaction import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum @@ -54,7 +53,6 @@ import org.apache.texera.service.util.S3StorageClient.{ MAXIMUM_NUM_OF_MULTIPART_S3_PARTS, MINIMUM_NUM_OF_MULTIPART_S3_PART } -import org.jooq.impl.DSL import org.jooq.{DSLContext, EnumType} import java.io.{InputStream, OutputStream} @@ -180,26 +178,6 @@ class DatasetResource { private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 - private val SingleFileUploadMaxSizeKey = "single_file_upload_max_size_mib" - - def singleFileUploadMaxSizeMib: Int = { - val valueOpt = Option( - context - .select(DSL.field("value", classOf[String])) - .from(DSL.table("site_settings")) - .where(DSL.field("key", classOf[String]).eq(SingleFileUploadMaxSizeKey)) - .fetchOne(0, classOf[String]) - ) - - valueOpt - .flatMap(v => scala.util.Try(v.toInt).toOption) - .getOrElse(DefaultsConfig.allDefaults(SingleFileUploadMaxSizeKey).toInt) - } - - /** Maximum allowed single-file upload size in bytes (MiB → bytes). */ - private def maxSingleFileUploadBytes: Long = - singleFileUploadMaxSizeMib.toLong * 1024L * 1024L - /** * Helper function to get the dataset from DB with additional information including user access privilege and owner email */ @@ -423,6 +401,7 @@ class DatasetResource { e ) } + // delete the directory on S3 if ( S3StorageClient.directoryExists(StorageConfig.lakefsBucketName, dataset.getRepositoryName) @@ -525,7 +504,6 @@ class DatasetResource { var buffered = 0 var partNumber = 1 val completedParts = ListBuffer[(Int, String)]() - var totalBytesRead = 0L @inline def flush(): Unit = { if (buffered == 0) return @@ -541,13 +519,6 @@ class DatasetResource { var read = fileStream.read(buf, buffered, buf.length - buffered) while (read != -1) { buffered += read - totalBytesRead += read - if (totalBytesRead > maxSingleFileUploadBytes) { - throw new WebApplicationException( - s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } if (buffered == buf.length) flush() // buffer full read = fileStream.read(buf, buffered, buf.length - buffered) } @@ -766,20 +737,7 @@ class DatasetResource { partsList, physicalAddress ) - val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) - if (sizeBytes > maxSingleFileUploadBytes) { - // Roll back staged object to previous committed state (or remove if new). - try { - LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) - } catch { - case _: Exception => // best-effort cleanup - } - throw new WebApplicationException( - s"File exceeds maximum allowed size of " + - s"${singleFileUploadMaxSizeMib} MiB. 
Upload has been rolled back.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } + Response .ok( Map( From 03e9c67e5e2474fda16404e0919c9e9f5de7f539 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 13 Nov 2025 21:51:33 -0600 Subject: [PATCH 05/15] deleted ofStrict, unified match case use (cherry picked from commit 617b5b7cd6aa430edf91c8cd630da151c430c7c3) --- .../amber/core/tuple/AttributeTypeUtils.scala | 55 ++++++++++--------- .../org/apache/amber/core/tuple/Tuple.scala | 12 ---- .../service/resource/DatasetResource.scala | 46 +++++++++++++++- 3 files changed, 72 insertions(+), 41 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index 6db2f34c7fd..382846fd3d6 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -440,34 +440,35 @@ object AttributeTypeUtils extends Serializable { /** Type-aware addition (null is identity). */ @throws[UnsupportedOperationException] - def add(left: Object, right: Object, attrType: AttributeType): Object = { - if (left == null && right == null) return zeroValue(attrType) - if (left == null) return right - if (right == null) return left - - attrType match { - case AttributeType.INTEGER => - java.lang.Integer.valueOf( - left.asInstanceOf[Number].intValue() + right.asInstanceOf[Number].intValue() - ) - case AttributeType.LONG => - java.lang.Long.valueOf( - left.asInstanceOf[Number].longValue() + right.asInstanceOf[Number].longValue() - ) - case AttributeType.DOUBLE => - java.lang.Double.valueOf( - left.asInstanceOf[Number].doubleValue() + right.asInstanceOf[Number].doubleValue() - ) - case AttributeType.TIMESTAMP => - new Timestamp( - left.asInstanceOf[Timestamp].getTime + right.asInstanceOf[Timestamp].getTime - ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for addition: $attrType" - ) + def add(left: Object, right: Object, attrType: AttributeType): Object = + (left, right) match { + case (null, null) => zeroValue(attrType) + case (null, r) => r + case (l, null) => l + case (l, r) => + attrType match { + case AttributeType.INTEGER => + java.lang.Integer.valueOf( + l.asInstanceOf[Number].intValue() + r.asInstanceOf[Number].intValue() + ) + case AttributeType.LONG => + java.lang.Long.valueOf( + l.asInstanceOf[Number].longValue() + r.asInstanceOf[Number].longValue() + ) + case AttributeType.DOUBLE => + java.lang.Double.valueOf( + l.asInstanceOf[Number].doubleValue() + r.asInstanceOf[Number].doubleValue() + ) + case AttributeType.TIMESTAMP => + new Timestamp( + l.asInstanceOf[Timestamp].getTime + r.asInstanceOf[Timestamp].getTime + ) + case _ => + throw new UnsupportedOperationException( + s"Unsupported attribute type for addition: $attrType" + ) + } } - } /** Additive identity for supported numeric/timestamp types. * For BINARY an empty array is returned as a benign identity value. 
diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala index 21e98283cd2..f9db43d1d1e 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala @@ -123,18 +123,6 @@ object Tuple { Tuple(schema, coercedFields) } - /** Build a Tuple without coercion. - * Uses the builder’s runtime type checks; values must already match the schema’s field classes. - * Missing attributes (or unknown attribute names) will cause an error. - */ - def ofStrict(schema: Schema, values: (String, Any)*): Tuple = - values - .foldLeft(Tuple.builder(schema)) { - case (builder, (attrName, value)) => - builder.add(schema.getAttribute(attrName), value) - } - .build() - /** * Validates that the provided attributes match the provided fields in type and order. * diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 75e2f6e601e..f49588546ac 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -28,6 +28,7 @@ import org.apache.amber.core.storage.model.OnDataset import org.apache.amber.core.storage.util.LakeFSStorageClient import org.apache.amber.core.storage.{DocumentFactory, FileResolver} import org.apache.texera.auth.SessionUser +import org.apache.texera.config.DefaultsConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.SqlServer.withTransaction import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum @@ -53,6 +54,7 @@ import org.apache.texera.service.util.S3StorageClient.{ MAXIMUM_NUM_OF_MULTIPART_S3_PARTS, MINIMUM_NUM_OF_MULTIPART_S3_PART } +import org.jooq.impl.DSL import org.jooq.{DSLContext, EnumType} import java.io.{InputStream, OutputStream} @@ -178,6 +180,26 @@ class DatasetResource { private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 + private val SingleFileUploadMaxSizeKey = "single_file_upload_max_size_mib" + + def singleFileUploadMaxSizeMib: Int = { + val valueOpt = Option( + context + .select(DSL.field("value", classOf[String])) + .from(DSL.table("site_settings")) + .where(DSL.field("key", classOf[String]).eq(SingleFileUploadMaxSizeKey)) + .fetchOne(0, classOf[String]) + ) + + valueOpt + .flatMap(v => scala.util.Try(v.toInt).toOption) + .getOrElse(DefaultsConfig.allDefaults(SingleFileUploadMaxSizeKey).toInt) + } + + /** Maximum allowed single-file upload size in bytes (MiB → bytes). 
*/ + private def maxSingleFileUploadBytes: Long = + singleFileUploadMaxSizeMib.toLong * 1024L * 1024L + /** * Helper function to get the dataset from DB with additional information including user access privilege and owner email */ @@ -401,7 +423,6 @@ class DatasetResource { e ) } - // delete the directory on S3 if ( S3StorageClient.directoryExists(StorageConfig.lakefsBucketName, dataset.getRepositoryName) @@ -504,6 +525,7 @@ class DatasetResource { var buffered = 0 var partNumber = 1 val completedParts = ListBuffer[(Int, String)]() + var totalBytesRead = 0L @inline def flush(): Unit = { if (buffered == 0) return @@ -519,6 +541,13 @@ class DatasetResource { var read = fileStream.read(buf, buffered, buf.length - buffered) while (read != -1) { buffered += read + totalBytesRead += read + if (totalBytesRead > maxSingleFileUploadBytes) { + throw new WebApplicationException( + s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } if (buffered == buf.length) flush() // buffer full read = fileStream.read(buf, buffered, buf.length - buffered) } @@ -737,7 +766,20 @@ class DatasetResource { partsList, physicalAddress ) - + val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) + if (sizeBytes > maxSingleFileUploadBytes) { + // Roll back staged object to previous committed state (or remove if new). + try { + LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) + } catch { + case _: Exception => // best-effort cleanup + } + throw new WebApplicationException( + s"File exceeds maximum allowed size of " + + s"${singleFileUploadMaxSizeMib} MiB. Upload has been rolled back.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } Response .ok( Map( From ec7b4bbd14214ea33cabbca79acffdaf1488d39f Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 13 Nov 2025 22:05:27 -0600 Subject: [PATCH 06/15] Restored DatasetResourceScala --- .../service/resource/DatasetResource.scala | 384 ++++++++---------- 1 file changed, 171 insertions(+), 213 deletions(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index f49588546ac..8969bb3c0f6 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -28,7 +28,6 @@ import org.apache.amber.core.storage.model.OnDataset import org.apache.amber.core.storage.util.LakeFSStorageClient import org.apache.amber.core.storage.{DocumentFactory, FileResolver} import org.apache.texera.auth.SessionUser -import org.apache.texera.config.DefaultsConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.SqlServer.withTransaction import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum @@ -54,7 +53,6 @@ import org.apache.texera.service.util.S3StorageClient.{ MAXIMUM_NUM_OF_MULTIPART_S3_PARTS, MINIMUM_NUM_OF_MULTIPART_S3_PART } -import org.jooq.impl.DSL import org.jooq.{DSLContext, EnumType} import java.io.{InputStream, OutputStream} @@ -75,8 +73,8 @@ object DatasetResource { .createDSLContext() /** - * Helper function to get the dataset from DB using did - */ + * Helper function to get the dataset from DB using did + */ private def getDatasetByID(ctx: DSLContext, did: Integer): Dataset = { val datasetDao = new DatasetDao(ctx.configuration()) val dataset = 
datasetDao.fetchOneByDid(did) @@ -87,8 +85,8 @@ object DatasetResource { } /** - * Helper function to PUT exactly len bytes from buf to presigned URL, return the ETag - */ + * Helper function to PUT exactly len bytes from buf to presigned URL, return the ETag + */ private def put(buf: Array[Byte], len: Int, url: String, partNum: Int): String = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] conn.setDoOutput(true); @@ -108,12 +106,12 @@ object DatasetResource { } /** - * Helper function to get the dataset version from DB using dvid - */ + * Helper function to get the dataset version from DB using dvid + */ private def getDatasetVersionByID( - ctx: DSLContext, - dvid: Integer - ): DatasetVersion = { + ctx: DSLContext, + dvid: Integer + ): DatasetVersion = { val datasetVersionDao = new DatasetVersionDao(ctx.configuration()) val version = datasetVersionDao.fetchOneByDvid(dvid) if (version == null) { @@ -123,12 +121,12 @@ object DatasetResource { } /** - * Helper function to get the latest dataset version from the DB - */ + * Helper function to get the latest dataset version from the DB + */ private def getLatestDatasetVersion( - ctx: DSLContext, - did: Integer - ): Option[DatasetVersion] = { + ctx: DSLContext, + did: Integer + ): Option[DatasetVersion] = { ctx .selectFrom(DATASET_VERSION) .where(DATASET_VERSION.DID.eq(did)) @@ -139,38 +137,38 @@ object DatasetResource { } case class DashboardDataset( - dataset: Dataset, - ownerEmail: String, - accessPrivilege: EnumType, - isOwner: Boolean, - size: Long - ) + dataset: Dataset, + ownerEmail: String, + accessPrivilege: EnumType, + isOwner: Boolean, + size: Long + ) case class DashboardDatasetVersion( - datasetVersion: DatasetVersion, - fileNodes: List[DatasetFileNode] - ) + datasetVersion: DatasetVersion, + fileNodes: List[DatasetFileNode] + ) case class CreateDatasetRequest( - datasetName: String, - datasetDescription: String, - isDatasetPublic: Boolean, - isDatasetDownloadable: Boolean - ) + datasetName: String, + datasetDescription: String, + isDatasetPublic: Boolean, + isDatasetDownloadable: Boolean + ) case class Diff( - path: String, - pathType: String, - diffType: String, // "added", "removed", "changed", etc. - sizeBytes: Option[Long] // Size of the changed file (None for directories) - ) + path: String, + pathType: String, + diffType: String, // "added", "removed", "changed", etc. + sizeBytes: Option[Long] // Size of the changed file (None for directories) + ) case class DatasetDescriptionModification(did: Integer, description: String) case class DatasetVersionRootFileNodesResponse( - fileNodes: List[DatasetFileNode], - size: Long - ) + fileNodes: List[DatasetFileNode], + size: Long + ) } @Produces(Array(MediaType.APPLICATION_JSON, "image/jpeg", "application/pdf")) @@ -180,34 +178,14 @@ class DatasetResource { private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 - private val SingleFileUploadMaxSizeKey = "single_file_upload_max_size_mib" - - def singleFileUploadMaxSizeMib: Int = { - val valueOpt = Option( - context - .select(DSL.field("value", classOf[String])) - .from(DSL.table("site_settings")) - .where(DSL.field("key", classOf[String]).eq(SingleFileUploadMaxSizeKey)) - .fetchOne(0, classOf[String]) - ) - - valueOpt - .flatMap(v => scala.util.Try(v.toInt).toOption) - .getOrElse(DefaultsConfig.allDefaults(SingleFileUploadMaxSizeKey).toInt) - } - - /** Maximum allowed single-file upload size in bytes (MiB → bytes). 
*/ - private def maxSingleFileUploadBytes: Long = - singleFileUploadMaxSizeMib.toLong * 1024L * 1024L - /** - * Helper function to get the dataset from DB with additional information including user access privilege and owner email - */ + * Helper function to get the dataset from DB with additional information including user access privilege and owner email + */ private def getDashboardDataset( - ctx: DSLContext, - did: Integer, - requesterUid: Option[Integer] - ): DashboardDataset = { + ctx: DSLContext, + did: Integer, + requesterUid: Option[Integer] + ): DashboardDataset = { val targetDataset = getDatasetByID(ctx, did) if (requesterUid.isEmpty && !targetDataset.getIsPublic) { @@ -236,9 +214,9 @@ class DatasetResource { @Path("/create") @Consumes(Array(MediaType.APPLICATION_JSON)) def createDataset( - request: CreateDatasetRequest, - @Auth user: SessionUser - ): DashboardDataset = { + request: CreateDatasetRequest, + @Auth user: SessionUser + ): DashboardDataset = { withTransaction(context) { ctx => val uid = user.getUid @@ -323,10 +301,10 @@ class DatasetResource { @Path("/{did}/version/create") @Consumes(Array(MediaType.TEXT_PLAIN)) def createDatasetVersion( - versionName: String, - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): DashboardDatasetVersion = { + versionName: String, + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDatasetVersion = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasWriteAccess(ctx, did, uid)) { @@ -423,6 +401,7 @@ class DatasetResource { e ) } + // delete the directory on S3 if ( S3StorageClient.directoryExists(StorageConfig.lakefsBucketName, dataset.getRepositoryName) @@ -443,9 +422,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/update/description") def updateDatasetDescription( - modificator: DatasetDescriptionModification, - @Auth sessionUser: SessionUser - ): Response = { + modificator: DatasetDescriptionModification, + @Auth sessionUser: SessionUser + ): Response = { withTransaction(context) { ctx => val uid = sessionUser.getUid val datasetDao = new DatasetDao(ctx.configuration()) @@ -465,13 +444,13 @@ class DatasetResource { @Path("/{did}/upload") @Consumes(Array(MediaType.APPLICATION_OCTET_STREAM)) def uploadOneFileToDataset( - @PathParam("did") did: Integer, - @QueryParam("filePath") encodedFilePath: String, - @QueryParam("message") message: String, - fileStream: InputStream, - @Context headers: HttpHeaders, - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @QueryParam("message") message: String, + fileStream: InputStream, + @Context headers: HttpHeaders, + @Auth user: SessionUser + ): Response = { // These variables are defined at the top so catch block can access them val uid = user.getUid var repoName: String = null @@ -525,7 +504,6 @@ class DatasetResource { var buffered = 0 var partNumber = 1 val completedParts = ListBuffer[(Int, String)]() - var totalBytesRead = 0L @inline def flush(): Unit = { if (buffered == 0) return @@ -541,13 +519,6 @@ class DatasetResource { var read = fileStream.read(buf, buffered, buf.length - buffered) while (read != -1) { buffered += read - totalBytesRead += read - if (totalBytesRead > maxSingleFileUploadBytes) { - throw new WebApplicationException( - s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } if (buffered == buf.length) flush() // buffer full read = 
fileStream.read(buf, buffered, buf.length - buffered) } @@ -586,11 +557,11 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/presign-download") def getPresignedUrl( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String, - @Auth user: SessionUser - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid generatePresignedResponse(encodedUrl, repositoryName, commitHash, uid) } @@ -599,11 +570,11 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/presign-download-s3") def getPresignedUrlWithS3( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String, - @Auth user: SessionUser - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid generatePresignedResponse(encodedUrl, repositoryName, commitHash, uid) } @@ -611,20 +582,20 @@ class DatasetResource { @GET @Path("/public-presign-download") def getPublicPresignedUrl( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String + ): Response = { generatePresignedResponse(encodedUrl, repositoryName, commitHash, null) } @GET @Path("/public-presign-download-s3") def getPublicPresignedUrlWithS3( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String + ): Response = { generatePresignedResponse(encodedUrl, repositoryName, commitHash, null) } @@ -633,10 +604,10 @@ class DatasetResource { @Path("/{did}/file") @Consumes(Array(MediaType.APPLICATION_JSON)) def deleteDatasetFile( - @PathParam("did") did: Integer, - @QueryParam("filePath") encodedFilePath: String, - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasWriteAccess(ctx, did, uid)) { @@ -665,18 +636,18 @@ class DatasetResource { @Path("/multipart-upload") @Consumes(Array(MediaType.APPLICATION_JSON)) def multipartUpload( - @QueryParam("type") operationType: String, - @QueryParam("ownerEmail") ownerEmail: String, - @QueryParam("datasetName") datasetName: String, - @QueryParam("filePath") encodedUrl: String, - @QueryParam("uploadId") uploadId: Optional[String], - @QueryParam("numParts") numParts: Optional[Integer], - payload: Map[ - String, - Any - ], // Expecting {"parts": [...], "physicalAddress": "s3://bucket/path"} - @Auth user: SessionUser - ): Response = { + @QueryParam("type") operationType: String, + @QueryParam("ownerEmail") ownerEmail: String, + @QueryParam("datasetName") datasetName: String, + @QueryParam("filePath") 
encodedUrl: String, + @QueryParam("uploadId") uploadId: Optional[String], + @QueryParam("numParts") numParts: Optional[Integer], + payload: Map[ + String, + Any + ], // Expecting {"parts": [...], "physicalAddress": "s3://bucket/path"} + @Auth user: SessionUser + ): Response = { val uid = user.getUid withTransaction(context) { ctx => @@ -766,20 +737,7 @@ class DatasetResource { partsList, physicalAddress ) - val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) - if (sizeBytes > maxSingleFileUploadBytes) { - // Roll back staged object to previous committed state (or remove if new). - try { - LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) - } catch { - case _: Exception => // best-effort cleanup - } - throw new WebApplicationException( - s"File exceeds maximum allowed size of " + - s"${singleFileUploadMaxSizeMib} MiB. Upload has been rolled back.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } + Response .ok( Map( @@ -820,9 +778,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/update/publicity") def toggleDatasetPublicity( - @PathParam("did") did: Integer, - @Auth sessionUser: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @Auth sessionUser: SessionUser + ): Response = { withTransaction(context) { ctx => val datasetDao = new DatasetDao(ctx.configuration()) val uid = sessionUser.getUid @@ -844,9 +802,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/update/downloadable") def toggleDatasetDownloadable( - @PathParam("did") did: Integer, - @Auth sessionUser: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @Auth sessionUser: SessionUser + ): Response = { withTransaction(context) { ctx => val datasetDao = new DatasetDao(ctx.configuration()) val uid = sessionUser.getUid @@ -869,9 +827,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/diff") def getDatasetDiff( - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): List[Diff] = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): List[Diff] = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasReadAccess(ctx, did, uid)) { @@ -899,10 +857,10 @@ class DatasetResource { @Path("/{did}/diff") @Consumes(Array(MediaType.APPLICATION_JSON)) def resetDatasetFileDiff( - @PathParam("did") did: Integer, - @QueryParam("filePath") encodedFilePath: String, - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasWriteAccess(ctx, did, uid)) { @@ -926,17 +884,17 @@ class DatasetResource { } /** - * This method returns a list of DashboardDatasets objects that are accessible by current user. - * - * @param user the session user - * @return list of user accessible DashboardDataset objects - */ + * This method returns a list of DashboardDatasets objects that are accessible by current user. 
+ * + * @param user the session user + * @return list of user accessible DashboardDataset objects + */ @GET @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/list") def listDatasets( - @Auth user: SessionUser - ): List[DashboardDataset] = { + @Auth user: SessionUser + ): List[DashboardDataset] = { val uid = user.getUid withTransaction(context)(ctx => { var accessibleDatasets: ListBuffer[DashboardDataset] = ListBuffer() @@ -1010,9 +968,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/version/list") def getDatasetVersionList( - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): List[DatasetVersion] = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): List[DatasetVersion] = { val uid = user.getUid withTransaction(context)(ctx => { val dataset = getDatasetByID(ctx, did) @@ -1026,8 +984,8 @@ class DatasetResource { @GET @Path("/{name}/publicVersion/list") def getPublicDatasetVersionList( - @PathParam("name") did: Integer - ): List[DatasetVersion] = { + @PathParam("name") did: Integer + ): List[DatasetVersion] = { withTransaction(context)(ctx => { if (!isDatasetPublic(ctx, did)) { throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) @@ -1040,9 +998,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/version/latest") def retrieveLatestDatasetVersion( - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): DashboardDatasetVersion = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDatasetVersion = { val uid = user.getUid withTransaction(context)(ctx => { if (!userHasReadAccess(ctx, did, uid)) { @@ -1082,11 +1040,11 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/versionZip") def getDatasetVersionZip( - @PathParam("did") did: Integer, - @QueryParam("dvid") dvid: Integer, // Dataset version ID, nullable - @QueryParam("latest") latest: java.lang.Boolean, // Flag to get latest version, nullable - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("dvid") dvid: Integer, // Dataset version ID, nullable + @QueryParam("latest") latest: java.lang.Boolean, // Flag to get latest version, nullable + @Auth user: SessionUser + ): Response = { withTransaction(context) { ctx => if ((dvid != null && latest != null) || (dvid == null && latest == null)) { @@ -1162,10 +1120,10 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/version/{dvid}/rootFileNodes") def retrieveDatasetVersionRootFileNodes( - @PathParam("did") did: Integer, - @PathParam("dvid") dvid: Integer, - @Auth user: SessionUser - ): DatasetVersionRootFileNodesResponse = { + @PathParam("did") did: Integer, + @PathParam("dvid") dvid: Integer, + @Auth user: SessionUser + ): DatasetVersionRootFileNodesResponse = { val uid = user.getUid withTransaction(context)(ctx => fetchDatasetVersionRootFileNodes(ctx, did, dvid, Some(uid))) } @@ -1173,9 +1131,9 @@ class DatasetResource { @GET @Path("/{did}/publicVersion/{dvid}/rootFileNodes") def retrievePublicDatasetVersionRootFileNodes( - @PathParam("did") did: Integer, - @PathParam("dvid") dvid: Integer - ): DatasetVersionRootFileNodesResponse = { + @PathParam("did") did: Integer, + @PathParam("dvid") dvid: Integer + ): DatasetVersionRootFileNodesResponse = { withTransaction(context)(ctx => fetchDatasetVersionRootFileNodes(ctx, did, dvid, None)) } @@ -1183,9 +1141,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}") def getDataset( - 
@PathParam("did") did: Integer, - @Auth user: SessionUser - ): DashboardDataset = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDataset = { val uid = user.getUid withTransaction(context)(ctx => getDashboardDataset(ctx, did, Some(uid))) } @@ -1193,16 +1151,16 @@ class DatasetResource { @GET @Path("/public/{did}") def getPublicDataset( - @PathParam("did") did: Integer - ): DashboardDataset = { + @PathParam("did") did: Integer + ): DashboardDataset = { withTransaction(context)(ctx => getDashboardDataset(ctx, did, None)) } @GET @Path("/file") def retrieveDatasetSingleFile( - @QueryParam("path") pathStr: String - ): Response = { + @QueryParam("path") pathStr: String + ): Response = { val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) withTransaction(context)(_ => { @@ -1245,10 +1203,10 @@ class DatasetResource { } /** - * This method returns all owner user names of the dataset that the user has access to - * - * @return OwnerName[] - */ + * This method returns all owner user names of the dataset that the user has access to + * + * @return OwnerName[] + */ @GET @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/user-dataset-owners") @@ -1265,16 +1223,16 @@ class DatasetResource { } /** - * Validates the dataset name. - * - * Rules: - * - Must be at least 1 character long. - * - Only lowercase letters, numbers, underscores, and hyphens are allowed. - * - Cannot start with a hyphen. - * - * @param name The dataset name to validate. - * @throws IllegalArgumentException if the name is invalid. - */ + * Validates the dataset name. + * + * Rules: + * - Must be at least 1 character long. + * - Only lowercase letters, numbers, underscores, and hyphens are allowed. + * - Cannot start with a hyphen. + * + * @param name The dataset name to validate. + * @throws IllegalArgumentException if the name is invalid. 
+ */ private def validateDatasetName(name: String): Unit = { val datasetNamePattern = "^[A-Za-z0-9_-]+$".r if (!datasetNamePattern.matches(name)) { @@ -1298,11 +1256,11 @@ class DatasetResource { } private def fetchDatasetVersionRootFileNodes( - ctx: DSLContext, - did: Integer, - dvid: Integer, - uid: Option[Integer] - ): DatasetVersionRootFileNodesResponse = { + ctx: DSLContext, + did: Integer, + dvid: Integer, + uid: Option[Integer] + ): DatasetVersionRootFileNodesResponse = { val dataset = getDashboardDataset(ctx, did, uid) val datasetVersion = getDatasetVersionByID(ctx, dvid) val datasetName = dataset.dataset.getName @@ -1332,11 +1290,11 @@ class DatasetResource { } private def generatePresignedResponse( - encodedUrl: String, - repositoryName: String, - commitHash: String, - uid: Integer - ): Response = { + encodedUrl: String, + repositoryName: String, + commitHash: String, + uid: Integer + ): Response = { resolveDatasetAndPath(encodedUrl, repositoryName, commitHash, uid) match { case Left(errorResponse) => errorResponse @@ -1353,11 +1311,11 @@ class DatasetResource { } private def resolveDatasetAndPath( - encodedUrl: String, - repositoryName: String, - commitHash: String, - uid: Integer - ): Either[Response, (String, String, String)] = { + encodedUrl: String, + repositoryName: String, + commitHash: String, + uid: Integer + ): Either[Response, (String, String, String)] = { val decodedPathStr = URLDecoder.decode(encodedUrl, StandardCharsets.UTF_8.name()) (Option(repositoryName), Option(commitHash)) match { From c366654ca7be0b63362f9352ba9c038192f622be Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 13 Nov 2025 22:07:38 -0600 Subject: [PATCH 07/15] v3 --- .../org/apache/texera/service/resource/DatasetResource.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 8969bb3c0f6..06d3f75b30c 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -1372,4 +1372,4 @@ class DatasetResource { Right(response) } } -} +} \ No newline at end of file From 285835bd73d54ec90252866ec4c5f94958ba7f35 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 13 Nov 2025 22:18:20 -0600 Subject: [PATCH 08/15] v4 --- .../service/resource/DatasetResource.scala | 340 +++++++++--------- 1 file changed, 170 insertions(+), 170 deletions(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 06d3f75b30c..75e2f6e601e 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -73,8 +73,8 @@ object DatasetResource { .createDSLContext() /** - * Helper function to get the dataset from DB using did - */ + * Helper function to get the dataset from DB using did + */ private def getDatasetByID(ctx: DSLContext, did: Integer): Dataset = { val datasetDao = new DatasetDao(ctx.configuration()) val dataset = datasetDao.fetchOneByDid(did) @@ -85,8 +85,8 @@ object DatasetResource { } /** - * Helper function to PUT exactly len bytes from buf to presigned URL, return the ETag - */ + * Helper function to 
PUT exactly len bytes from buf to presigned URL, return the ETag + */ private def put(buf: Array[Byte], len: Int, url: String, partNum: Int): String = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] conn.setDoOutput(true); @@ -106,12 +106,12 @@ object DatasetResource { } /** - * Helper function to get the dataset version from DB using dvid - */ + * Helper function to get the dataset version from DB using dvid + */ private def getDatasetVersionByID( - ctx: DSLContext, - dvid: Integer - ): DatasetVersion = { + ctx: DSLContext, + dvid: Integer + ): DatasetVersion = { val datasetVersionDao = new DatasetVersionDao(ctx.configuration()) val version = datasetVersionDao.fetchOneByDvid(dvid) if (version == null) { @@ -121,12 +121,12 @@ object DatasetResource { } /** - * Helper function to get the latest dataset version from the DB - */ + * Helper function to get the latest dataset version from the DB + */ private def getLatestDatasetVersion( - ctx: DSLContext, - did: Integer - ): Option[DatasetVersion] = { + ctx: DSLContext, + did: Integer + ): Option[DatasetVersion] = { ctx .selectFrom(DATASET_VERSION) .where(DATASET_VERSION.DID.eq(did)) @@ -137,38 +137,38 @@ object DatasetResource { } case class DashboardDataset( - dataset: Dataset, - ownerEmail: String, - accessPrivilege: EnumType, - isOwner: Boolean, - size: Long - ) + dataset: Dataset, + ownerEmail: String, + accessPrivilege: EnumType, + isOwner: Boolean, + size: Long + ) case class DashboardDatasetVersion( - datasetVersion: DatasetVersion, - fileNodes: List[DatasetFileNode] - ) + datasetVersion: DatasetVersion, + fileNodes: List[DatasetFileNode] + ) case class CreateDatasetRequest( - datasetName: String, - datasetDescription: String, - isDatasetPublic: Boolean, - isDatasetDownloadable: Boolean - ) + datasetName: String, + datasetDescription: String, + isDatasetPublic: Boolean, + isDatasetDownloadable: Boolean + ) case class Diff( - path: String, - pathType: String, - diffType: String, // "added", "removed", "changed", etc. - sizeBytes: Option[Long] // Size of the changed file (None for directories) - ) + path: String, + pathType: String, + diffType: String, // "added", "removed", "changed", etc. 
+ sizeBytes: Option[Long] // Size of the changed file (None for directories) + ) case class DatasetDescriptionModification(did: Integer, description: String) case class DatasetVersionRootFileNodesResponse( - fileNodes: List[DatasetFileNode], - size: Long - ) + fileNodes: List[DatasetFileNode], + size: Long + ) } @Produces(Array(MediaType.APPLICATION_JSON, "image/jpeg", "application/pdf")) @@ -179,13 +179,13 @@ class DatasetResource { private val EXPIRATION_MINUTES = 5 /** - * Helper function to get the dataset from DB with additional information including user access privilege and owner email - */ + * Helper function to get the dataset from DB with additional information including user access privilege and owner email + */ private def getDashboardDataset( - ctx: DSLContext, - did: Integer, - requesterUid: Option[Integer] - ): DashboardDataset = { + ctx: DSLContext, + did: Integer, + requesterUid: Option[Integer] + ): DashboardDataset = { val targetDataset = getDatasetByID(ctx, did) if (requesterUid.isEmpty && !targetDataset.getIsPublic) { @@ -214,9 +214,9 @@ class DatasetResource { @Path("/create") @Consumes(Array(MediaType.APPLICATION_JSON)) def createDataset( - request: CreateDatasetRequest, - @Auth user: SessionUser - ): DashboardDataset = { + request: CreateDatasetRequest, + @Auth user: SessionUser + ): DashboardDataset = { withTransaction(context) { ctx => val uid = user.getUid @@ -301,10 +301,10 @@ class DatasetResource { @Path("/{did}/version/create") @Consumes(Array(MediaType.TEXT_PLAIN)) def createDatasetVersion( - versionName: String, - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): DashboardDatasetVersion = { + versionName: String, + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDatasetVersion = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasWriteAccess(ctx, did, uid)) { @@ -422,9 +422,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/update/description") def updateDatasetDescription( - modificator: DatasetDescriptionModification, - @Auth sessionUser: SessionUser - ): Response = { + modificator: DatasetDescriptionModification, + @Auth sessionUser: SessionUser + ): Response = { withTransaction(context) { ctx => val uid = sessionUser.getUid val datasetDao = new DatasetDao(ctx.configuration()) @@ -444,13 +444,13 @@ class DatasetResource { @Path("/{did}/upload") @Consumes(Array(MediaType.APPLICATION_OCTET_STREAM)) def uploadOneFileToDataset( - @PathParam("did") did: Integer, - @QueryParam("filePath") encodedFilePath: String, - @QueryParam("message") message: String, - fileStream: InputStream, - @Context headers: HttpHeaders, - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @QueryParam("message") message: String, + fileStream: InputStream, + @Context headers: HttpHeaders, + @Auth user: SessionUser + ): Response = { // These variables are defined at the top so catch block can access them val uid = user.getUid var repoName: String = null @@ -557,11 +557,11 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/presign-download") def getPresignedUrl( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String, - @Auth user: SessionUser - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String, + 
@Auth user: SessionUser + ): Response = { val uid = user.getUid generatePresignedResponse(encodedUrl, repositoryName, commitHash, uid) } @@ -570,11 +570,11 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/presign-download-s3") def getPresignedUrlWithS3( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String, - @Auth user: SessionUser - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid generatePresignedResponse(encodedUrl, repositoryName, commitHash, uid) } @@ -582,20 +582,20 @@ class DatasetResource { @GET @Path("/public-presign-download") def getPublicPresignedUrl( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String + ): Response = { generatePresignedResponse(encodedUrl, repositoryName, commitHash, null) } @GET @Path("/public-presign-download-s3") def getPublicPresignedUrlWithS3( - @QueryParam("filePath") encodedUrl: String, - @QueryParam("repositoryName") repositoryName: String, - @QueryParam("commitHash") commitHash: String - ): Response = { + @QueryParam("filePath") encodedUrl: String, + @QueryParam("repositoryName") repositoryName: String, + @QueryParam("commitHash") commitHash: String + ): Response = { generatePresignedResponse(encodedUrl, repositoryName, commitHash, null) } @@ -604,10 +604,10 @@ class DatasetResource { @Path("/{did}/file") @Consumes(Array(MediaType.APPLICATION_JSON)) def deleteDatasetFile( - @PathParam("did") did: Integer, - @QueryParam("filePath") encodedFilePath: String, - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasWriteAccess(ctx, did, uid)) { @@ -636,18 +636,18 @@ class DatasetResource { @Path("/multipart-upload") @Consumes(Array(MediaType.APPLICATION_JSON)) def multipartUpload( - @QueryParam("type") operationType: String, - @QueryParam("ownerEmail") ownerEmail: String, - @QueryParam("datasetName") datasetName: String, - @QueryParam("filePath") encodedUrl: String, - @QueryParam("uploadId") uploadId: Optional[String], - @QueryParam("numParts") numParts: Optional[Integer], - payload: Map[ - String, - Any - ], // Expecting {"parts": [...], "physicalAddress": "s3://bucket/path"} - @Auth user: SessionUser - ): Response = { + @QueryParam("type") operationType: String, + @QueryParam("ownerEmail") ownerEmail: String, + @QueryParam("datasetName") datasetName: String, + @QueryParam("filePath") encodedUrl: String, + @QueryParam("uploadId") uploadId: Optional[String], + @QueryParam("numParts") numParts: Optional[Integer], + payload: Map[ + String, + Any + ], // Expecting {"parts": [...], "physicalAddress": "s3://bucket/path"} + @Auth user: SessionUser + ): Response = { val uid = user.getUid withTransaction(context) { ctx => @@ -778,9 +778,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/update/publicity") def toggleDatasetPublicity( - @PathParam("did") did: Integer, - @Auth sessionUser: 
SessionUser - ): Response = { + @PathParam("did") did: Integer, + @Auth sessionUser: SessionUser + ): Response = { withTransaction(context) { ctx => val datasetDao = new DatasetDao(ctx.configuration()) val uid = sessionUser.getUid @@ -802,9 +802,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/update/downloadable") def toggleDatasetDownloadable( - @PathParam("did") did: Integer, - @Auth sessionUser: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @Auth sessionUser: SessionUser + ): Response = { withTransaction(context) { ctx => val datasetDao = new DatasetDao(ctx.configuration()) val uid = sessionUser.getUid @@ -827,9 +827,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/diff") def getDatasetDiff( - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): List[Diff] = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): List[Diff] = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasReadAccess(ctx, did, uid)) { @@ -857,10 +857,10 @@ class DatasetResource { @Path("/{did}/diff") @Consumes(Array(MediaType.APPLICATION_JSON)) def resetDatasetFileDiff( - @PathParam("did") did: Integer, - @QueryParam("filePath") encodedFilePath: String, - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("filePath") encodedFilePath: String, + @Auth user: SessionUser + ): Response = { val uid = user.getUid withTransaction(context) { ctx => if (!userHasWriteAccess(ctx, did, uid)) { @@ -884,17 +884,17 @@ class DatasetResource { } /** - * This method returns a list of DashboardDatasets objects that are accessible by current user. - * - * @param user the session user - * @return list of user accessible DashboardDataset objects - */ + * This method returns a list of DashboardDatasets objects that are accessible by current user. 
+ * + * @param user the session user + * @return list of user accessible DashboardDataset objects + */ @GET @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/list") def listDatasets( - @Auth user: SessionUser - ): List[DashboardDataset] = { + @Auth user: SessionUser + ): List[DashboardDataset] = { val uid = user.getUid withTransaction(context)(ctx => { var accessibleDatasets: ListBuffer[DashboardDataset] = ListBuffer() @@ -968,9 +968,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/version/list") def getDatasetVersionList( - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): List[DatasetVersion] = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): List[DatasetVersion] = { val uid = user.getUid withTransaction(context)(ctx => { val dataset = getDatasetByID(ctx, did) @@ -984,8 +984,8 @@ class DatasetResource { @GET @Path("/{name}/publicVersion/list") def getPublicDatasetVersionList( - @PathParam("name") did: Integer - ): List[DatasetVersion] = { + @PathParam("name") did: Integer + ): List[DatasetVersion] = { withTransaction(context)(ctx => { if (!isDatasetPublic(ctx, did)) { throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) @@ -998,9 +998,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/version/latest") def retrieveLatestDatasetVersion( - @PathParam("did") did: Integer, - @Auth user: SessionUser - ): DashboardDatasetVersion = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDatasetVersion = { val uid = user.getUid withTransaction(context)(ctx => { if (!userHasReadAccess(ctx, did, uid)) { @@ -1040,11 +1040,11 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/versionZip") def getDatasetVersionZip( - @PathParam("did") did: Integer, - @QueryParam("dvid") dvid: Integer, // Dataset version ID, nullable - @QueryParam("latest") latest: java.lang.Boolean, // Flag to get latest version, nullable - @Auth user: SessionUser - ): Response = { + @PathParam("did") did: Integer, + @QueryParam("dvid") dvid: Integer, // Dataset version ID, nullable + @QueryParam("latest") latest: java.lang.Boolean, // Flag to get latest version, nullable + @Auth user: SessionUser + ): Response = { withTransaction(context) { ctx => if ((dvid != null && latest != null) || (dvid == null && latest == null)) { @@ -1120,10 +1120,10 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/version/{dvid}/rootFileNodes") def retrieveDatasetVersionRootFileNodes( - @PathParam("did") did: Integer, - @PathParam("dvid") dvid: Integer, - @Auth user: SessionUser - ): DatasetVersionRootFileNodesResponse = { + @PathParam("did") did: Integer, + @PathParam("dvid") dvid: Integer, + @Auth user: SessionUser + ): DatasetVersionRootFileNodesResponse = { val uid = user.getUid withTransaction(context)(ctx => fetchDatasetVersionRootFileNodes(ctx, did, dvid, Some(uid))) } @@ -1131,9 +1131,9 @@ class DatasetResource { @GET @Path("/{did}/publicVersion/{dvid}/rootFileNodes") def retrievePublicDatasetVersionRootFileNodes( - @PathParam("did") did: Integer, - @PathParam("dvid") dvid: Integer - ): DatasetVersionRootFileNodesResponse = { + @PathParam("did") did: Integer, + @PathParam("dvid") dvid: Integer + ): DatasetVersionRootFileNodesResponse = { withTransaction(context)(ctx => fetchDatasetVersionRootFileNodes(ctx, did, dvid, None)) } @@ -1141,9 +1141,9 @@ class DatasetResource { @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}") def getDataset( - 
@PathParam("did") did: Integer, - @Auth user: SessionUser - ): DashboardDataset = { + @PathParam("did") did: Integer, + @Auth user: SessionUser + ): DashboardDataset = { val uid = user.getUid withTransaction(context)(ctx => getDashboardDataset(ctx, did, Some(uid))) } @@ -1151,16 +1151,16 @@ class DatasetResource { @GET @Path("/public/{did}") def getPublicDataset( - @PathParam("did") did: Integer - ): DashboardDataset = { + @PathParam("did") did: Integer + ): DashboardDataset = { withTransaction(context)(ctx => getDashboardDataset(ctx, did, None)) } @GET @Path("/file") def retrieveDatasetSingleFile( - @QueryParam("path") pathStr: String - ): Response = { + @QueryParam("path") pathStr: String + ): Response = { val decodedPathStr = URLDecoder.decode(pathStr, StandardCharsets.UTF_8.name()) withTransaction(context)(_ => { @@ -1203,10 +1203,10 @@ class DatasetResource { } /** - * This method returns all owner user names of the dataset that the user has access to - * - * @return OwnerName[] - */ + * This method returns all owner user names of the dataset that the user has access to + * + * @return OwnerName[] + */ @GET @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/user-dataset-owners") @@ -1223,16 +1223,16 @@ class DatasetResource { } /** - * Validates the dataset name. - * - * Rules: - * - Must be at least 1 character long. - * - Only lowercase letters, numbers, underscores, and hyphens are allowed. - * - Cannot start with a hyphen. - * - * @param name The dataset name to validate. - * @throws IllegalArgumentException if the name is invalid. - */ + * Validates the dataset name. + * + * Rules: + * - Must be at least 1 character long. + * - Only lowercase letters, numbers, underscores, and hyphens are allowed. + * - Cannot start with a hyphen. + * + * @param name The dataset name to validate. + * @throws IllegalArgumentException if the name is invalid. 
+ */ private def validateDatasetName(name: String): Unit = { val datasetNamePattern = "^[A-Za-z0-9_-]+$".r if (!datasetNamePattern.matches(name)) { @@ -1256,11 +1256,11 @@ class DatasetResource { } private def fetchDatasetVersionRootFileNodes( - ctx: DSLContext, - did: Integer, - dvid: Integer, - uid: Option[Integer] - ): DatasetVersionRootFileNodesResponse = { + ctx: DSLContext, + did: Integer, + dvid: Integer, + uid: Option[Integer] + ): DatasetVersionRootFileNodesResponse = { val dataset = getDashboardDataset(ctx, did, uid) val datasetVersion = getDatasetVersionByID(ctx, dvid) val datasetName = dataset.dataset.getName @@ -1290,11 +1290,11 @@ class DatasetResource { } private def generatePresignedResponse( - encodedUrl: String, - repositoryName: String, - commitHash: String, - uid: Integer - ): Response = { + encodedUrl: String, + repositoryName: String, + commitHash: String, + uid: Integer + ): Response = { resolveDatasetAndPath(encodedUrl, repositoryName, commitHash, uid) match { case Left(errorResponse) => errorResponse @@ -1311,11 +1311,11 @@ class DatasetResource { } private def resolveDatasetAndPath( - encodedUrl: String, - repositoryName: String, - commitHash: String, - uid: Integer - ): Either[Response, (String, String, String)] = { + encodedUrl: String, + repositoryName: String, + commitHash: String, + uid: Integer + ): Either[Response, (String, String, String)] = { val decodedPathStr = URLDecoder.decode(encodedUrl, StandardCharsets.UTF_8.name()) (Option(repositoryName), Option(commitHash)) match { @@ -1372,4 +1372,4 @@ class DatasetResource { Right(response) } } -} \ No newline at end of file +} From 5589b1bf535f9b1788ab7b2f251a9fece4de3250 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Sun, 16 Nov 2025 15:16:27 -0600 Subject: [PATCH 09/15] v1 --- .../service/resource/DatasetResource.scala | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 75e2f6e601e..f49588546ac 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -28,6 +28,7 @@ import org.apache.amber.core.storage.model.OnDataset import org.apache.amber.core.storage.util.LakeFSStorageClient import org.apache.amber.core.storage.{DocumentFactory, FileResolver} import org.apache.texera.auth.SessionUser +import org.apache.texera.config.DefaultsConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.SqlServer.withTransaction import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum @@ -53,6 +54,7 @@ import org.apache.texera.service.util.S3StorageClient.{ MAXIMUM_NUM_OF_MULTIPART_S3_PARTS, MINIMUM_NUM_OF_MULTIPART_S3_PART } +import org.jooq.impl.DSL import org.jooq.{DSLContext, EnumType} import java.io.{InputStream, OutputStream} @@ -178,6 +180,26 @@ class DatasetResource { private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 + private val SingleFileUploadMaxSizeKey = "single_file_upload_max_size_mib" + + def singleFileUploadMaxSizeMib: Int = { + val valueOpt = Option( + context + .select(DSL.field("value", classOf[String])) + .from(DSL.table("site_settings")) + .where(DSL.field("key", classOf[String]).eq(SingleFileUploadMaxSizeKey)) + .fetchOne(0, classOf[String]) + 
) + + valueOpt + .flatMap(v => scala.util.Try(v.toInt).toOption) + .getOrElse(DefaultsConfig.allDefaults(SingleFileUploadMaxSizeKey).toInt) + } + + /** Maximum allowed single-file upload size in bytes (MiB → bytes). */ + private def maxSingleFileUploadBytes: Long = + singleFileUploadMaxSizeMib.toLong * 1024L * 1024L + /** * Helper function to get the dataset from DB with additional information including user access privilege and owner email */ @@ -401,7 +423,6 @@ class DatasetResource { e ) } - // delete the directory on S3 if ( S3StorageClient.directoryExists(StorageConfig.lakefsBucketName, dataset.getRepositoryName) @@ -504,6 +525,7 @@ class DatasetResource { var buffered = 0 var partNumber = 1 val completedParts = ListBuffer[(Int, String)]() + var totalBytesRead = 0L @inline def flush(): Unit = { if (buffered == 0) return @@ -519,6 +541,13 @@ class DatasetResource { var read = fileStream.read(buf, buffered, buf.length - buffered) while (read != -1) { buffered += read + totalBytesRead += read + if (totalBytesRead > maxSingleFileUploadBytes) { + throw new WebApplicationException( + s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } if (buffered == buf.length) flush() // buffer full read = fileStream.read(buf, buffered, buf.length - buffered) } @@ -737,7 +766,20 @@ class DatasetResource { partsList, physicalAddress ) - + val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) + if (sizeBytes > maxSingleFileUploadBytes) { + // Roll back staged object to previous committed state (or remove if new). + try { + LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) + } catch { + case _: Exception => // best-effort cleanup + } + throw new WebApplicationException( + s"File exceeds maximum allowed size of " + + s"${singleFileUploadMaxSizeMib} MiB. Upload has been rolled back.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } Response .ok( Map( From 6dbcc7b77aa2f93ce12e2770f6e96be0908c68e6 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Sun, 16 Nov 2025 15:31:28 -0600 Subject: [PATCH 10/15] Revert non related changes --- .../amber/core/tuple/AttributeTypeUtils.scala | 131 ------------------ .../org/apache/amber/core/tuple/Schema.scala | 4 - .../org/apache/amber/core/tuple/Tuple.scala | 17 +-- .../aggregate/AggregationOperation.scala | 120 ++++++++++++++-- .../sortPartitions/SortPartitionsOpExec.scala | 23 +-- 5 files changed, 123 insertions(+), 172 deletions(-) diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala index 382846fd3d6..e4fdcb4611d 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/AttributeTypeUtils.scala @@ -387,137 +387,6 @@ object AttributeTypeUtils extends Serializable { } } - /** Three-way compare for the given attribute type. - * Returns < 0 if left < right, > 0 if left > right, 0 if equal. - * Null semantics: null < non-null (both null => 0). 
- */ - @throws[UnsupportedOperationException] - def compare(left: Any, right: Any, attrType: AttributeType): Int = - (left, right) match { - case (null, null) => 0 - case (null, _) => -1 - case (_, null) => 1 - case _ => - attrType match { - case AttributeType.INTEGER => - java.lang.Integer.compare( - left.asInstanceOf[Number].intValue(), - right.asInstanceOf[Number].intValue() - ) - case AttributeType.LONG => - java.lang.Long.compare( - left.asInstanceOf[Number].longValue(), - right.asInstanceOf[Number].longValue() - ) - case AttributeType.DOUBLE => - java.lang.Double.compare( - left.asInstanceOf[Number].doubleValue(), - right.asInstanceOf[Number].doubleValue() - ) // handles ±Inf/NaN per JDK - case AttributeType.BOOLEAN => - java.lang.Boolean.compare( - left.asInstanceOf[Boolean], - right.asInstanceOf[Boolean] - ) - case AttributeType.TIMESTAMP => - java.lang.Long.compare( - left.asInstanceOf[Timestamp].getTime, - right.asInstanceOf[Timestamp].getTime - ) - case AttributeType.STRING => - left.toString.compareTo(right.toString) - case AttributeType.BINARY => - java.util.Arrays.compareUnsigned( - left.asInstanceOf[Array[Byte]], - right.asInstanceOf[Array[Byte]] - ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for compare: $attrType" - ) - } - } - - /** Type-aware addition (null is identity). */ - @throws[UnsupportedOperationException] - def add(left: Object, right: Object, attrType: AttributeType): Object = - (left, right) match { - case (null, null) => zeroValue(attrType) - case (null, r) => r - case (l, null) => l - case (l, r) => - attrType match { - case AttributeType.INTEGER => - java.lang.Integer.valueOf( - l.asInstanceOf[Number].intValue() + r.asInstanceOf[Number].intValue() - ) - case AttributeType.LONG => - java.lang.Long.valueOf( - l.asInstanceOf[Number].longValue() + r.asInstanceOf[Number].longValue() - ) - case AttributeType.DOUBLE => - java.lang.Double.valueOf( - l.asInstanceOf[Number].doubleValue() + r.asInstanceOf[Number].doubleValue() - ) - case AttributeType.TIMESTAMP => - new Timestamp( - l.asInstanceOf[Timestamp].getTime + r.asInstanceOf[Timestamp].getTime - ) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for addition: $attrType" - ) - } - } - - /** Additive identity for supported numeric/timestamp types. - * For BINARY an empty array is returned as a benign identity value. - */ - @throws[UnsupportedOperationException] - def zeroValue(attrType: AttributeType): Object = - attrType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(0) - case AttributeType.LONG => java.lang.Long.valueOf(0L) - case AttributeType.DOUBLE => java.lang.Double.valueOf(0.0d) - case AttributeType.TIMESTAMP => new Timestamp(0L) - case AttributeType.BINARY => Array.emptyByteArray - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for zero value: $attrType" - ) - } - - /** Maximum sentinel. 
*/ - @throws[UnsupportedOperationException] - def maxValue(attrType: AttributeType): Object = - attrType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MAX_VALUE) - case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MAX_VALUE) - case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MAX_VALUE) - case AttributeType.TIMESTAMP => new Timestamp(java.lang.Long.MAX_VALUE) - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for max value: $attrType" - ) - } - - /** Minimum sentinel (note Double.MIN_VALUE is > 0). - * For BINARY under lexicographic order, the empty array is the global minimum. - */ - @throws[UnsupportedOperationException] - def minValue(attrType: AttributeType): Object = - attrType match { - case AttributeType.INTEGER => java.lang.Integer.valueOf(Integer.MIN_VALUE) - case AttributeType.LONG => java.lang.Long.valueOf(java.lang.Long.MIN_VALUE) - case AttributeType.DOUBLE => java.lang.Double.valueOf(java.lang.Double.MIN_VALUE) - case AttributeType.TIMESTAMP => new Timestamp(0L) - case AttributeType.BINARY => Array.emptyByteArray - case _ => - throw new UnsupportedOperationException( - s"Unsupported attribute type for min value: $attrType" - ) - } - class AttributeTypeException(msg: String, cause: Throwable = null) extends IllegalArgumentException(msg, cause) {} } diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala index 5e207209578..0bdf84a9eba 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Schema.scala @@ -191,10 +191,6 @@ case class Schema @JsonCreator() ( object Schema { - /** Build a Schema with (name, type) pairs, in order, rejecting duplicates. */ - def of(attrs: (String, AttributeType)*): Schema = - attrs.foldLeft(Schema()) { case (acc, (name, tpe)) => acc.add(name, tpe) } - /** * Creates a Schema instance from a raw map representation. * Each entry in the map contains an attribute name and its type as strings. diff --git a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala index f9db43d1d1e..7bee0a0fc53 100644 --- a/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala +++ b/common/workflow-core/src/main/scala/org/apache/amber/core/tuple/Tuple.scala @@ -112,17 +112,6 @@ case class Tuple @JsonCreator() ( object Tuple { - /** Build a Tuple from (name -> value) pairs, coercing values to the schema types. */ - def of(schema: Schema, values: (String, Any)*): Tuple = { - val nameToValue: Map[String, Any] = values.toMap - val coercedFields: Array[Any] = - schema.getAttributes.map { attribute => - val rawValue: Any = nameToValue.getOrElse(attribute.getName, null) - AttributeTypeUtils.parseField(rawValue, attribute.getType, force = true) - }.toArray - Tuple(schema, coercedFields) - } - /** * Validates that the provided attributes match the provided fields in type and order. 
* @@ -163,7 +152,7 @@ object Tuple { ) ) { throw new RuntimeException( - s"edu.ics.uci.amber.model.tuple.model.Attribute ${attribute.getName}'s type (${attribute.getType}) is different from field's type (${AttributeType + s"Attribute ${attribute.getName}'s type (${attribute.getType}) is different from field's type (${AttributeType .getAttributeType(field.getClass)})" ) } @@ -201,7 +190,7 @@ object Tuple { } def add(attribute: Attribute, field: Any): Builder = { - require(attribute != null, "edu.ics.uci.amber.model.tuple.model.Attribute cannot be null") + require(attribute != null, "Attribute cannot be null") checkAttributeMatchesField(attribute, field) if (!schema.containsAttribute(attribute.getName)) { @@ -217,7 +206,7 @@ object Tuple { def add(attributeName: String, attributeType: AttributeType, field: Any): Builder = { require( attributeName != null && attributeType != null, - "edu.ics.uci.amber.model.tuple.model.Attribute name and type cannot be null" + "Attribute name and type cannot be null" ) this.add(new Attribute(attributeName, attributeType), field) this diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala index 931163e9ed3..8818d831e1c 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/aggregate/AggregationOperation.scala @@ -21,9 +21,11 @@ package org.apache.amber.operator.aggregate import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} -import org.apache.amber.core.tuple.{Attribute, AttributeType, AttributeTypeUtils, Tuple} +import org.apache.amber.core.tuple.AttributeTypeUtils.parseTimestamp +import org.apache.amber.core.tuple.{Attribute, AttributeType, Tuple} import org.apache.amber.operator.metadata.annotations.AutofillAttributeName +import java.sql.Timestamp import javax.validation.constraints.NotNull case class AveragePartialObj(sum: Double, count: Double) extends Serializable {} @@ -128,12 +130,12 @@ class AggregationOperation { ) } new DistributedAggregation[Object]( - () => AttributeTypeUtils.zeroValue(attributeType), + () => zero(attributeType), (partial, tuple) => { val value = tuple.getField[Object](attribute) - AttributeTypeUtils.add(partial, value, attributeType) + add(partial, value, attributeType) }, - (partial1, partial2) => AttributeTypeUtils.add(partial1, partial2, attributeType), + (partial1, partial2) => add(partial1, partial2, attributeType), partial => partial ) } @@ -188,16 +190,15 @@ class AggregationOperation { ) } new DistributedAggregation[Object]( - () => AttributeTypeUtils.maxValue(attributeType), + () => maxValue(attributeType), (partial, tuple) => { val value = tuple.getField[Object](attribute) - val comp = AttributeTypeUtils.compare(value, partial, attributeType) + val comp = compare(value, partial, attributeType) if (value != null && comp < 0) value else partial }, (partial1, partial2) => - if (AttributeTypeUtils.compare(partial1, partial2, attributeType) < 0) partial1 - else partial2, - partial => if (partial == AttributeTypeUtils.maxValue(attributeType)) null else partial + if (compare(partial1, partial2, attributeType) < 0) partial1 else partial2, + partial => if (partial == maxValue(attributeType)) null else partial ) } 
@@ -213,16 +214,15 @@ class AggregationOperation { ) } new DistributedAggregation[Object]( - () => AttributeTypeUtils.minValue(attributeType), + () => minValue(attributeType), (partial, tuple) => { val value = tuple.getField[Object](attribute) - val comp = AttributeTypeUtils.compare(value, partial, attributeType) + val comp = compare(value, partial, attributeType) if (value != null && comp > 0) value else partial }, (partial1, partial2) => - if (AttributeTypeUtils.compare(partial1, partial2, attributeType) > 0) partial1 - else partial2, - partial => if (partial == AttributeTypeUtils.maxValue(attributeType)) null else partial + if (compare(partial1, partial2, attributeType) > 0) partial1 else partial2, + partial => if (partial == maxValue(attributeType)) null else partial ) } @@ -232,7 +232,7 @@ class AggregationOperation { return None if (tuple.getSchema.getAttribute(attribute).getType == AttributeType.TIMESTAMP) - Option(AttributeTypeUtils.parseTimestamp(value.toString).getTime.toDouble) + Option(parseTimestamp(value.toString).getTime.toDouble) else Option(value.toString.toDouble) } @@ -254,4 +254,94 @@ class AggregationOperation { } ) } + + // return a.compare(b), + // < 0 if a < b, + // > 0 if a > b, + // 0 if a = b + private def compare(a: Object, b: Object, attributeType: AttributeType): Int = { + if (a == null && b == null) { + return 0 + } else if (a == null) { + return -1 + } else if (b == null) { + return 1 + } + attributeType match { + case AttributeType.INTEGER => a.asInstanceOf[Integer].compareTo(b.asInstanceOf[Integer]) + case AttributeType.DOUBLE => + a.asInstanceOf[java.lang.Double].compareTo(b.asInstanceOf[java.lang.Double]) + case AttributeType.LONG => + a.asInstanceOf[java.lang.Long].compareTo(b.asInstanceOf[java.lang.Long]) + case AttributeType.TIMESTAMP => + a.asInstanceOf[Timestamp].getTime.compareTo(b.asInstanceOf[Timestamp].getTime) + case _ => + throw new UnsupportedOperationException( + "Unsupported attribute type for comparison: " + attributeType + ) + } + } + + private def add(a: Object, b: Object, attributeType: AttributeType): Object = { + if (a == null && b == null) { + return zero(attributeType) + } else if (a == null) { + return b + } else if (b == null) { + return a + } + attributeType match { + case AttributeType.INTEGER => + Integer.valueOf(a.asInstanceOf[Integer] + b.asInstanceOf[Integer]) + case AttributeType.DOUBLE => + java.lang.Double.valueOf( + a.asInstanceOf[java.lang.Double] + b.asInstanceOf[java.lang.Double] + ) + case AttributeType.LONG => + java.lang.Long.valueOf(a.asInstanceOf[java.lang.Long] + b.asInstanceOf[java.lang.Long]) + case AttributeType.TIMESTAMP => + new Timestamp(a.asInstanceOf[Timestamp].getTime + b.asInstanceOf[Timestamp].getTime) + case _ => + throw new UnsupportedOperationException( + "Unsupported attribute type for addition: " + attributeType + ) + } + } + + private def zero(attributeType: AttributeType): Object = + attributeType match { + case AttributeType.INTEGER => java.lang.Integer.valueOf(0) + case AttributeType.DOUBLE => java.lang.Double.valueOf(0) + case AttributeType.LONG => java.lang.Long.valueOf(0) + case AttributeType.TIMESTAMP => new Timestamp(0) + case _ => + throw new UnsupportedOperationException( + "Unsupported attribute type for zero value: " + attributeType + ) + } + + private def maxValue(attributeType: AttributeType): Object = + attributeType match { + case AttributeType.INTEGER => Integer.MAX_VALUE.asInstanceOf[Object] + case AttributeType.DOUBLE => java.lang.Double.MAX_VALUE.asInstanceOf[Object] + case 
AttributeType.LONG => java.lang.Long.MAX_VALUE.asInstanceOf[Object] + case AttributeType.TIMESTAMP => new Timestamp(java.lang.Long.MAX_VALUE) + case _ => + throw new UnsupportedOperationException( + "Unsupported attribute type for max value: " + attributeType + ) + } + + private def minValue(attributeType: AttributeType): Object = + attributeType match { + case AttributeType.INTEGER => Integer.MIN_VALUE.asInstanceOf[Object] + case AttributeType.DOUBLE => java.lang.Double.MIN_VALUE.asInstanceOf[Object] + case AttributeType.LONG => java.lang.Long.MIN_VALUE.asInstanceOf[Object] + case AttributeType.TIMESTAMP => new Timestamp(0) + case _ => + throw new UnsupportedOperationException( + "Unsupported attribute type for min value: " + attributeType + ) + } + } diff --git a/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala index 52d2c735bb6..ac6a9da59ce 100644 --- a/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/amber/operator/sortPartitions/SortPartitionsOpExec.scala @@ -20,7 +20,7 @@ package org.apache.amber.operator.sortPartitions import org.apache.amber.core.executor.OperatorExecutor -import org.apache.amber.core.tuple.{AttributeTypeUtils, Tuple, TupleLike} +import org.apache.amber.core.tuple.{AttributeType, Tuple, TupleLike} import org.apache.amber.util.JSONUtils.objectMapper import scala.collection.mutable.ArrayBuffer @@ -47,11 +47,18 @@ class SortPartitionsOpExec(descString: String) extends OperatorExecutor { override def onFinish(port: Int): Iterator[TupleLike] = sortTuples() - private def compareTuples(t1: Tuple, t2: Tuple): Boolean = - AttributeTypeUtils.compare( - t1.getField[Any](t1.getSchema.getIndex(desc.sortAttributeName)), - t2.getField[Any](t2.getSchema.getIndex(desc.sortAttributeName)), - t1.getSchema.getAttribute(desc.sortAttributeName).getType - ) < 0 - + private def compareTuples(t1: Tuple, t2: Tuple): Boolean = { + val attributeType = t1.getSchema.getAttribute(desc.sortAttributeName).getType + val attributeIndex = t1.getSchema.getIndex(desc.sortAttributeName) + attributeType match { + case AttributeType.LONG => + t1.getField[Long](attributeIndex) < t2.getField[Long](attributeIndex) + case AttributeType.INTEGER => + t1.getField[Int](attributeIndex) < t2.getField[Int](attributeIndex) + case AttributeType.DOUBLE => + t1.getField[Double](attributeIndex) < t2.getField[Double](attributeIndex) + case _ => + true // unsupported type + } + } } From 1e9215d1cde4cba93edb6d1a7d7175ba0e716521 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Sun, 16 Nov 2025 15:34:20 -0600 Subject: [PATCH 11/15] v2 --- .../sort/StableMergeSortOpExecSpec.scala | 265 ++++++++++-------- 1 file changed, 144 insertions(+), 121 deletions(-) diff --git a/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala index e0fc940ec65..ecb38cfff4f 100644 --- a/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/amber/operator/sort/StableMergeSortOpExecSpec.scala @@ -19,12 +19,13 @@ package org.apache.amber.operator.sort -import 
org.apache.amber.core.tuple.{AttributeType, Schema, Tuple} +import org.apache.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} import org.apache.amber.util.JSONUtils.objectMapper import org.scalatest.flatspec.AnyFlatSpec import java.sql.Timestamp import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.jdk.CollectionConverters.IterableHasAsJava /** * Integration and internal-behavior tests for [[StableMergeSortOpExec]]. @@ -50,6 +51,28 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // Helpers // =========================================================================== + /** Build a Schema with (name, type) pairs, in-order. */ + private def schemaOf(attributes: (String, AttributeType)*): Schema = { + attributes.foldLeft(Schema()) { + case (acc, (name, attrType)) => acc.add(new Attribute(name, attrType)) + } + } + + /** + * Construct a Tuple for the provided schema. + * + * @param values map-like varargs: "colName" -> value. Must provide every column. + * @throws NoSuchElementException if a provided key is not in the schema. + */ + private def tupleOf(schema: Schema, values: (String, Any)*): Tuple = { + val valueMap = values.toMap + val builder = Tuple.builder(schema) + schema.getAttributeNames.asJava.forEach { name => + builder.add(schema.getAttribute(name), valueMap(name)) + } + builder.build() + } + /** Convenience builder for a single sort key with direction (ASC by default). */ private def sortKey( attribute: String, @@ -105,13 +128,13 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== "StableMergeSortOpExec" should "sort integers ascending and preserve duplicate order" in { - val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List( - Tuple.of(schema, "value" -> 3, "label" -> "a"), - Tuple.of(schema, "value" -> 1, "label" -> "first-1"), - Tuple.of(schema, "value" -> 2, "label" -> "b"), - Tuple.of(schema, "value" -> 1, "label" -> "first-2"), - Tuple.of(schema, "value" -> 3, "label" -> "c") + tupleOf(schema, "value" -> 3, "label" -> "a"), + tupleOf(schema, "value" -> 1, "label" -> "first-1"), + tupleOf(schema, "value" -> 2, "label" -> "b"), + tupleOf(schema, "value" -> 1, "label" -> "first-2"), + tupleOf(schema, "value" -> 3, "label" -> "c") ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.map(_.getField[Int]("value")) == List(1, 1, 2, 3, 3)) @@ -121,12 +144,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort integers descending while preserving stability" in { - val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List( - Tuple.of(schema, "value" -> 2, "label" -> "first"), - Tuple.of(schema, "value" -> 2, "label" -> "second"), - Tuple.of(schema, "value" -> 1, "label" -> "third"), - Tuple.of(schema, "value" -> 3, "label" -> "fourth") + tupleOf(schema, "value" -> 2, "label" -> "first"), + tupleOf(schema, "value" -> 2, "label" -> "second"), + tupleOf(schema, "value" -> 1, "label" -> "third"), + tupleOf(schema, "value" -> 3, "label" -> "fourth") ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value", SortPreference.DESC)) @@ -138,12 +161,12 @@ class 
StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "handle string ordering (case-sensitive)" in { - val schema = Schema.of("name" -> AttributeType.STRING) + val schema = schemaOf("name" -> AttributeType.STRING) val tuples = List( - Tuple.of(schema, "name" -> "apple"), - Tuple.of(schema, "name" -> "Banana"), - Tuple.of(schema, "name" -> "banana"), - Tuple.of(schema, "name" -> "APPLE") + tupleOf(schema, "name" -> "apple"), + tupleOf(schema, "name" -> "Banana"), + tupleOf(schema, "name" -> "banana"), + tupleOf(schema, "name" -> "APPLE") ) val sorted = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("name", SortPreference.ASC)) @@ -152,35 +175,35 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "order ASCII strings by Java compareTo (punctuation < digits < uppercase < lowercase)" in { - val schema = Schema.of("str" -> AttributeType.STRING) - val tuples = List("a", "A", "0", "~", "!").map(s => Tuple.of(schema, "str" -> s)) + val schema = schemaOf("str" -> AttributeType.STRING) + val tuples = List("a", "A", "0", "~", "!").map(s => tupleOf(schema, "str" -> s)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("str")) } assert(result.map(_.getField[String]("str")) == List("!", "0", "A", "a", "~")) } it should "sort negatives and zeros correctly" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) - val tuples = List(0, -1, -10, 5, -3, 2).map(v => Tuple.of(schema, "value" -> v)) + val schema = schemaOf("value" -> AttributeType.INTEGER) + val tuples = List(0, -1, -10, 5, -3, 2).map(v => tupleOf(schema, "value" -> v)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.map(_.getField[Int]("value")) == List(-10, -3, -1, 0, 2, 5)) } it should "sort LONG values ascending" in { - val schema = Schema.of("id" -> AttributeType.LONG) - val tuples = List(5L, 1L, 3L, 9L, 0L).map(v => Tuple.of(schema, "id" -> v)) + val schema = schemaOf("id" -> AttributeType.LONG) + val tuples = List(5L, 1L, 3L, 9L, 0L).map(v => tupleOf(schema, "id" -> v)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("id")) } assert(result.map(_.getField[Long]("id")) == List(0L, 1L, 3L, 5L, 9L)) } it should "sort TIMESTAMP ascending" in { - val schema = Schema.of("timestamp" -> AttributeType.TIMESTAMP) + val schema = schemaOf("timestamp" -> AttributeType.TIMESTAMP) val base = Timestamp.valueOf("2022-01-01 00:00:00") val tuples = List( new Timestamp(base.getTime + 4000), new Timestamp(base.getTime + 1000), new Timestamp(base.getTime + 3000), new Timestamp(base.getTime + 2000) - ).map(ts => Tuple.of(schema, "timestamp" -> ts)) + ).map(ts => tupleOf(schema, "timestamp" -> ts)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("timestamp", SortPreference.ASC)) } @@ -189,14 +212,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort TIMESTAMP descending" in { - val schema = Schema.of("timestamp" -> AttributeType.TIMESTAMP) + val schema = schemaOf("timestamp" -> AttributeType.TIMESTAMP) val base = Timestamp.valueOf("2023-01-01 00:00:00") val tuples = List( new Timestamp(base.getTime + 3000), base, new Timestamp(base.getTime + 1000), new Timestamp(base.getTime + 2000) - ).map(ts => Tuple.of(schema, "timestamp" -> ts)) + ).map(ts => tupleOf(schema, "timestamp" -> ts)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("timestamp", SortPreference.DESC)) } @@ -205,15 +228,15 @@ class 
StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "treat numeric strings as strings (lexicographic ordering)" in { - val schema = Schema.of("str" -> AttributeType.STRING) - val tuples = List("2", "10", "1", "11", "20").map(s => Tuple.of(schema, "str" -> s)) + val schema = schemaOf("str" -> AttributeType.STRING) + val tuples = List("2", "10", "1", "11", "20").map(s => tupleOf(schema, "str" -> s)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("str")) } assert(result.map(_.getField[String]("str")) == List("1", "10", "11", "2", "20")) } it should "sort BOOLEAN ascending (false < true) and descending" in { - val schema = Schema.of("bool" -> AttributeType.BOOLEAN) - val tuples = List(true, false, true, false).map(v => Tuple.of(schema, "bool" -> v)) + val schema = schemaOf("bool" -> AttributeType.BOOLEAN) + val tuples = List(true, false, true, false).map(v => tupleOf(schema, "bool" -> v)) val asc = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("bool", SortPreference.ASC)) } @@ -225,7 +248,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort BINARY ascending (unsigned lexicographic) incl. empty and high-bit bytes" in { - val schema = Schema.of("bin" -> AttributeType.BINARY) + val schema = schemaOf("bin" -> AttributeType.BINARY) val bytesEmpty = Array[Byte]() // [] val bytes00 = Array(0x00.toByte) // [00] @@ -236,7 +259,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { val bytesFF = Array(0xff.toByte) // [FF] (-1) val inputTuples = List(bytes80, bytes0000, bytesEmpty, bytesFF, bytes0001, bytes00, bytes7F) - .map(arr => Tuple.of(schema, "bin" -> arr)) + .map(arr => tupleOf(schema, "bin" -> arr)) val sorted = runStableMergeSort(schema, inputTuples) { _.keys = sortKeysBuffer(sortKey("bin")) } @@ -253,10 +276,10 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "sort DOUBLE values including -0.0, 0.0, infinities and NaN" in { - val schema = Schema.of("x" -> AttributeType.DOUBLE) + val schema = schemaOf("x" -> AttributeType.DOUBLE) val tuples = List(Double.NaN, Double.PositiveInfinity, 1.5, -0.0, 0.0, -3.2, Double.NegativeInfinity) - .map(v => Tuple.of(schema, "x" -> v)) + .map(v => tupleOf(schema, "x" -> v)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("x")) } @@ -271,14 +294,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "place NaN before null when sorting DOUBLE ascending (nulls last policy)" in { - val schema = Schema.of("x" -> AttributeType.DOUBLE) + val schema = schemaOf("x" -> AttributeType.DOUBLE) val tuples = List( - Tuple.of(schema, "x" -> null), - Tuple.of(schema, "x" -> Double.NaN), - Tuple.of(schema, "x" -> Double.NegativeInfinity), - Tuple.of(schema, "x" -> 1.0), - Tuple.of(schema, "x" -> Double.PositiveInfinity), - Tuple.of(schema, "x" -> null) + tupleOf(schema, "x" -> null), + tupleOf(schema, "x" -> Double.NaN), + tupleOf(schema, "x" -> Double.NegativeInfinity), + tupleOf(schema, "x" -> 1.0), + tupleOf(schema, "x" -> Double.PositiveInfinity), + tupleOf(schema, "x" -> null) ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("x")) } val values = result.map(_.getField[java.lang.Double]("x")) @@ -291,12 +314,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "place nulls last regardless of ascending or descending" in { - val schema = Schema.of("value" -> AttributeType.INTEGER, 
"label" -> AttributeType.STRING) + val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List( - Tuple.of(schema, "value" -> null, "label" -> "null-1"), - Tuple.of(schema, "value" -> 5, "label" -> "five"), - Tuple.of(schema, "value" -> null, "label" -> "null-2"), - Tuple.of(schema, "value" -> 3, "label" -> "three") + tupleOf(schema, "value" -> null, "label" -> "null-1"), + tupleOf(schema, "value" -> 5, "label" -> "five"), + tupleOf(schema, "value" -> null, "label" -> "null-2"), + tupleOf(schema, "value" -> 3, "label" -> "three") ) val asc = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value", SortPreference.ASC)) @@ -310,20 +333,20 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "order NaN highest on secondary DESC but still place nulls last" in { - val schema = Schema.of( + val schema = schemaOf( "group" -> AttributeType.STRING, "score" -> AttributeType.DOUBLE, "label" -> AttributeType.STRING ) val tuples = List( - Tuple.of(schema, "group" -> "A", "score" -> java.lang.Double.NaN, "label" -> "nan"), - Tuple.of(schema, "group" -> "A", "score" -> Double.PositiveInfinity, "label" -> "pinf"), - Tuple.of(schema, "group" -> "A", "score" -> 1.0, "label" -> "one"), - Tuple.of(schema, "group" -> "A", "score" -> 0.0, "label" -> "zero"), - Tuple.of(schema, "group" -> "A", "score" -> -1.0, "label" -> "neg"), - Tuple.of(schema, "group" -> "A", "score" -> Double.NegativeInfinity, "label" -> "ninf"), - Tuple.of(schema, "group" -> "A", "score" -> null, "label" -> "null-1"), - Tuple.of(schema, "group" -> "A", "score" -> null, "label" -> "null-2") + tupleOf(schema, "group" -> "A", "score" -> java.lang.Double.NaN, "label" -> "nan"), + tupleOf(schema, "group" -> "A", "score" -> Double.PositiveInfinity, "label" -> "pinf"), + tupleOf(schema, "group" -> "A", "score" -> 1.0, "label" -> "one"), + tupleOf(schema, "group" -> "A", "score" -> 0.0, "label" -> "zero"), + tupleOf(schema, "group" -> "A", "score" -> -1.0, "label" -> "neg"), + tupleOf(schema, "group" -> "A", "score" -> Double.NegativeInfinity, "label" -> "ninf"), + tupleOf(schema, "group" -> "A", "score" -> null, "label" -> "null-1"), + tupleOf(schema, "group" -> "A", "score" -> null, "label" -> "null-2") ) val result = runStableMergeSort(schema, tuples) { desc => desc.keys = @@ -336,20 +359,20 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "sort BINARY descending with nulls last and preserve stability for equal byte arrays" in { - val schema = Schema.of("bin" -> AttributeType.BINARY, "id" -> AttributeType.STRING) + val schema = schemaOf("bin" -> AttributeType.BINARY, "id" -> AttributeType.STRING) val key00 = Array(0x00.toByte) val keyFF = Array(0xff.toByte) val inputTuples = List( - Tuple.of(schema, "bin" -> keyFF, "id" -> "ff-1"), - Tuple.of(schema, "bin" -> key00, "id" -> "00-1"), - Tuple.of( + tupleOf(schema, "bin" -> keyFF, "id" -> "ff-1"), + tupleOf(schema, "bin" -> key00, "id" -> "00-1"), + tupleOf( schema, "bin" -> key00, "id" -> "00-2" ), // equal to previous; stability should keep order - Tuple.of(schema, "bin" -> null, "id" -> "null-1") + tupleOf(schema, "bin" -> null, "id" -> "null-1") ) val sorted = runStableMergeSort(schema, inputTuples) { @@ -364,7 +387,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "support multi-key sorting with mixed attribute types" in { - val schema = Schema.of( + val schema = schemaOf( "dept" -> 
AttributeType.STRING, "score" -> AttributeType.DOUBLE, "name" -> AttributeType.STRING, @@ -372,29 +395,29 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { ) val base = new Timestamp(Timestamp.valueOf("2020-01-01 00:00:00").getTime) val tuples = List( - Tuple.of(schema, "dept" -> "Sales", "score" -> 9.5, "name" -> "Alice", "hired" -> base), - Tuple.of( + tupleOf(schema, "dept" -> "Sales", "score" -> 9.5, "name" -> "Alice", "hired" -> base), + tupleOf( schema, "dept" -> "Sales", "score" -> 9.5, "name" -> "Bob", "hired" -> new Timestamp(base.getTime + 1000) ), - Tuple.of( + tupleOf( schema, "dept" -> "Sales", "score" -> 8.0, "name" -> "Carol", "hired" -> new Timestamp(base.getTime + 2000) ), - Tuple.of( + tupleOf( schema, "dept" -> "Engineering", "score" -> 9.5, "name" -> "Dave", "hired" -> new Timestamp(base.getTime + 3000) ), - Tuple.of( + tupleOf( schema, "dept" -> null, "score" -> 9.5, @@ -413,7 +436,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "handle multi-key with descending primary and ascending secondary" in { - val schema = Schema.of( + val schema = schemaOf( "major" -> AttributeType.INTEGER, "minor" -> AttributeType.INTEGER, "idx" -> AttributeType.INTEGER @@ -426,7 +449,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { (1, 1, 4), (3, 0, 5), (3, 2, 6) - ).map { case (ma, mi, i) => Tuple.of(schema, "major" -> ma, "minor" -> mi, "idx" -> i) } + ).map { case (ma, mi, i) => tupleOf(schema, "major" -> ma, "minor" -> mi, "idx" -> i) } val result = runStableMergeSort(schema, tuples) { desc => desc.keys = sortKeysBuffer(sortKey("major", SortPreference.DESC), sortKey("minor", SortPreference.ASC)) @@ -440,7 +463,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "use the third key as a tiebreaker (ASC, ASC, then DESC)" in { - val schema = Schema.of( + val schema = schemaOf( "keyA" -> AttributeType.INTEGER, "keyB" -> AttributeType.INTEGER, "keyC" -> AttributeType.INTEGER, @@ -452,7 +475,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { (1, 1, 2, "x2"), (1, 0, 9, "y9") ).map { - case (a, b, c, id) => Tuple.of(schema, "keyA" -> a, "keyB" -> b, "keyC" -> c, "id" -> id) + case (a, b, c, id) => tupleOf(schema, "keyA" -> a, "keyB" -> b, "keyC" -> c, "id" -> id) } val result = runStableMergeSort(schema, tuples) { _.keys = @@ -462,7 +485,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "place nulls last across multiple keys (primary ASC, secondary DESC)" in { - val schema = Schema.of("keyA" -> AttributeType.STRING, "keyB" -> AttributeType.INTEGER) + val schema = schemaOf("keyA" -> AttributeType.STRING, "keyB" -> AttributeType.INTEGER) val tuples = List( ("x", 2), (null, 1), @@ -470,7 +493,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { (null, 5), ("a", 9), ("a", 2) - ).map { case (s, i) => Tuple.of(schema, "keyA" -> s, "keyB" -> i) } + ).map { case (s, i) => tupleOf(schema, "keyA" -> s, "keyB" -> i) } val result = runStableMergeSort(schema, tuples) { desc => desc.keys = sortKeysBuffer(sortKey("keyA", SortPreference.ASC), sortKey("keyB", SortPreference.DESC)) @@ -480,16 +503,16 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "when primary keys are both null, fall back to secondary ASC (nulls still after non-nulls)" in { - val schema = Schema.of( + val schema = schemaOf( "keyA" -> AttributeType.STRING, "keyB" -> AttributeType.INTEGER, "id" -> AttributeType.STRING ) val tuples = List( - Tuple.of(schema, "keyA" -> "A", "keyB" -> 2, "id" -> "non-null-a"), - Tuple.of(schema, "keyA" 
-> null, "keyB" -> 5, "id" -> "null-a-5"), - Tuple.of(schema, "keyA" -> null, "keyB" -> 1, "id" -> "null-a-1"), - Tuple.of(schema, "keyA" -> "B", "keyB" -> 9, "id" -> "non-null-b") + tupleOf(schema, "keyA" -> "A", "keyB" -> 2, "id" -> "non-null-a"), + tupleOf(schema, "keyA" -> null, "keyB" -> 5, "id" -> "null-a-5"), + tupleOf(schema, "keyA" -> null, "keyB" -> 1, "id" -> "null-a-1"), + tupleOf(schema, "keyA" -> "B", "keyB" -> 9, "id" -> "non-null-b") ) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("keyA"), sortKey("keyB")) @@ -501,7 +524,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "use INTEGER secondary key to break ties when primary BINARY keys are equal" in { - val schema = Schema.of( + val schema = schemaOf( "bin" -> AttributeType.BINARY, "score" -> AttributeType.INTEGER, "label" -> AttributeType.STRING @@ -511,9 +534,9 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { val key01 = Array(0x01.toByte) val inputTuples = List( - Tuple.of(schema, "bin" -> key01, "score" -> 1, "label" -> "01-score1"), - Tuple.of(schema, "bin" -> key00, "score" -> 9, "label" -> "00-score9"), - Tuple.of(schema, "bin" -> key01, "score" -> 2, "label" -> "01-score2") + tupleOf(schema, "bin" -> key01, "score" -> 1, "label" -> "01-score1"), + tupleOf(schema, "bin" -> key00, "score" -> 9, "label" -> "00-score9"), + tupleOf(schema, "bin" -> key01, "score" -> 2, "label" -> "01-score2") ) val sorted = runStableMergeSort(schema, inputTuples) { desc => @@ -531,8 +554,8 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "preserve original order among tuples with equal keys" in { - val schema = Schema.of("key" -> AttributeType.INTEGER, "index" -> AttributeType.INTEGER) - val tuples = (0 until 100).map(i => Tuple.of(schema, "key" -> (i % 5), "index" -> i)) + val schema = schemaOf("key" -> AttributeType.INTEGER, "index" -> AttributeType.INTEGER) + val tuples = (0 until 100).map(i => tupleOf(schema, "key" -> (i % 5), "index" -> i)) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("key")) } val grouped = result.groupBy(_.getField[Int]("key")).values grouped.foreach { group => @@ -542,9 +565,9 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "act as a stable pass-through when keys are empty" in { - val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) val tuples = List(3, 1, 4, 1, 5, 9).zipWithIndex - .map { case (v, i) => Tuple.of(schema, "value" -> v, "label" -> s"row-$i") } + .map { case (v, i) => tupleOf(schema, "value" -> v, "label" -> s"row-$i") } val result = runStableMergeSort(schema, tuples) { desc => desc.keys = ListBuffer.empty[SortCriteriaUnit] } @@ -555,8 +578,8 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "buffer tuples until onFinish is called" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) - val tuple = Tuple.of(schema, "value" -> 2) + val schema = schemaOf("value" -> AttributeType.INTEGER) + val tuple = tupleOf(schema, "value" -> 2) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() @@ -568,22 +591,22 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "return empty for empty input" in { - val 
schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val result = runStableMergeSort(schema, Seq.empty) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.isEmpty) } it should "handle single element input" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) - val result = runStableMergeSort(schema, Seq(Tuple.of(schema, "value" -> 42))) { + val schema = schemaOf("value" -> AttributeType.INTEGER) + val result = runStableMergeSort(schema, Seq(tupleOf(schema, "value" -> 42))) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.map(_.getField[Int]("value")) == List(42)) } it should "sort large inputs efficiently (sanity on boundaries)" in { - val schema = Schema.of("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) - val tuples = (50000 to 1 by -1).map(i => Tuple.of(schema, "value" -> i, "label" -> s"row-$i")) + val schema = schemaOf("value" -> AttributeType.INTEGER, "label" -> AttributeType.STRING) + val tuples = (50000 to 1 by -1).map(i => tupleOf(schema, "value" -> i, "label" -> s"row-$i")) val result = runStableMergeSort(schema, tuples) { _.keys = sortKeysBuffer(sortKey("value")) } assert(result.head.getField[Int]("value") == 1) assert(result(1).getField[Int]("value") == 2) @@ -595,14 +618,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== it should "merge incrementally: bucket sizes match binary decomposition after each push" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() val totalCount = 64 for (index <- (totalCount - 1) to 0 by -1) { - exec.processTuple(Tuple.of(schema, "value" -> index), 0) + exec.processTuple(tupleOf(schema, "value" -> index), 0) val sizes = getBucketSizes(exec).sorted assert(sizes == binaryDecomposition(totalCount - index)) } @@ -611,7 +634,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "maintain bucket-stack invariant (no adjacent equal sizes) after each insertion" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() @@ -619,7 +642,7 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { val totalCount = 200 val stream = (0 until totalCount by 2) ++ (1 until totalCount by 2) stream.foreach { index => - exec.processTuple(Tuple.of(schema, "value" -> (totalCount - 1 - index)), 0) + exec.processTuple(tupleOf(schema, "value" -> (totalCount - 1 - index)), 0) val sizes = getBucketSizes(exec) sizes.sliding(2).foreach { pair => if (pair.length == 2) assert(pair.head != pair.last) @@ -630,12 +653,12 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "form expected bucket sizes at milestones (1,2,3,4,7,8,15,16)" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)) exec.open() - val inputSequence = (100 to 1 by -1).map(i => Tuple.of(schema, 
"value" -> i)) + val inputSequence = (100 to 1 by -1).map(i => tupleOf(schema, "value" -> i)) val milestones = Set(1, 2, 3, 4, 7, 8, 15, 16) var pushed = 0 inputSequence.foreach { t => @@ -654,20 +677,20 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== "mergeSortedBuckets" should "be stable: left bucket wins on equal keys" in { - val schema = Schema.of("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) + val schema = schemaOf("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("key")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() // Seed to resolve schema/keys once. - exec.processTuple(Tuple.of(schema, "key" -> 0, "id" -> "seed"), 0) + exec.processTuple(tupleOf(schema, "key" -> 0, "id" -> "seed"), 0) val left = ArrayBuffer( - Tuple.of(schema, "key" -> 1, "id" -> "L1"), - Tuple.of(schema, "key" -> 2, "id" -> "L2") + tupleOf(schema, "key" -> 1, "id" -> "L1"), + tupleOf(schema, "key" -> 2, "id" -> "L2") ) val right = ArrayBuffer( - Tuple.of(schema, "key" -> 1, "id" -> "R1"), - Tuple.of(schema, "key" -> 3, "id" -> "R3") + tupleOf(schema, "key" -> 1, "id" -> "R1"), + tupleOf(schema, "key" -> 3, "id" -> "R3") ) val merged = exec.mergeSortedBuckets(left, right) @@ -677,15 +700,15 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } "mergeSortedBuckets" should "handle empty left bucket" in { - val schema = Schema.of("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) + val schema = schemaOf("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("key")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - exec.processTuple(Tuple.of(schema, "key" -> 0, "id" -> "seed"), 0) // seed keys + exec.processTuple(tupleOf(schema, "key" -> 0, "id" -> "seed"), 0) // seed keys val left = ArrayBuffer.empty[Tuple] val right = ArrayBuffer( - Tuple.of(schema, "key" -> 1, "id" -> "r1"), - Tuple.of(schema, "key" -> 2, "id" -> "r2") + tupleOf(schema, "key" -> 1, "id" -> "r1"), + tupleOf(schema, "key" -> 2, "id" -> "r2") ) val merged = exec.mergeSortedBuckets(left, right) assert(merged.map(_.getField[String]("id")).toList == List("r1", "r2")) @@ -693,14 +716,14 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } "mergeSortedBuckets" should "handle empty right bucket" in { - val schema = Schema.of("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) + val schema = schemaOf("key" -> AttributeType.INTEGER, "id" -> AttributeType.STRING) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("key")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - exec.processTuple(Tuple.of(schema, "key" -> 0, "id" -> "seed"), 0) + exec.processTuple(tupleOf(schema, "key" -> 0, "id" -> "seed"), 0) val left = ArrayBuffer( - Tuple.of(schema, "key" -> 1, "id" -> "l1"), - Tuple.of(schema, "key" -> 2, "id" -> "l2") + tupleOf(schema, "key" -> 1, "id" -> "l1"), + tupleOf(schema, "key" -> 2, "id" -> "l2") ) val right = ArrayBuffer.empty[Tuple] val merged = exec.mergeSortedBuckets(left, right) @@ -713,16 +736,16 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { // =========================================================================== "pushBucketAndCombine" should "merge two size-2 buckets into size-4 on push 
(with existing size-1 seed)" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() // seed to compile keys -> results in one size-1 bucket in the stack - exec.processTuple(Tuple.of(schema, "value" -> 0), 0) + exec.processTuple(tupleOf(schema, "value" -> 0), 0) // two pre-sorted buckets of size 2 - val bucket1 = ArrayBuffer(Tuple.of(schema, "value" -> 1), Tuple.of(schema, "value" -> 3)) - val bucket2 = ArrayBuffer(Tuple.of(schema, "value" -> 2), Tuple.of(schema, "value" -> 4)) + val bucket1 = ArrayBuffer(tupleOf(schema, "value" -> 1), tupleOf(schema, "value" -> 3)) + val bucket2 = ArrayBuffer(tupleOf(schema, "value" -> 2), tupleOf(schema, "value" -> 4)) exec.pushBucketAndCombine(bucket1) // sizes now [1,2] exec.pushBucketAndCombine(bucket2) // equal top [2,2] => merged to 4; sizes [1,4] @@ -733,10 +756,10 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "return the same sorted output if onFinish is called twice in a row" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - List(3, 1, 2).foreach(i => exec.processTuple(Tuple.of(schema, "value" -> i), 0)) + List(3, 1, 2).foreach(i => exec.processTuple(tupleOf(schema, "value" -> i), 0)) val first = exec.onFinish(0).map(_.asInstanceOf[Tuple]).toList.map(_.getField[Int]("value")) val second = exec.onFinish(0).map(_.asInstanceOf[Tuple]).toList.map(_.getField[Int]("value")) @@ -746,10 +769,10 @@ class StableMergeSortOpExecSpec extends AnyFlatSpec { } it should "have processTuple always return empty iterators until finish" in { - val schema = Schema.of("value" -> AttributeType.INTEGER) + val schema = schemaOf("value" -> AttributeType.INTEGER) val desc = new StableMergeSortOpDesc(); desc.keys = sortKeysBuffer(sortKey("value")) val exec = new StableMergeSortOpExec(objectMapper.writeValueAsString(desc)); exec.open() - val immediates = (10 to 1 by -1).map(i => exec.processTuple(Tuple.of(schema, "value" -> i), 0)) + val immediates = (10 to 1 by -1).map(i => exec.processTuple(tupleOf(schema, "value" -> i), 0)) assert(immediates.forall(_.isEmpty)) val out = exec.onFinish(0).map(_.asInstanceOf[Tuple]).toList.map(_.getField[Int]("value")) assert(out == (1 to 10).toList) From 72153aa7eb965f8d6be274c6770023392815cbd3 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Thu, 4 Dec 2025 23:50:22 -0600 Subject: [PATCH 12/15] New Design --- .../service/resource/DatasetResource.scala | 237 ++++++++--- .../dataset-detail.component.ts | 18 +- .../service/user/dataset/dataset.service.ts | 375 ++++++++++-------- 3 files changed, 386 insertions(+), 244 deletions(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index f49588546ac..f593962cd03 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -63,7 +63,9 @@ import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} 
import java.util import java.util.Optional +import java.util.concurrent.atomic.AtomicLong import java.util.zip.{ZipEntry, ZipOutputStream} +import scala.collection.concurrent.TrieMap import scala.collection.mutable.ListBuffer import scala.jdk.CollectionConverters._ import scala.jdk.OptionConverters._ @@ -171,6 +173,25 @@ object DatasetResource { fileNodes: List[DatasetFileNode], size: Long ) + + /** Case class to hold state of an ongoing multipart upload session */ + private case class SessionState( + token: String, + repoName: String, + did: Int, + uid: Int, + uploadId: String, + filePath: String, + physicalAddress: String, + presignedUrls: Array[String], + var totalBytes: AtomicLong = new AtomicLong(0L), + @volatile var status: String = "ongoing", + var parts: ListBuffer[(Int, String)] = ListBuffer.empty + ) + + /** In-memory map of active upload sessions (uploadToken -> SessionState) */ + private val uploadSessions = TrieMap[String, SessionState]() + } @Produces(Array(MediaType.APPLICATION_JSON, "image/jpeg", "application/pdf")) @@ -669,12 +690,11 @@ class DatasetResource { @QueryParam("ownerEmail") ownerEmail: String, @QueryParam("datasetName") datasetName: String, @QueryParam("filePath") encodedUrl: String, - @QueryParam("uploadId") uploadId: Optional[String], @QueryParam("numParts") numParts: Optional[Integer], payload: Map[ String, Any - ], // Expecting {"parts": [...], "physicalAddress": "s3://bucket/path"} + ], // Expecting {"uploadToken": "..."} for abort and finish @Auth user: SessionUser ): Response = { val uid = user.getUid @@ -702,77 +722,66 @@ class DatasetResource { throw new BadRequestException("numParts is required for initialization") ) - val presignedResponse = LakeFSStorageClient.initiatePresignedMultipartUploads( + val presign = LakeFSStorageClient.initiatePresignedMultipartUploads( repositoryName, filePath, numPartsValue ) + val uploadIdStr = presign.getUploadId + val presignedUrlsArr = presign.getPresignedUrls.asScala.toArray.map(_.toString) + val physicalAddr = presign.getPhysicalAddress + + val token = java.util.UUID.randomUUID().toString + + DatasetResource.uploadSessions.put( + token, + SessionState( + token = token, + repoName = dataset.getRepositoryName, + did = dataset.getDid, + uid = uid, + uploadId = uploadIdStr, + filePath = filePath, + physicalAddress = physicalAddr, + presignedUrls = presignedUrlsArr + ) + ) + Response .ok( Map( - "uploadId" -> presignedResponse.getUploadId, - "presignedUrls" -> presignedResponse.getPresignedUrls, - "physicalAddress" -> presignedResponse.getPhysicalAddress + "uploadToken" -> token ) ) .build() case "finish" => - val uploadIdValue = uploadId.toScala.getOrElse( - throw new BadRequestException("uploadId is required for completion") - ) - - // Extract parts from the payload - val partsList = payload.get("parts") match { - case Some(rawList: List[_]) => - try { - rawList.map { - case part: Map[_, _] => - val partMap = part.asInstanceOf[Map[String, Any]] - val partNumber = partMap.get("PartNumber") match { - case Some(i: Int) => i - case Some(s: String) => s.toInt - case _ => throw new BadRequestException("Invalid or missing PartNumber") - } - val eTag = partMap.get("ETag") match { - case Some(s: String) => s - case _ => throw new BadRequestException("Invalid or missing ETag") - } - (partNumber, eTag) - - case _ => - throw new BadRequestException("Each part must be a Map[String, Any]") - } - } catch { - case e: NumberFormatException => - throw new BadRequestException("PartNumber must be an integer", e) - } - - case _ => - 
throw new BadRequestException("Missing or invalid 'parts' list in payload") + val tokenValue = payload.get("uploadToken").map(_.asInstanceOf[String]).getOrElse { + throw new BadRequestException("uploadToken is required for completion") } + val session = DatasetResource.uploadSessions.getOrElse( + tokenValue, { + throw new NotFoundException("Upload session not found or already finalized") + } + ) - // Extract physical address from payload - val physicalAddress = payload.get("physicalAddress") match { - case Some(address: String) => address - case _ => throw new BadRequestException("Missing physicalAddress in payload") - } + if (user.getUid != session.uid) + throw new ForbiddenException("User has no access to this upload session") - // Complete the multipart upload with parts and physical address + DatasetResource.uploadSessions.remove(tokenValue) val objectStats = LakeFSStorageClient.completePresignedMultipartUploads( repositoryName, - filePath, - uploadIdValue, - partsList, - physicalAddress + session.filePath, + session.uploadId, + session.parts.toList, + session.physicalAddress ) val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) if (sizeBytes > maxSingleFileUploadBytes) { - // Roll back staged object to previous committed state (or remove if new). try { LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) } catch { - case _: Exception => // best-effort cleanup + case _: Exception => } throw new WebApplicationException( s"File exceeds maximum allowed size of " + @@ -790,32 +799,134 @@ class DatasetResource { .build() case "abort" => - val uploadIdValue = uploadId.toScala.getOrElse( - throw new BadRequestException("uploadId is required for abortion") + val tokenValue = payload + .get("uploadToken") + .map(_.asInstanceOf[String]) + .getOrElse { + throw new BadRequestException("uploadToken is required for abortion") + } + val session = DatasetResource.uploadSessions.getOrElse( + tokenValue, { + throw new NotFoundException("Upload session not found or already finished") + } ) - // Extract physical address from payload - val physicalAddress = payload.get("physicalAddress") match { - case Some(address: String) => address - case _ => throw new BadRequestException("Missing physicalAddress in payload") + if (user.getUid != session.uid) { + throw new ForbiddenException("User has no access to this upload session") } - // Abort the multipart upload + DatasetResource.uploadSessions.remove(tokenValue) + LakeFSStorageClient.abortPresignedMultipartUploads( - repositoryName, - filePath, - uploadIdValue, - physicalAddress + session.repoName, + session.filePath, + session.uploadId, + session.physicalAddress ) - Response.ok(Map("message" -> "Multipart upload aborted successfully")).build() - case _ => throw new BadRequestException("Invalid type parameter. 
Use 'init', 'finish', or 'abort'.") } } } + @POST + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/multipart-upload/part") + @Consumes(Array(MediaType.APPLICATION_OCTET_STREAM)) + def uploadPart( + @QueryParam("token") uploadToken: String, + @QueryParam("partNumber") partNumber: Int, + partStream: InputStream, + @Context headers: HttpHeaders, + @Auth user: SessionUser + ): Response = { + val sessionOpt = DatasetResource.uploadSessions.get(uploadToken) + if (sessionOpt.isEmpty) { + throw new NotFoundException("Upload session not found or expired") + } + val session = sessionOpt.get + + if (user.getUid != session.uid) + throw new ForbiddenException("User has no access to this upload session") + + if (session.status == "aborted") + throw new WebApplicationException("Upload session already aborted", Response.Status.GONE) + + if (partNumber < 1 || partNumber > session.presignedUrls.length) + throw new BadRequestException("Invalid partNumber") + + val presignedUrl = session.presignedUrls(partNumber - 1) + + val conn = new URL(presignedUrl).openConnection().asInstanceOf[HttpURLConnection] + conn.setDoOutput(true) + conn.setRequestMethod("PUT") + + // Don't trust Content-Length for enforcement, we only use it to hint streaming mode if present + Option(headers.getHeaderString(HttpHeaders.CONTENT_LENGTH)) + .flatMap(s => scala.util.Try(s.toLong).toOption) + .foreach(len => conn.setFixedLengthStreamingMode(len)) + conn.setRequestProperty("Content-Type", "application/octet-stream") + + val outStream = conn.getOutputStream + val buffer = new Array[Byte](8 * 1024) + var bytesRead = partStream.read(buffer) + + try { + while (bytesRead != -1) { + val newTotal = session.totalBytes.addAndGet(bytesRead.toLong) + if (newTotal > maxSingleFileUploadBytes) { + session.status = "aborted" + DatasetResource.uploadSessions.remove(uploadToken) + + // Close streams before aborting + try outStream.close() + catch { case _: Exception => () } + try partStream.close() + catch { case _: Exception => () } + + LakeFSStorageClient.abortPresignedMultipartUploads( + session.repoName, + session.filePath, + session.uploadId, + session.physicalAddress + ) + + throw new WebApplicationException( + s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB. 
" + + "Upload has been rolled back.", + Response.Status.REQUEST_ENTITY_TOO_LARGE + ) + } + outStream.write(buffer, 0, bytesRead) + bytesRead = partStream.read(buffer) + } + } finally { + try outStream.close() + catch { case _: Exception => () } + try partStream.close() + catch { case _: Exception => () } + } + + val code = conn.getResponseCode + if (code != HttpURLConnection.HTTP_OK && code != HttpURLConnection.HTTP_CREATED) { + conn.disconnect() + throw new RuntimeException(s"Part $partNumber upload failed (HTTP $code)") + } + + val eTag = Option(conn.getHeaderField("ETag")).map(_.replace("\"", "")).getOrElse("") + conn.disconnect() + + session.synchronized { + if (session.status == "aborted") { + throw new WebApplicationException("Upload session already aborted", Response.Status.GONE) + } + session.parts += ((partNumber, eTag)) + } + + Response.ok().build() + } + @POST @RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/{did}/update/publicity") diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts index b4d12f5a28e..fff40cdf414 100644 --- a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts +++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/dataset-detail.component.ts @@ -104,8 +104,8 @@ export class DatasetDetailComponent implements OnInit { // List of upload tasks – each task tracked by its filePath public uploadTasks: Array< MultipartUploadProgress & { - filePath: string; - } + filePath: string; + } > = []; @Output() userMakeChanges = new EventEmitter(); @@ -416,8 +416,7 @@ export class DatasetDetailComponent implements OnInit { filePath: file.name, percentage: 0, status: "initializing", - uploadId: "", - physicalAddress: "", + uploadToken: "", }); // Start multipart upload const subscription = this.datasetService @@ -558,21 +557,24 @@ export class DatasetDetailComponent implements OnInit { this.onUploadComplete(); } + if (!task.uploadToken) { + this.uploadTasks = this.uploadTasks.filter(t => t.filePath !== task.filePath); + return; + } + this.datasetService .finalizeMultipartUpload( this.ownerEmail, this.datasetName, task.filePath, - task.uploadId, - [], - task.physicalAddress, + task.uploadToken, true // abort flag ) .pipe(untilDestroyed(this)) .subscribe(() => { this.notificationService.info(`${task.filePath} uploading has been terminated`); }); - // Remove the aborted task immediately + this.uploadTasks = this.uploadTasks.filter(t => t.filePath !== task.filePath); } diff --git a/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts b/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts index c09125d73b1..a3a55900fd9 100644 --- a/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts +++ b/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts @@ -27,6 +27,7 @@ import { DashboardDataset } from "../../../type/dashboard-dataset.interface"; import { DatasetFileNode } from "../../../../common/type/datasetVersionFileTree"; import { DatasetStagedObject } from "../../../../common/type/dataset-staged-object"; import { GuiConfigService } from "../../../../common/service/gui-config.service"; +import { AuthService } from "src/app/common/service/user/auth.service"; export const DATASET_BASE_URL = "dataset"; export const DATASET_CREATE_URL = DATASET_BASE_URL + "/create"; @@ -51,11 +52,10 @@ 
export interface MultipartUploadProgress { filePath: string; percentage: number; status: "initializing" | "uploading" | "finished" | "aborted"; - uploadId: string; - physicalAddress: string; - uploadSpeed?: number; // bytes per second - estimatedTimeRemaining?: number; // seconds - totalTime?: number; // total seconds taken + uploadToken: string; + uploadSpeed?: number; // bytes per second + estimatedTimeRemaining?: number; // seconds + totalTime?: number; // total seconds taken } @Injectable({ @@ -122,6 +122,7 @@ export class DatasetService { public retrieveAccessibleDatasets(): Observable { return this.http.get(`${AppSettings.getApiEndpoint()}/${DATASET_LIST_URL}`); } + public createDatasetVersion(did: number, newVersion: string): Observable { return this.http .post<{ @@ -141,6 +142,13 @@ export class DatasetService { /** * Handles multipart upload for large files using RxJS, * with a concurrency limit on how many parts we process in parallel. + * + * Backend flow: + * POST /dataset/multipart-upload?type=init&ownerEmail=...&datasetName=...&filePath=...&numParts=N + * -> { uploadToken } + * POST /dataset/multipart-upload/part?token=&partNumber= (body: raw chunk) + * POST /dataset/multipart-upload?type=finish (body: { uploadToken }) + * POST /dataset/multipart-upload?type=abort (body: { uploadToken }) */ public multipartUpload( ownerEmail: string, @@ -152,8 +160,8 @@ export class DatasetService { ): Observable { const partCount = Math.ceil(file.size / partSize); - return new Observable(observer => { - // Track upload progress for each part independently + return new Observable(observer => { + // Track upload progress (bytes) for each part independently const partProgress = new Map(); // Progress tracking state @@ -162,8 +170,15 @@ export class DatasetService { let lastETA = 0; let lastUpdateTime = 0; - // Calculate stats with smoothing + const lastStats = { + uploadSpeed: 0, + estimatedTimeRemaining: 0, + totalTime: 0, + }; + const getTotalTime = () => (startTime ? (Date.now() - startTime) / 1000 : 0); + + // Calculate stats with smoothing and simple throttling (~1s) const calculateStats = (totalUploaded: number) => { if (startTime === null) { startTime = Date.now(); @@ -172,25 +187,28 @@ export class DatasetService { const now = Date.now(); const elapsed = getTotalTime(); - // Throttle updates to every 1s const shouldUpdate = now - lastUpdateTime >= 1000; if (!shouldUpdate) { - return null; + // keep totalTime fresh even when throttled + lastStats.totalTime = elapsed; + return lastStats; } lastUpdateTime = now; - // Calculate speed with moving average const currentSpeed = elapsed > 0 ? totalUploaded / elapsed : 0; speedSamples.push(currentSpeed); - if (speedSamples.length > 5) speedSamples.shift(); - const avgSpeed = speedSamples.reduce((a, b) => a + b, 0) / speedSamples.length; + if (speedSamples.length > 5) { + speedSamples.shift(); + } + const avgSpeed = + speedSamples.length > 0 + ? speedSamples.reduce((a, b) => a + b, 0) / speedSamples.length + : 0; - // Calculate smooth ETA const remaining = file.size - totalUploaded; let eta = avgSpeed > 0 ? 
remaining / avgSpeed : 0; - eta = Math.min(eta, 24 * 60 * 60); // cap ETA at 24h, 86400 sec + eta = Math.min(eta, 24 * 60 * 60); // cap ETA at 24h - // Smooth ETA changes (limit to 30% change) if (lastETA > 0 && eta > 0) { const maxChange = lastETA * 0.3; const diff = Math.abs(eta - lastETA); @@ -200,229 +218,240 @@ export class DatasetService { } lastETA = eta; - // Near completion optimization const percentComplete = (totalUploaded / file.size) * 100; if (percentComplete > 95) { eta = Math.min(eta, 10); } - return { - uploadSpeed: avgSpeed, - estimatedTimeRemaining: Math.max(0, Math.round(eta)), - totalTime: elapsed, - }; + lastStats.uploadSpeed = avgSpeed; + lastStats.estimatedTimeRemaining = Math.max(0, Math.round(eta)); + lastStats.totalTime = elapsed; + + return lastStats; }; - const subscription = this.initiateMultipartUpload(ownerEmail, datasetName, filePath, partCount) + // 1. INIT: ask backend to create a LakeFS multipart upload session and get uploadToken + const initParams = new HttpParams() + .set("type", "init") + .set("ownerEmail", ownerEmail) + .set("datasetName", datasetName) + .set("filePath", encodeURIComponent(filePath)) + .set("numParts", partCount.toString()); + + const init$ = this.http.post<{ uploadToken: string }>( + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, + {}, + { params: initParams } + ); + + const subscription = init$ .pipe( - switchMap(initiateResponse => { - const { uploadId, presignedUrls, physicalAddress } = initiateResponse; - if (!uploadId) { + switchMap(initResp => { + const uploadToken = initResp.uploadToken; + if (!uploadToken) { observer.error(new Error("Failed to initiate multipart upload")); return EMPTY; } + + // Notify UI that upload is starting observer.next({ - filePath: filePath, + filePath, percentage: 0, status: "initializing", - uploadId: uploadId, - physicalAddress: physicalAddress, + uploadToken, uploadSpeed: 0, estimatedTimeRemaining: 0, totalTime: 0, }); - // Keep track of all uploaded parts - const uploadedParts: { PartNumber: number; ETag: string }[] = []; - - // 1) Convert presignedUrls into a stream of URLs - return from(presignedUrls).pipe( - // 2) Use mergeMap with concurrency limit to upload chunk by chunk - mergeMap((url, index) => { - const partNumber = index + 1; - const start = index * partSize; - const end = Math.min(start + partSize, file.size); - const chunk = file.slice(start, end); - - // Upload the chunk - return new Observable(partObserver => { - const xhr = new XMLHttpRequest(); - - xhr.upload.addEventListener("progress", event => { - if (event.lengthComputable) { - // Update this specific part's progress - partProgress.set(partNumber, event.loaded); - - // Calculate total progress across all parts - let totalUploaded = 0; - partProgress.forEach(bytes => (totalUploaded += bytes)); - const percentage = Math.round((totalUploaded / file.size) * 100); - const stats = calculateStats(totalUploaded); - - observer.next({ - filePath, - percentage: Math.min(percentage, 99), // Cap at 99% until finalized - status: "uploading", - uploadId, - physicalAddress, - ...stats, - }); - } - }); - - xhr.addEventListener("load", () => { - if (xhr.status === 200 || xhr.status === 201) { - const etag = xhr.getResponseHeader("ETag")?.replace(/"/g, ""); - if (!etag) { - partObserver.error(new Error(`Missing ETag for part ${partNumber}`)); - return; + // 2. 
Upload each part to /multipart-upload/part using XMLHttpRequest + return from(Array.from({ length: partCount }, (_, i) => i)).pipe( + mergeMap( + index => { + const partNumber = index + 1; + const start = index * partSize; + const end = Math.min(start + partSize, file.size); + const chunk = file.slice(start, end); + + return new Observable(partObserver => { + const xhr = new XMLHttpRequest(); + + xhr.upload.addEventListener("progress", event => { + if (event.lengthComputable) { + partProgress.set(partNumber, event.loaded); + + let totalUploaded = 0; + partProgress.forEach(bytes => { + totalUploaded += bytes; + }); + + const percentage = Math.round((totalUploaded / file.size) * 100); + const stats = calculateStats(totalUploaded); + + observer.next({ + filePath, + percentage: Math.min(percentage, 99), + status: "uploading", + uploadToken, + ...stats, + }); + } + }); + + xhr.addEventListener("load", () => { + if (xhr.status === 200 || xhr.status === 204) { + // Mark part as fully uploaded + partProgress.set(partNumber, chunk.size); + + let totalUploaded = 0; + partProgress.forEach(bytes => { + totalUploaded += bytes; + }); + + // Force stats recompute on completion + lastUpdateTime = 0; + const percentage = Math.round((totalUploaded / file.size) * 100); + const stats = calculateStats(totalUploaded); + + observer.next({ + filePath, + percentage: Math.min(percentage, 99), + status: "uploading", + uploadToken, + ...stats, + }); + + partObserver.complete(); + } else { + partObserver.error( + new Error(`Failed to upload part ${partNumber} (HTTP ${xhr.status})`) + ); } + }); - // Mark this part as fully uploaded - partProgress.set(partNumber, chunk.size); - uploadedParts.push({ PartNumber: partNumber, ETag: etag }); - - // Recalculate progress - let totalUploaded = 0; - partProgress.forEach(bytes => (totalUploaded += bytes)); - const percentage = Math.round((totalUploaded / file.size) * 100); - lastUpdateTime = 0; - const stats = calculateStats(totalUploaded); - - observer.next({ - filePath, - percentage: Math.min(percentage, 99), - status: "uploading", - uploadId, - physicalAddress, - ...stats, - }); - partObserver.complete(); - } else { + xhr.addEventListener("error", () => { + // Remove failed part from progress + partProgress.delete(partNumber); partObserver.error(new Error(`Failed to upload part ${partNumber}`)); - } - }); + }); - xhr.addEventListener("error", () => { - // Remove failed part from progress - partProgress.delete(partNumber); - partObserver.error(new Error(`Failed to upload part ${partNumber}`)); - }); + const partUrl = + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload/part` + + `?token=${encodeURIComponent(uploadToken)}&partNumber=${partNumber}`; - xhr.open("PUT", url); - xhr.send(chunk); - }); - }, concurrencyLimit), - - // 3) Collect results from all uploads (like forkJoin, but respects concurrency) - toArray(), - // 4) Finalize if all parts succeeded - switchMap(() => - this.finalizeMultipartUpload( - ownerEmail, - datasetName, - filePath, - uploadId, - uploadedParts, - physicalAddress, - false - ) + xhr.open("POST", partUrl); + xhr.setRequestHeader("Content-Type", "application/octet-stream"); + const token = AuthService.getAccessToken(); + if (token) { + xhr.setRequestHeader("Authorization", `Bearer ${token}`); + } + xhr.send(chunk); + return () => { + try { + xhr.abort(); + } catch {} + partProgress.delete(partNumber); + }; + }); + }, + concurrencyLimit ), + toArray(), // wait for all parts + // 3. 
FINISH: notify backend that all parts are done + switchMap(() => { + const finishParams = new HttpParams() + .set("type", "finish") + .set("ownerEmail", ownerEmail) + .set("datasetName", datasetName) + .set("filePath", encodeURIComponent(filePath)); + + const body = { uploadToken }; + + return this.http.post( + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, + body, + { params: finishParams } + ); + }), tap(() => { + const totalTime = getTotalTime(); observer.next({ filePath, percentage: 100, status: "finished", - uploadId: uploadId, - physicalAddress: physicalAddress, + uploadToken, uploadSpeed: 0, estimatedTimeRemaining: 0, - totalTime: getTotalTime(), + totalTime, }); observer.complete(); }), - catchError((error: unknown) => { - // If an error occurred, abort the upload + catchError(error => { + // On error, compute best-effort percentage from bytes we've seen + let totalUploaded = 0; + partProgress.forEach(bytes => { + totalUploaded += bytes; + }); + const percentage = + file.size > 0 ? Math.round((totalUploaded / file.size) * 100) : 0; + observer.next({ filePath, - percentage: Math.round((uploadedParts.length / partCount) * 100), + percentage, status: "aborted", - uploadId: uploadId, - physicalAddress: physicalAddress, + uploadToken, uploadSpeed: 0, estimatedTimeRemaining: 0, totalTime: getTotalTime(), }); - return this.finalizeMultipartUpload( - ownerEmail, - datasetName, - filePath, - uploadId, - uploadedParts, - physicalAddress, - true - ).pipe(switchMap(() => throwError(() => error))); + // Abort on backend + const abortParams = new HttpParams() + .set("type", "abort") + .set("ownerEmail", ownerEmail) + .set("datasetName", datasetName) + .set("filePath", encodeURIComponent(filePath)); + + const body = { uploadToken }; + + return this.http + .post( + `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, + body, + { params: abortParams } + ) + .pipe( + switchMap(() => throwError(() => error)), + catchError(() => throwError(() => error)) + ); }) ); }) ) .subscribe({ - error: (err: unknown) => observer.error(err), + error: err => observer.error(err), }); + return () => subscription.unsubscribe(); }); } - /** - * Initiates a multipart upload and retrieves presigned URLs for each part. - * @param ownerEmail Owner's email - * @param datasetName Dataset Name - * @param filePath File path within the dataset - * @param numParts Number of parts for the multipart upload - */ - private initiateMultipartUpload( - ownerEmail: string, - datasetName: string, - filePath: string, - numParts: number - ): Observable<{ uploadId: string; presignedUrls: string[]; physicalAddress: string }> { - const params = new HttpParams() - .set("type", "init") - .set("ownerEmail", ownerEmail) - .set("datasetName", datasetName) - .set("filePath", encodeURIComponent(filePath)) - .set("numParts", numParts.toString()); - - return this.http.post<{ uploadId: string; presignedUrls: string[]; physicalAddress: string }>( - `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, - {}, - { params } - ); - } - - /** - * Completes or aborts a multipart upload, sending part numbers and ETags to the backend. - */ public finalizeMultipartUpload( ownerEmail: string, datasetName: string, filePath: string, - uploadId: string, - parts: { PartNumber: number; ETag: string }[], - physicalAddress: string, + uploadToken: string, isAbort: boolean ): Observable { const params = new HttpParams() .set("type", isAbort ? 
"abort" : "finish") .set("ownerEmail", ownerEmail) .set("datasetName", datasetName) - .set("filePath", encodeURIComponent(filePath)) - .set("uploadId", uploadId); + .set("filePath", encodeURIComponent(filePath)); return this.http.post( `${AppSettings.getApiEndpoint()}/${DATASET_BASE_URL}/multipart-upload`, - { parts, physicalAddress }, + { uploadToken }, { params } ); } From b44d446d049b971f4ecb30d257960aa3843506c7 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Fri, 5 Dec 2025 00:44:26 -0600 Subject: [PATCH 13/15] Update DatasetResource.scala --- .../apache/texera/service/resource/DatasetResource.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 478a723b3b8..3c89d3b3be7 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -560,10 +560,11 @@ class DatasetResource { } var read = fileStream.read(buf, buffered, buf.length - buffered) + val tmpMaxSize = maxSingleFileUploadBytes while (read != -1) { buffered += read totalBytesRead += read - if (totalBytesRead > maxSingleFileUploadBytes) { + if (totalBytesRead > tmpMaxSize) { throw new WebApplicationException( s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", Response.Status.REQUEST_ENTITY_TOO_LARGE @@ -871,11 +872,12 @@ class DatasetResource { val outStream = conn.getOutputStream val buffer = new Array[Byte](8 * 1024) var bytesRead = partStream.read(buffer) + val tmpMaxSize = maxSingleFileUploadBytes try { while (bytesRead != -1) { val newTotal = session.totalBytes.addAndGet(bytesRead.toLong) - if (newTotal > maxSingleFileUploadBytes) { + if (newTotal > tmpMaxSize) { session.status = "aborted" DatasetResource.uploadSessions.remove(uploadToken) From 24c621cb2d61712ff155479c7179c3409e3c67c3 Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Sat, 6 Dec 2025 19:11:39 -0600 Subject: [PATCH 14/15] db --- .../service/resource/DatasetResource.scala | 627 ++++++++++-------- .../service/user/dataset/dataset.service.ts | 1 - sql/texera_ddl.sql | 35 + 3 files changed, 398 insertions(+), 265 deletions(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 3c89d3b3be7..5e834f9591e 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -28,7 +28,6 @@ import org.apache.texera.amber.core.storage.model.OnDataset import org.apache.texera.amber.core.storage.util.LakeFSStorageClient import org.apache.texera.amber.core.storage.{DocumentFactory, FileResolver} import org.apache.texera.auth.SessionUser -import org.apache.texera.config.DefaultsConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.SqlServer.withTransaction import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum @@ -54,7 +53,6 @@ import org.apache.texera.service.util.S3StorageClient.{ MAXIMUM_NUM_OF_MULTIPART_S3_PARTS, MINIMUM_NUM_OF_MULTIPART_S3_PART } -import org.jooq.impl.DSL import org.jooq.{DSLContext, EnumType} import java.io.{InputStream, OutputStream} @@ -63,13 +61,17 @@ import 
java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} import java.util import java.util.Optional -import java.util.concurrent.atomic.AtomicLong import java.util.zip.{ZipEntry, ZipOutputStream} -import scala.collection.concurrent.TrieMap import scala.collection.mutable.ListBuffer import scala.jdk.CollectionConverters._ import scala.jdk.OptionConverters._ +import org.apache.texera.dao.jooq.generated.tables.DatasetUploadSession.DATASET_UPLOAD_SESSION +import org.apache.texera.dao.jooq.generated.tables.DatasetUploadSessionPart.DATASET_UPLOAD_SESSION_PART +import org.apache.texera.dao.jooq.generated.enums.UploadPartStatusEnum + +import java.util.UUID + object DatasetResource { private val context = SqlServer @@ -93,11 +95,11 @@ object DatasetResource { */ private def put(buf: Array[Byte], len: Int, url: String, partNum: Int): String = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] - conn.setDoOutput(true); + conn.setDoOutput(true) conn.setRequestMethod("PUT") conn.setFixedLengthStreamingMode(len) val out = conn.getOutputStream - out.write(buf, 0, len); + out.write(buf, 0, len) out.close() val code = conn.getResponseCode @@ -173,25 +175,6 @@ object DatasetResource { fileNodes: List[DatasetFileNode], size: Long ) - - /** Case class to hold state of an ongoing multipart upload session */ - private case class SessionState( - token: String, - repoName: String, - did: Int, - uid: Int, - uploadId: String, - filePath: String, - physicalAddress: String, - presignedUrls: Array[String], - var totalBytes: AtomicLong = new AtomicLong(0L), - @volatile var status: String = "ongoing", - var parts: ListBuffer[(Int, String)] = ListBuffer.empty - ) - - /** In-memory map of active upload sessions (uploadToken -> SessionState) */ - private val uploadSessions = TrieMap[String, SessionState]() - } @Produces(Array(MediaType.APPLICATION_JSON, "image/jpeg", "application/pdf")) @@ -201,26 +184,6 @@ class DatasetResource { private val ERR_DATASET_VERSION_NOT_FOUND_MESSAGE = "The version of the dataset not found" private val EXPIRATION_MINUTES = 5 - private val SingleFileUploadMaxSizeKey = "single_file_upload_max_size_mib" - - def singleFileUploadMaxSizeMib: Int = { - val valueOpt = Option( - context - .select(DSL.field("value", classOf[String])) - .from(DSL.table("site_settings")) - .where(DSL.field("key", classOf[String]).eq(SingleFileUploadMaxSizeKey)) - .fetchOne(0, classOf[String]) - ) - - valueOpt - .flatMap(v => scala.util.Try(v.toInt).toOption) - .getOrElse(DefaultsConfig.allDefaults(SingleFileUploadMaxSizeKey).toInt) - } - - /** Maximum allowed single-file upload size in bytes (MiB → bytes). 
*/ - private def maxSingleFileUploadBytes: Long = - singleFileUploadMaxSizeMib.toLong * 1024L * 1024L - /** * Helper function to get the dataset from DB with additional information including user access privilege and owner email */ @@ -546,7 +509,6 @@ class DatasetResource { var buffered = 0 var partNumber = 1 val completedParts = ListBuffer[(Int, String)]() - var totalBytesRead = 0L @inline def flush(): Unit = { if (buffered == 0) return @@ -560,16 +522,8 @@ class DatasetResource { } var read = fileStream.read(buf, buffered, buf.length - buffered) - val tmpMaxSize = maxSingleFileUploadBytes while (read != -1) { buffered += read - totalBytesRead += read - if (totalBytesRead > tmpMaxSize) { - throw new WebApplicationException( - s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } if (buffered == buf.length) flush() // buffer full read = fileStream.read(buf, buffered, buf.length - buffered) } @@ -692,142 +646,17 @@ class DatasetResource { @QueryParam("datasetName") datasetName: String, @QueryParam("filePath") encodedUrl: String, @QueryParam("numParts") numParts: Optional[Integer], - payload: Map[ - String, - Any - ], // Expecting {"uploadToken": "..."} for abort and finish + payload: Map[String, Any], // Expecting {"uploadToken": "..."} for abort and finish @Auth user: SessionUser ): Response = { val uid = user.getUid - withTransaction(context) { ctx => - val dataset = context - .select(DATASET.fields: _*) - .from(DATASET) - .leftJoin(USER) - .on(USER.UID.eq(DATASET.OWNER_UID)) - .where(USER.EMAIL.eq(ownerEmail)) - .and(DATASET.NAME.eq(datasetName)) - .fetchOneInto(classOf[Dataset]) - if (dataset == null || !userHasWriteAccess(ctx, dataset.getDid, uid)) { - throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) - } - - // Decode the file path - val repositoryName = dataset.getRepositoryName - val filePath = URLDecoder.decode(encodedUrl, StandardCharsets.UTF_8.name()) - - operationType.toLowerCase match { - case "init" => - val numPartsValue = numParts.toScala.getOrElse( - throw new BadRequestException("numParts is required for initialization") - ) - - val presign = LakeFSStorageClient.initiatePresignedMultipartUploads( - repositoryName, - filePath, - numPartsValue - ) - val uploadIdStr = presign.getUploadId - val presignedUrlsArr = presign.getPresignedUrls.asScala.toArray.map(_.toString) - val physicalAddr = presign.getPhysicalAddress - - val token = java.util.UUID.randomUUID().toString - - DatasetResource.uploadSessions.put( - token, - SessionState( - token = token, - repoName = dataset.getRepositoryName, - did = dataset.getDid, - uid = uid, - uploadId = uploadIdStr, - filePath = filePath, - physicalAddress = physicalAddr, - presignedUrls = presignedUrlsArr - ) - ) - - Response - .ok( - Map( - "uploadToken" -> token - ) - ) - .build() - - case "finish" => - val tokenValue = payload.get("uploadToken").map(_.asInstanceOf[String]).getOrElse { - throw new BadRequestException("uploadToken is required for completion") - } - val session = DatasetResource.uploadSessions.getOrElse( - tokenValue, { - throw new NotFoundException("Upload session not found or already finalized") - } - ) - - if (user.getUid != session.uid) - throw new ForbiddenException("User has no access to this upload session") - - DatasetResource.uploadSessions.remove(tokenValue) - val objectStats = LakeFSStorageClient.completePresignedMultipartUploads( - repositoryName, - session.filePath, - session.uploadId, - session.parts.toList, - 
session.physicalAddress - ) - val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L) - if (sizeBytes > maxSingleFileUploadBytes) { - try { - LakeFSStorageClient.resetObjectUploadOrDeletion(repositoryName, filePath) - } catch { - case _: Exception => - } - throw new WebApplicationException( - s"File exceeds maximum allowed size of " + - s"${singleFileUploadMaxSizeMib} MiB. Upload has been rolled back.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } - Response - .ok( - Map( - "message" -> "Multipart upload completed successfully", - "filePath" -> objectStats.getPath - ) - ) - .build() - - case "abort" => - val tokenValue = payload - .get("uploadToken") - .map(_.asInstanceOf[String]) - .getOrElse { - throw new BadRequestException("uploadToken is required for abortion") - } - val session = DatasetResource.uploadSessions.getOrElse( - tokenValue, { - throw new NotFoundException("Upload session not found or already finished") - } - ) - - if (user.getUid != session.uid) { - throw new ForbiddenException("User has no access to this upload session") - } - - DatasetResource.uploadSessions.remove(tokenValue) - - LakeFSStorageClient.abortPresignedMultipartUploads( - session.repoName, - session.filePath, - session.uploadId, - session.physicalAddress - ) - Response.ok(Map("message" -> "Multipart upload aborted successfully")).build() - case _ => - throw new BadRequestException("Invalid type parameter. Use 'init', 'finish', or 'abort'.") - } + operationType.toLowerCase match { + case "init" => initMultipartUpload(ownerEmail, datasetName, encodedUrl, numParts, uid) + case "finish" => finishMultipartUpload(payload, uid) + case "abort" => abortMultipartUpload(payload, uid) + case _ => + throw new BadRequestException("Invalid type parameter. 
Use 'init', 'finish', or 'abort'.") } } @@ -842,88 +671,55 @@ class DatasetResource { @Context headers: HttpHeaders, @Auth user: SessionUser ): Response = { - val sessionOpt = DatasetResource.uploadSessions.get(uploadToken) - if (sessionOpt.isEmpty) { - throw new NotFoundException("Upload session not found or expired") - } - val session = sessionOpt.get - if (user.getUid != session.uid) - throw new ForbiddenException("User has no access to this upload session") - - if (session.status == "aborted") - throw new WebApplicationException("Upload session already aborted", Response.Status.GONE) - - if (partNumber < 1 || partNumber > session.presignedUrls.length) - throw new BadRequestException("Invalid partNumber") + if (uploadToken == null || uploadToken.isEmpty) + throw new BadRequestException("token is required") - val presignedUrl = session.presignedUrls(partNumber - 1) + if (partNumber < 1) + throw new BadRequestException("partNumber must be >= 1") - val conn = new URL(presignedUrl).openConnection().asInstanceOf[HttpURLConnection] - conn.setDoOutput(true) - conn.setRequestMethod("PUT") - - // Don't trust Content-Length for enforcement, we only use it to hint streaming mode if present - Option(headers.getHeaderString(HttpHeaders.CONTENT_LENGTH)) - .flatMap(s => scala.util.Try(s.toLong).toOption) - .foreach(len => conn.setFixedLengthStreamingMode(len)) - conn.setRequestProperty("Content-Type", "application/octet-stream") - - val outStream = conn.getOutputStream - val buffer = new Array[Byte](8 * 1024) - var bytesRead = partStream.read(buffer) - val tmpMaxSize = maxSingleFileUploadBytes - - try { - while (bytesRead != -1) { - val newTotal = session.totalBytes.addAndGet(bytesRead.toLong) - if (newTotal > tmpMaxSize) { - session.status = "aborted" - DatasetResource.uploadSessions.remove(uploadToken) - - // Close streams before aborting - try outStream.close() - catch { case _: Exception => () } - try partStream.close() - catch { case _: Exception => () } - - LakeFSStorageClient.abortPresignedMultipartUploads( - session.repoName, - session.filePath, - session.uploadId, - session.physicalAddress - ) - - throw new WebApplicationException( - s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB. 
" + - "Upload has been rolled back.", - Response.Status.REQUEST_ENTITY_TOO_LARGE - ) - } - outStream.write(buffer, 0, bytesRead) - bytesRead = partStream.read(buffer) - } - } finally { - try outStream.close() - catch { case _: Exception => () } - try partStream.close() - catch { case _: Exception => () } - } + val tokenUuid = parseUploadTokenOrBadRequest(uploadToken, "token") - val code = conn.getResponseCode - if (code != HttpURLConnection.HTTP_OK && code != HttpURLConnection.HTTP_CREATED) { - conn.disconnect() - throw new RuntimeException(s"Part $partNumber upload failed (HTTP $code)") + // -------- Step 1: lock the part row and move to UPLOADING -------- + val presignedUrl = withTransaction(context) { ctx => + val partRecord = lockPartForUploadOrFail(ctx, tokenUuid, partNumber, user.getUid) + partRecord.getPresignedUrl } - val eTag = Option(conn.getHeaderField("ETag")).map(_.replace("\"", "")).getOrElse("") - conn.disconnect() - - session.synchronized { - if (session.status == "aborted") { - throw new WebApplicationException("Upload session already aborted", Response.Status.GONE) + // -------- Step 2: stream bytes to S3 -------- + val (eTag, bytesSent) = + try uploadPartToPresignedUrl(presignedUrl, partStream, headers, partNumber) + catch { + case e: Exception => + // revert status back to PENDING on failure + withTransaction(context) { ctx => + ctx + .update(DATASET_UPLOAD_SESSION_PART) + .set(DATASET_UPLOAD_SESSION_PART.STATUS, UploadPartStatusEnum.PENDING) + .set(DATASET_UPLOAD_SESSION_PART.UPDATED_AT, java.time.OffsetDateTime.now()) + .where( + DATASET_UPLOAD_SESSION_PART.UPLOAD_TOKEN + .eq(tokenUuid) + .and(DATASET_UPLOAD_SESSION_PART.PART_NUMBER.eq(partNumber)) + ) + .execute() + } + throw e } - session.parts += ((partNumber, eTag)) + + // -------- Step 3: mark as COMPLETED and store ETag-------- + withTransaction(context) { ctx => + ctx + .update(DATASET_UPLOAD_SESSION_PART) + .set(DATASET_UPLOAD_SESSION_PART.STATUS, UploadPartStatusEnum.COMPLETED) + .set(DATASET_UPLOAD_SESSION_PART.ETAG, eTag) + .set(DATASET_UPLOAD_SESSION_PART.UPDATED_AT, java.time.OffsetDateTime.now()) + .where( + DATASET_UPLOAD_SESSION_PART.UPLOAD_TOKEN + .eq(tokenUuid) + .and(DATASET_UPLOAD_SESSION_PART.PART_NUMBER.eq(partNumber)) + ) + .execute() } Response.ok().build() @@ -1169,9 +965,8 @@ class DatasetResource { val ownerNode = DatasetFileNode .fromLakeFSRepositoryCommittedObjects( Map( - (user.getEmail, dataset.getName, latestVersion.getName) -> - LakeFSStorageClient - .retrieveObjectsOfVersion(dataset.getRepositoryName, latestVersion.getVersionHash) + (user.getEmail, dataset.getName, latestVersion.getName) -> LakeFSStorageClient + .retrieveObjectsOfVersion(dataset.getRepositoryName, latestVersion.getVersionHash) ) ) .head @@ -1527,4 +1322,308 @@ class DatasetResource { Right(response) } } + + // === Multipart helpers === + + private def initMultipartUpload( + ownerEmail: String, + datasetName: String, + encodedUrl: String, + numParts: Optional[Integer], + uid: Int + ): Response = { + withTransaction(context) { ctx => + val dataset = ctx + .select(DATASET.fields: _*) + .from(DATASET) + .leftJoin(USER) + .on(USER.UID.eq(DATASET.OWNER_UID)) + .where(USER.EMAIL.eq(ownerEmail)) + .and(DATASET.NAME.eq(datasetName)) + .fetchOneInto(classOf[Dataset]) + + if (dataset == null || !userHasWriteAccess(ctx, dataset.getDid, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + val repositoryName = dataset.getRepositoryName + val filePath = URLDecoder.decode(encodedUrl, 
StandardCharsets.UTF_8.name()) + + val numPartsValue = numParts.toScala.getOrElse( + throw new BadRequestException("numParts is required for initialization") + ) + + val presign = LakeFSStorageClient.initiatePresignedMultipartUploads( + repositoryName, + filePath, + numPartsValue + ) + val uploadIdStr = presign.getUploadId + val presignedUrls = presign.getPresignedUrls.asScala.toArray.map(_.toString) + val physicalAddr = presign.getPhysicalAddress + + val token = java.util.UUID.randomUUID() + + val sessionRecord = ctx.newRecord(DATASET_UPLOAD_SESSION) + sessionRecord.setUploadToken(token) + sessionRecord.setDid(dataset.getDid) + sessionRecord.setUid(uid) + sessionRecord.setFilePath(filePath) + sessionRecord.setUploadId(uploadIdStr) + sessionRecord.setPhysicalAddress(physicalAddr) + sessionRecord.store() + + presignedUrls.zipWithIndex.foreach { + case (url, idx) => + val partRecord = ctx.newRecord(DATASET_UPLOAD_SESSION_PART) + partRecord.setUploadToken(token) + partRecord.setPartNumber(idx + 1) + partRecord.setStatus(UploadPartStatusEnum.PENDING) + partRecord.setPresignedUrl(url) + partRecord.store() + } + + Response + .ok( + Map( + "uploadToken" -> token.toString + ) + ) + .build() + } + } + private def finishMultipartUpload( + payload: Map[String, Any], + uid: Int + ): Response = { + val tokenUuid = extractUploadTokenFromPayload(payload, "completion") + + withTransaction(context) { ctx => + val (session, dataset) = loadSessionAndDatasetOrFail(ctx, tokenUuid, uid) + + val partRecords = ctx + .selectFrom(DATASET_UPLOAD_SESSION_PART) + .where( + DATASET_UPLOAD_SESSION_PART.UPLOAD_TOKEN + .eq(tokenUuid) + .and(DATASET_UPLOAD_SESSION_PART.STATUS.eq(UploadPartStatusEnum.COMPLETED)) + ) + .orderBy(DATASET_UPLOAD_SESSION_PART.PART_NUMBER.asc()) + .fetch() + + if (partRecords.isEmpty) { + throw new BadRequestException("No completed parts for this upload") + } + + val partsList: List[(Int, String)] = + partRecords.asScala.toList.map { r => + val etag = Option(r.getEtag).getOrElse { + throw new WebApplicationException( + s"Missing ETag for part ${r.getPartNumber}", + Response.Status.INTERNAL_SERVER_ERROR + ) + } + (r.getPartNumber.intValue(), etag) + } + + // TODO: later enforce contiguity & total size here. 
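+      // Editor's sketch (assumes the maxSingleFileUploadBytes / singleFileUploadMaxSizeMib
+      // helpers from the earlier revision of this resource are reintroduced): the size limit
+      // could be enforced right after completePresignedMultipartUploads below, mirroring the
+      // pre-refactor finish handler:
+      //   val sizeBytes = Option(objectStats.getSizeBytes).map(_.longValue()).getOrElse(0L)
+      //   if (sizeBytes > maxSingleFileUploadBytes) {
+      //     LakeFSStorageClient.resetObjectUploadOrDeletion(dataset.getRepositoryName, session.getFilePath)
+      //     throw new WebApplicationException(
+      //       s"File exceeds maximum allowed size of ${singleFileUploadMaxSizeMib} MiB. Upload has been rolled back.",
+      //       Response.Status.REQUEST_ENTITY_TOO_LARGE
+      //     )
+      //   }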
+ + val objectStats = LakeFSStorageClient.completePresignedMultipartUploads( + dataset.getRepositoryName, + session.getFilePath, + session.getUploadId, + partsList, + session.getPhysicalAddress + ) + + ctx + .deleteFrom(DATASET_UPLOAD_SESSION) + .where(DATASET_UPLOAD_SESSION.UPLOAD_TOKEN.eq(tokenUuid)) + .execute() + + Response + .ok( + Map( + "message" -> "Multipart upload completed successfully", + "filePath" -> objectStats.getPath + ) + ) + .build() + } + } + private def abortMultipartUpload( + payload: Map[String, Any], + uid: Int + ): Response = { + val tokenUuid = extractUploadTokenFromPayload(payload, "abortion") + + withTransaction(context) { ctx => + val (session, dataset) = loadSessionAndDatasetOrFail(ctx, tokenUuid, uid) + + LakeFSStorageClient.abortPresignedMultipartUploads( + dataset.getRepositoryName, + session.getFilePath, + session.getUploadId, + session.getPhysicalAddress + ) + + ctx + .deleteFrom(DATASET_UPLOAD_SESSION) + .where(DATASET_UPLOAD_SESSION.UPLOAD_TOKEN.eq(tokenUuid)) + .execute() + + Response.ok(Map("message" -> "Multipart upload aborted successfully")).build() + } + } + + private def parseUploadTokenOrBadRequest(raw: String, fieldName: String): UUID = { + try UUID.fromString(raw) + catch { + case _: IllegalArgumentException => + throw new BadRequestException(s"Invalid $fieldName format") + } + } + + private def extractUploadTokenFromPayload( + payload: Map[String, Any], + opName: String + ): UUID = { + val tokenValueStr = payload + .get("uploadToken") + .map(_.asInstanceOf[String]) + .getOrElse { + throw new BadRequestException(s"uploadToken is required for $opName") + } + + parseUploadTokenOrBadRequest(tokenValueStr, "uploadToken") + } + + private def loadSessionAndDatasetOrFail( + ctx: DSLContext, + tokenUuid: UUID, + uid: Int + ) = { + val session = ctx + .selectFrom(DATASET_UPLOAD_SESSION) + .where(DATASET_UPLOAD_SESSION.UPLOAD_TOKEN.eq(tokenUuid)) + .fetchOne() + + if (session == null) { + throw new NotFoundException("Upload session not found or already finalized") + } + + if (session.getUid != uid) { + throw new ForbiddenException("User has no access to this upload session") + } + + val dataset = ctx + .selectFrom(DATASET) + .where(DATASET.DID.eq(session.getDid)) + .fetchOne() + + if (dataset == null || !userHasWriteAccess(ctx, dataset.getDid, uid)) { + throw new ForbiddenException(ERR_USER_HAS_NO_ACCESS_TO_DATASET_MESSAGE) + } + + (session, dataset) + } + + private def lockPartForUploadOrFail( + ctx: DSLContext, + tokenUuid: UUID, + partNumber: Int, + uid: Int + ) = { + val session = ctx + .selectFrom(DATASET_UPLOAD_SESSION) + .where(DATASET_UPLOAD_SESSION.UPLOAD_TOKEN.eq(tokenUuid)) + .fetchOne() + + if (session == null) { + throw new NotFoundException("Upload session not found or expired") + } + + if (session.getUid != uid) { + throw new ForbiddenException("User has no access to this upload session") + } + + val partRecord = ctx + .selectFrom(DATASET_UPLOAD_SESSION_PART) + .where( + DATASET_UPLOAD_SESSION_PART.UPLOAD_TOKEN + .eq(tokenUuid) + .and(DATASET_UPLOAD_SESSION_PART.PART_NUMBER.eq(partNumber)) + ) + .forUpdate() + .fetchOne() + + if (partRecord == null) { + throw new BadRequestException("Invalid partNumber") + } + + partRecord.getStatus match { + case UploadPartStatusEnum.COMPLETED => + throw new BadRequestException("This part has already been completed") + + case UploadPartStatusEnum.UPLOADING => + throw new WebApplicationException( + "This part is already being uploaded", + Response.Status.CONFLICT + ) + + case UploadPartStatusEnum.PENDING 
=> + partRecord.setStatus(UploadPartStatusEnum.UPLOADING) + partRecord.setUpdatedAt(java.time.OffsetDateTime.now()) + partRecord.update() + } + + partRecord + } + + private def uploadPartToPresignedUrl( + presignedUrl: String, + partStream: InputStream, + headers: HttpHeaders, + partNumber: Int + ): (String, Long) = { + val conn = new URL(presignedUrl).openConnection().asInstanceOf[HttpURLConnection] + conn.setDoOutput(true) + conn.setRequestMethod("PUT") + + // Only a hint for streaming, not trust boundary + Option(headers.getHeaderString(HttpHeaders.CONTENT_LENGTH)) + .flatMap(s => scala.util.Try(s.toLong).toOption) + .foreach(len => conn.setFixedLengthStreamingMode(len)) + + conn.setRequestProperty("Content-Type", "application/octet-stream") + + val outStream = conn.getOutputStream + val buffer = new Array[Byte](8 * 1024) + var bytesRead = partStream.read(buffer) + var sent: Long = 0L + + try { + while (bytesRead != -1) { + outStream.write(buffer, 0, bytesRead) + sent += bytesRead + bytesRead = partStream.read(buffer) + } + } finally { + try outStream.close() + catch { case _: Exception => () } + try partStream.close() + catch { case _: Exception => () } + } + + val code = conn.getResponseCode + if (code != HttpURLConnection.HTTP_OK && code != HttpURLConnection.HTTP_CREATED) { + conn.disconnect() + throw new RuntimeException(s"Part $partNumber upload failed (HTTP $code)") + } + + val eTag = Option(conn.getHeaderField("ETag")).map(_.replace("\"", "")).getOrElse("") + conn.disconnect() + (eTag, sent) + } + } diff --git a/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts b/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts index a3a55900fd9..eb729f9b1ee 100644 --- a/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts +++ b/frontend/src/app/dashboard/service/user/dataset/dataset.service.ts @@ -350,7 +350,6 @@ export class DatasetService { try { xhr.abort(); } catch {} - partProgress.delete(partNumber); }; }); }, diff --git a/sql/texera_ddl.sql b/sql/texera_ddl.sql index 7b0f9b9063d..b0ea0162f5e 100644 --- a/sql/texera_ddl.sql +++ b/sql/texera_ddl.sql @@ -58,6 +58,8 @@ DROP TABLE IF EXISTS workflow_version CASCADE; DROP TABLE IF EXISTS project CASCADE; DROP TABLE IF EXISTS workflow_of_project CASCADE; DROP TABLE IF EXISTS workflow_executions CASCADE; +DROP TABLE IF EXISTS dataset_upload_session_part CASCADE; +DROP TABLE IF EXISTS dataset_upload_session CASCADE; DROP TABLE IF EXISTS dataset CASCADE; DROP TABLE IF EXISTS dataset_user_access CASCADE; DROP TABLE IF EXISTS dataset_version CASCADE; @@ -79,11 +81,13 @@ DROP TABLE IF EXISTS computing_unit_user_access CASCADE; DROP TYPE IF EXISTS user_role_enum CASCADE; DROP TYPE IF EXISTS privilege_enum CASCADE; DROP TYPE IF EXISTS action_enum CASCADE; +DROP TYPE IF EXISTS upload_part_status_enum CASCADE; CREATE TYPE user_role_enum AS ENUM ('INACTIVE', 'RESTRICTED', 'REGULAR', 'ADMIN'); CREATE TYPE action_enum AS ENUM ('like', 'unlike', 'view', 'clone'); CREATE TYPE privilege_enum AS ENUM ('NONE', 'READ', 'WRITE'); CREATE TYPE workflow_computing_unit_type_enum AS ENUM ('local', 'kubernetes'); +CREATE TYPE upload_part_status_enum AS ENUM ('PENDING', 'UPLOADING', 'COMPLETED'); -- ============================================ -- 5. 
Create tables @@ -274,6 +278,37 @@ CREATE TABLE IF NOT EXISTS dataset_version FOREIGN KEY (did) REFERENCES dataset(did) ON DELETE CASCADE ); +-- dataset_upload_session: tracks one multipart upload session for a single file +CREATE TABLE IF NOT EXISTS dataset_upload_session +( + upload_token UUID PRIMARY KEY, + did INT NOT NULL, + uid INT NOT NULL, + file_path TEXT NOT NULL, + upload_id VARCHAR(256) NOT NULL, + physical_address TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + + FOREIGN KEY (did) REFERENCES dataset(did) ON DELETE CASCADE, + FOREIGN KEY (uid) REFERENCES "user"(uid) ON DELETE CASCADE +); + +-- dataset_upload_session_part: one row per (session, partNumber) +CREATE TABLE IF NOT EXISTS dataset_upload_session_part +( + upload_token UUID NOT NULL, + part_number INT NOT NULL, + status upload_part_status_enum NOT NULL DEFAULT 'PENDING', + etag VARCHAR(256), + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + presigned_url TEXT NOT NULL, + + PRIMARY KEY (upload_token, part_number), + FOREIGN KEY (upload_token) REFERENCES dataset_upload_session(upload_token) ON DELETE CASCADE +); + -- operator_executions (modified to match MySQL: no separate primary key; added console_messages_uri) CREATE TABLE IF NOT EXISTS operator_executions ( From 80d379ef9864c0c60488c3d5657d0ea2a903c31b Mon Sep 17 00:00:00 2001 From: Carlos Ernesto Alvarez Berumen Date: Sat, 6 Dec 2025 19:42:20 -0600 Subject: [PATCH 15/15] Update DatasetResource.scala --- .../org/apache/texera/service/resource/DatasetResource.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala index 5e834f9591e..f9038010b42 100644 --- a/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala +++ b/file-service/src/main/scala/org/apache/texera/service/resource/DatasetResource.scala @@ -1426,7 +1426,7 @@ class DatasetResource { (r.getPartNumber.intValue(), etag) } - // TODO: later enforce contiguity & total size here. + // TODO: later enforce max total size here. val objectStats = LakeFSStorageClient.completePresignedMultipartUploads( dataset.getRepositoryName,