From d7c9e55889de42377d946d7887ef0da0151c7f3d Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Mon, 20 Apr 2026 18:40:23 -0700 Subject: [PATCH 1/4] fix: substring with negative start index --- .../expressions/string/substring.sql | 9 +++++++ .../comet/CometStringExpressionSuite.scala | 24 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/spark/src/test/resources/sql-tests/expressions/string/substring.sql b/spark/src/test/resources/sql-tests/expressions/string/substring.sql index 4e6217fd5f..fb15a830cd 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/substring.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/substring.sql @@ -39,6 +39,15 @@ SELECT substring(s, 1, -1) FROM test_substring query SELECT substring(s, 100) FROM test_substring +query +SELECT substring(s, -2, 3) FROM test_substring + +query +SELECT substring(s, -10, 3) FROM test_substring + +query +SELECT substring(s, -300, 3) FROM test_substring + -- literal + literal + literal query ignore(https://github.com/apache/datafusion-comet/issues/3337) SELECT substring('hello world', 1, 5), substring('hello world', -3), substring('', 1, 5), substring(NULL, 1, 5) diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 121d7f7d5a..247df5d761 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -478,4 +478,28 @@ class CometStringExpressionSuite extends CometTestBase { } } + test("substring") { + val data = Seq(("hello world",), ("",), (null,), ("abc",)) + withParquetTable(data, "tbl") { + // positive start + checkSparkAnswerAndOperator("SELECT substring(_1, 1, 5) FROM tbl") + // negative start, no length + checkSparkAnswerAndOperator("SELECT substring(_1, -3) FROM tbl") + // zero start + checkSparkAnswerAndOperator("SELECT substring(_1, 0, 3) FROM tbl") + // zero length + checkSparkAnswerAndOperator("SELECT substring(_1, 1, 0) FROM tbl") + // negative length + checkSparkAnswerAndOperator("SELECT substring(_1, 1, -1) FROM tbl") + // start beyond string length + checkSparkAnswerAndOperator("SELECT substring(_1, 100) FROM tbl") + // negative start with length + checkSparkAnswerAndOperator("SELECT substring(_1, -2, 3) FROM tbl") + // negative start beyond string length with length + checkSparkAnswerAndOperator("SELECT substring(_1, -10, 3) FROM tbl") + // large negative start with length + checkSparkAnswerAndOperator("SELECT substring(_1, -300, 3) FROM tbl") + } + } + } From e6c517a10883a5928d4384de528919cd932e440a Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Tue, 21 Apr 2026 00:42:36 -0700 Subject: [PATCH 2/4] fix: substring with negative start index --- .../spark-expr/src/string_funcs/substring.rs | 64 ++++++++++++++++++- .../comet/CometStringExpressionSuite.scala | 2 +- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/native/spark-expr/src/string_funcs/substring.rs b/native/spark-expr/src/string_funcs/substring.rs index e6f11fc39a..246d8c32dc 100644 --- a/native/spark-expr/src/string_funcs/substring.rs +++ b/native/spark-expr/src/string_funcs/substring.rs @@ -18,7 +18,8 @@ #![allow(deprecated)] use crate::kernels::strings::substring; -use arrow::datatypes::{DataType, Schema}; +use arrow::array::{as_dictionary_array, as_largestring_array, as_string_array, Array, ArrayRef}; +use arrow::datatypes::{DataType, Int32Type, Schema}; use arrow::record_batch::RecordBatch; use datafusion::common::DataFusionError; use datafusion::logical_expr::ColumnarValue; @@ -88,8 +89,14 @@ impl PhysicalExpr for SubstringExpr { let arg = self.child.evaluate(batch)?; match arg { ColumnarValue::Array(array) => { + // Spark returns empty when negative start exceeds string length. + // Arrow clamps to 0 instead, so we must fix up per-element. + let array = if self.start < 0 { + clamp_negative_start(&array, self.start)? + } else { + array + }; let result = substring(&array, self.start, self.len)?; - Ok(ColumnarValue::Array(result)) } _ => Err(DataFusionError::Execution( @@ -113,3 +120,56 @@ impl PhysicalExpr for SubstringExpr { ))) } } + +/// For negative start, Spark returns empty when abs(start) > string length. +/// Arrow's substring_by_char clamps to 0 instead. This function replaces +/// such strings with "" so the subsequent Arrow substring returns "". +fn clamp_negative_start(array: &ArrayRef, start: i64) -> datafusion::common::Result { + use arrow::array::{DictionaryArray, GenericStringBuilder}; + + let abs_start = start.unsigned_abs() as usize; + + match array.data_type() { + DataType::Utf8 => { + let str_array = as_string_array(array); + let mut builder = GenericStringBuilder::::new(); + for i in 0..str_array.len() { + if str_array.is_null(i) { + builder.append_null(); + } else { + let val = str_array.value(i); + if val.chars().count() < abs_start { + builder.append_value(""); + } else { + builder.append_value(val); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + } + DataType::LargeUtf8 => { + let str_array = as_largestring_array(array); + let mut builder = GenericStringBuilder::::new(); + for i in 0..str_array.len() { + if str_array.is_null(i) { + builder.append_null(); + } else { + let val = str_array.value(i); + if val.chars().count() < abs_start { + builder.append_value(""); + } else { + builder.append_value(val); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + } + DataType::Dictionary(_, _) => { + let dict = as_dictionary_array::(array); + let values = clamp_negative_start(dict.values(), start)?; + let result = DictionaryArray::try_new(dict.keys().clone(), values)?; + Ok(Arc::new(result) as ArrayRef) + } + _ => Ok(Arc::clone(array)), + } +} diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 247df5d761..b089ac3ce7 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -479,7 +479,7 @@ class CometStringExpressionSuite extends CometTestBase { } test("substring") { - val data = Seq(("hello world",), ("",), (null,), ("abc",)) + val data = Seq(("hello world", ""), ("", ""), (null, ""), ("abc", "")) withParquetTable(data, "tbl") { // positive start checkSparkAnswerAndOperator("SELECT substring(_1, 1, 5) FROM tbl") From 363b730f12165c76271bc16f71bef1a9ce09c5c7 Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Tue, 21 Apr 2026 01:09:29 -0700 Subject: [PATCH 3/4] fix: substring with negative start index --- .../comet/CometStringExpressionSuite.scala | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index b089ac3ce7..ef3658076a 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -502,4 +502,132 @@ class CometStringExpressionSuite extends CometTestBase { } } + test("substring - negative start boundary cases") { + // "abc" has length 3, so -3 means start at first char, -4 exceeds length + val data = Seq(("abc", ""), ("a", ""), ("ab", ""), ("", ""), (null, "")) + withParquetTable(data, "tbl") { + // abs(start) == string length exactly (boundary: should return from first char) + checkSparkAnswerAndOperator("SELECT substring(_1, -3, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -3) FROM tbl") + // abs(start) == length + 1 (one past boundary: should return empty) + checkSparkAnswerAndOperator("SELECT substring(_1, -4, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -4) FROM tbl") + // abs(start) == length - 1 (one before boundary) + checkSparkAnswerAndOperator("SELECT substring(_1, -2, 5) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -2) FROM tbl") + // -1: last character + checkSparkAnswerAndOperator("SELECT substring(_1, -1, 1) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -1) FROM tbl") + // -1 with length exceeding remaining chars + checkSparkAnswerAndOperator("SELECT substring(_1, -1, 100) FROM tbl") + } + } + + test("substring - negative start with zero and negative length") { + val data = Seq(("hello", ""), ("ab", ""), ("", ""), (null, "")) + withParquetTable(data, "tbl") { + // negative start + zero length + checkSparkAnswerAndOperator("SELECT substring(_1, -3, 0) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -100, 0) FROM tbl") + // negative start + negative length + checkSparkAnswerAndOperator("SELECT substring(_1, -3, -1) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -1, -5) FROM tbl") + // negative start exceeding length + zero length + checkSparkAnswerAndOperator("SELECT substring(_1, -10, 0) FROM tbl") + // negative start exceeding length + negative length + checkSparkAnswerAndOperator("SELECT substring(_1, -10, -1) FROM tbl") + } + } + + test("substring - single character and empty strings") { + val data = Seq(("x", ""), ("", ""), (null, "")) + withParquetTable(data, "tbl") { + for (start <- Seq(-2, -1, 0, 1, 2)) { + for (len <- Seq(0, 1, 5)) { + checkSparkAnswerAndOperator(s"SELECT substring(_1, $start, $len) FROM tbl") + } + // without explicit length + checkSparkAnswerAndOperator(s"SELECT substring(_1, $start) FROM tbl") + } + } + } + + test("substring - unicode multi-byte characters") { + // scalastyle:off + val data = Seq( + ("苹果手机", ""), // 4 Chinese characters (3 bytes each in UTF-8) + ("café", ""), // combining accent + ("😀🎉🔥", ""), // emoji (4 bytes each in UTF-8) + ("aé苹😀", ""), // mixed: ASCII + 2-byte + 3-byte + 4-byte + ("", ""), + (null, "")) + // scalastyle:on + withParquetTable(data, "tbl") { + // positive start into multi-byte + checkSparkAnswerAndOperator("SELECT substring(_1, 2, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, 1, 1) FROM tbl") + // negative start with multi-byte + checkSparkAnswerAndOperator("SELECT substring(_1, -2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -2, 1) FROM tbl") + // negative start exceeding multi-byte string length + checkSparkAnswerAndOperator("SELECT substring(_1, -10, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -10) FROM tbl") + // abs(start) == char length boundary for 4-char string + checkSparkAnswerAndOperator("SELECT substring(_1, -4, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -5, 2) FROM tbl") + // extract entire string + checkSparkAnswerAndOperator("SELECT substring(_1, 1, 100) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, 1) FROM tbl") + } + } + + test("substring - large start and length values") { + val data = Seq(("hello world", ""), ("abc", ""), ("", ""), (null, "")) + withParquetTable(data, "tbl") { + checkSparkAnswerAndOperator( + s"SELECT substring(_1, ${Int.MaxValue}, 5) FROM tbl") + checkSparkAnswerAndOperator( + s"SELECT substring(_1, 1, ${Int.MaxValue}) FROM tbl") + checkSparkAnswerAndOperator( + s"SELECT substring(_1, ${Int.MinValue + 1}, 5) FROM tbl") + checkSparkAnswerAndOperator( + s"SELECT substring(_1, ${Int.MinValue + 1}) FROM tbl") + checkSparkAnswerAndOperator( + s"SELECT substring(_1, ${Int.MaxValue}, ${Int.MaxValue}) FROM tbl") + } + } + + test("substring - dictionary encoded strings") { + // repeated values to trigger dictionary encoding + val data = (0 until 1000).map { i => + val s = i % 5 match { + case 0 => "hello" + case 1 => "ab" + case 2 => "" + case 3 => null + case 4 => "world!" + } + Tuple1(s) + } + withSQLConf("parquet.enable.dictionary" -> "true") { + withParquetTable(data, "tbl") { + // positive start + checkSparkAnswerAndOperator("SELECT substring(_1, 2, 3) FROM tbl") + // negative start within bounds + checkSparkAnswerAndOperator("SELECT substring(_1, -3, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -3) FROM tbl") + // negative start exceeding length for some values + checkSparkAnswerAndOperator("SELECT substring(_1, -4, 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -4) FROM tbl") + // negative start exceeding all string lengths + checkSparkAnswerAndOperator("SELECT substring(_1, -100, 3) FROM tbl") + checkSparkAnswerAndOperator("SELECT substring(_1, -100) FROM tbl") + // zero start + checkSparkAnswerAndOperator("SELECT substring(_1, 0, 3) FROM tbl") + // -1 last char + checkSparkAnswerAndOperator("SELECT substring(_1, -1, 1) FROM tbl") + } + } + } + } From eb3b752427fae4b9f66edfee049d50575ea96dad Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Tue, 21 Apr 2026 02:07:32 -0700 Subject: [PATCH 4/4] fix: substring with negative start index --- .../spark-expr/src/string_funcs/substring.rs | 58 +++++++++++-------- .../comet/CometStringExpressionSuite.scala | 12 ++-- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/native/spark-expr/src/string_funcs/substring.rs b/native/spark-expr/src/string_funcs/substring.rs index 246d8c32dc..fc5cdd96fc 100644 --- a/native/spark-expr/src/string_funcs/substring.rs +++ b/native/spark-expr/src/string_funcs/substring.rs @@ -89,14 +89,14 @@ impl PhysicalExpr for SubstringExpr { let arg = self.child.evaluate(batch)?; match arg { ColumnarValue::Array(array) => { - // Spark returns empty when negative start exceeds string length. - // Arrow clamps to 0 instead, so we must fix up per-element. - let array = if self.start < 0 { - clamp_negative_start(&array, self.start)? + let result = if self.start < 0 { + // Spark and Arrow differ for negative start: Arrow clamps + // start to 0 then takes `len` chars, but Spark computes + // end = unclamped_start + len, then clamps both independently. + spark_substring_negative_start(&array, self.start, self.len)? } else { - array + substring(&array, self.start, self.len)? }; - let result = substring(&array, self.start, self.len)?; Ok(ColumnarValue::Array(result)) } _ => Err(DataFusionError::Execution( @@ -121,14 +121,16 @@ impl PhysicalExpr for SubstringExpr { } } -/// For negative start, Spark returns empty when abs(start) > string length. -/// Arrow's substring_by_char clamps to 0 instead. This function replaces -/// such strings with "" so the subsequent Arrow substring returns "". -fn clamp_negative_start(array: &ArrayRef, start: i64) -> datafusion::common::Result { +/// Implement Spark's substring semantics for negative start positions. +/// Spark: start = numChars + pos, end = start + len, clamp both, empty if start >= end. +/// Arrow: start = max(0, numChars + pos), take len chars — differs when start is clamped. +fn spark_substring_negative_start( + array: &ArrayRef, + start: i64, + len: u64, +) -> datafusion::common::Result { use arrow::array::{DictionaryArray, GenericStringBuilder}; - let abs_start = start.unsigned_abs() as usize; - match array.data_type() { DataType::Utf8 => { let str_array = as_string_array(array); @@ -137,12 +139,7 @@ fn clamp_negative_start(array: &ArrayRef, start: i64) -> datafusion::common::Res if str_array.is_null(i) { builder.append_null(); } else { - let val = str_array.value(i); - if val.chars().count() < abs_start { - builder.append_value(""); - } else { - builder.append_value(val); - } + builder.append_value(spark_substr_negative(str_array.value(i), start, len)); } } Ok(Arc::new(builder.finish()) as ArrayRef) @@ -154,22 +151,33 @@ fn clamp_negative_start(array: &ArrayRef, start: i64) -> datafusion::common::Res if str_array.is_null(i) { builder.append_null(); } else { - let val = str_array.value(i); - if val.chars().count() < abs_start { - builder.append_value(""); - } else { - builder.append_value(val); - } + builder.append_value(spark_substr_negative(str_array.value(i), start, len)); } } Ok(Arc::new(builder.finish()) as ArrayRef) } DataType::Dictionary(_, _) => { let dict = as_dictionary_array::(array); - let values = clamp_negative_start(dict.values(), start)?; + let values = spark_substring_negative_start(dict.values(), start, len)?; let result = DictionaryArray::try_new(dict.keys().clone(), values)?; Ok(Arc::new(result) as ArrayRef) } _ => Ok(Arc::clone(array)), } } + +fn spark_substr_negative(s: &str, pos: i64, len: u64) -> String { + let num_chars = s.chars().count() as i64; + let start = num_chars + pos; + let end = start.saturating_add(len as i64).min(num_chars); + let start = start.max(0); + + if start >= end { + return String::new(); + } + + s.chars() + .skip(start as usize) + .take((end - start) as usize) + .collect() +} diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index ef3658076a..a53379e1d1 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -584,14 +584,10 @@ class CometStringExpressionSuite extends CometTestBase { test("substring - large start and length values") { val data = Seq(("hello world", ""), ("abc", ""), ("", ""), (null, "")) withParquetTable(data, "tbl") { - checkSparkAnswerAndOperator( - s"SELECT substring(_1, ${Int.MaxValue}, 5) FROM tbl") - checkSparkAnswerAndOperator( - s"SELECT substring(_1, 1, ${Int.MaxValue}) FROM tbl") - checkSparkAnswerAndOperator( - s"SELECT substring(_1, ${Int.MinValue + 1}, 5) FROM tbl") - checkSparkAnswerAndOperator( - s"SELECT substring(_1, ${Int.MinValue + 1}) FROM tbl") + checkSparkAnswerAndOperator(s"SELECT substring(_1, ${Int.MaxValue}, 5) FROM tbl") + checkSparkAnswerAndOperator(s"SELECT substring(_1, 1, ${Int.MaxValue}) FROM tbl") + checkSparkAnswerAndOperator(s"SELECT substring(_1, ${Int.MinValue + 1}, 5) FROM tbl") + checkSparkAnswerAndOperator(s"SELECT substring(_1, ${Int.MinValue + 1}) FROM tbl") checkSparkAnswerAndOperator( s"SELECT substring(_1, ${Int.MaxValue}, ${Int.MaxValue}) FROM tbl") }