Skip to content
  •  
  •  
  •  
6 changes: 0 additions & 6 deletions .github/actions/java-test/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,6 @@ inputs:
description: 'Maven options passed to the mvn command'
required: false
default: ''
scan_impl:
description: 'The default Parquet scan implementation'
required: false
default: 'auto'
upload-test-reports:
description: 'Whether to upload test results including coverage to GitHub'
required: false
Expand Down Expand Up @@ -72,7 +68,6 @@ runs:
shell: bash
if: ${{ inputs.suites == '' }}
env:
COMET_PARQUET_SCAN_IMPL: ${{ inputs.scan_impl }}
SPARK_LOCAL_HOSTNAME: "localhost"
SPARK_LOCAL_IP: "127.0.0.1"
run: |
Expand All @@ -81,7 +76,6 @@ runs:
shell: bash
if: ${{ inputs.suites != '' }}
env:
COMET_PARQUET_SCAN_IMPL: ${{ inputs.scan_impl }}
SPARK_LOCAL_HOSTNAME: "localhost"
SPARK_LOCAL_IP: "127.0.0.1"
run: |
Expand Down
10 changes: 2 additions & 8 deletions .github/workflows/pr_build_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -242,27 +242,22 @@ jobs:
- name: "Spark 3.4, JDK 11, Scala 2.12"
java_version: "11"
maven_opts: "-Pspark-3.4 -Pscala-2.12"
scan_impl: "auto"

- name: "Spark 3.5.5, JDK 17, Scala 2.13"
java_version: "17"
maven_opts: "-Pspark-3.5 -Dspark.version=3.5.5 -Pscala-2.13"
scan_impl: "auto"

- name: "Spark 3.5.6, JDK 17, Scala 2.13"
java_version: "17"
maven_opts: "-Pspark-3.5 -Dspark.version=3.5.6 -Pscala-2.13"
scan_impl: "auto"

- name: "Spark 3.5, JDK 17, Scala 2.12"
java_version: "17"
maven_opts: "-Pspark-3.5 -Pscala-2.12"
scan_impl: "native_iceberg_compat"

- name: "Spark 4.0, JDK 17"
java_version: "17"
maven_opts: "-Pspark-4.0"
scan_impl: "auto"
suite:
- name: "fuzz"
value: |
Expand Down Expand Up @@ -347,7 +342,7 @@ jobs:
value: |
org.apache.spark.sql.CometToPrettyStringSuite
fail-fast: false
name: ${{ matrix.profile.name }}/${{ matrix.profile.scan_impl }} [${{ matrix.suite.name }}]
name: ${{ matrix.profile.name }} [${{ matrix.suite.name }}]
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion-comet', github.run_id) || 'ubuntu-latest' }}
container:
image: amd64/rust
Expand Down Expand Up @@ -385,10 +380,9 @@ jobs:
- name: Java test steps
uses: ./.github/actions/java-test
with:
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}-${{ matrix.profile.scan_impl }}
artifact_name: ${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
suites: ${{ matrix.suite.name == 'sql' && matrix.profile.name == 'Spark 3.4, JDK 11, Scala 2.12' && '' || matrix.suite.value }}
maven_opts: ${{ matrix.profile.maven_opts }}
scan_impl: ${{ matrix.profile.scan_impl }}
upload-test-reports: true
skip-native-build: true

Expand Down
16 changes: 7 additions & 9 deletions .github/workflows/spark_sql_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,18 +131,16 @@ jobs:
- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
# Since 4f5eaf0, auto mode uses native_datafusion for V1 scans,
# so we only need to test with auto.
config:
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto'}
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto'}
- {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto'}
- {spark-short: '3.4', spark-full: '3.4.3', java: 11}
- {spark-short: '3.5', spark-full: '3.5.8', java: 11}
- {spark-short: '4.0', spark-full: '4.0.1', java: 17}
# Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
exclude:
- config: {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto'}
- config: {spark-short: '4.0', spark-full: '4.0.1', java: 17}
module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
fail-fast: false
name: spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}
name: spark-sql-${{ matrix.module.name }}/spark-${{ matrix.config.spark-full }}
runs-on: ${{ matrix.os }}
container:
image: amd64/rust
Expand All @@ -168,7 +166,7 @@ jobs:
run: |
cd apache-spark
rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true COMET_PARQUET_SCAN_IMPL=${{ matrix.config.scan-impl }} ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ github.event.inputs.collect-fallback-logs || 'false' }} \
build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
if [ "${{ github.event.inputs.collect-fallback-logs }}" = "true" ]; then
find . -type f -name "unit-tests.log" -print0 | xargs -0 grep -h "Comet cannot accelerate" | sed 's/.*Comet cannot accelerate/Comet cannot accelerate/' | sort -u > fallback.log
Expand All @@ -179,7 +177,7 @@ jobs:
if: ${{ github.event.inputs.collect-fallback-logs == 'true' }}
uses: actions/upload-artifact@v7
with:
name: fallback-log-spark-sql-${{ matrix.config.scan-impl }}-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}
name: fallback-log-spark-sql-${{ matrix.module.name }}-spark-${{ matrix.config.spark-full }}
path: "**/fallback.log"

merge-fallback-logs:
Expand Down
72 changes: 0 additions & 72 deletions .github/workflows/spark_sql_test_native_iceberg_compat.yml

This file was deleted.

19 changes: 10 additions & 9 deletions common/src/main/scala/org/apache/comet/CometConf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -114,22 +114,23 @@ object CometConf extends ShimCometConf {
.booleanConf
.createWithEnvVarOrDefault("ENABLE_COMET_WRITE", false)

@deprecated
val SCAN_NATIVE_DATAFUSION = "native_datafusion"

@deprecated
val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"

@deprecated
val SCAN_AUTO = "auto"

@deprecated
val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
.category(CATEGORY_PARQUET)
.doc(
"The implementation of Comet's Parquet scan to use. Available scans are " +
s"`$SCAN_NATIVE_DATAFUSION`, and `$SCAN_NATIVE_ICEBERG_COMPAT`. " +
s"`$SCAN_NATIVE_DATAFUSION` is a fully native implementation, and " +
s"`$SCAN_NATIVE_ICEBERG_COMPAT` is a hybrid implementation that supports some " +
"additional features, such as row indexes and field ids. " +
s"`$SCAN_AUTO` (default) chooses the best available scan based on the scan schema.")
.category(CATEGORY_TESTING)
.internal()
.doc("This configuration option is deprecated and has no effect on Comet behavior.")
.stringConf
.transform(_.toLowerCase(Locale.ROOT))
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
.checkValues(Set(SCAN_NATIVE_DATAFUSION, SCAN_AUTO))
.createWithEnvVarOrDefault("COMET_PARQUET_SCAN_IMPL", SCAN_AUTO)

val COMET_ICEBERG_NATIVE_ENABLED: ConfigEntry[Boolean] =
Expand Down
8 changes: 3 additions & 5 deletions docs/source/contributor-guide/bug_triage.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ help contributors find bugs in their area of expertise.
| `area:ffi` | Arrow FFI / JNI boundary |
| `area:ci` | CI/CD, GitHub Actions, build tooling |

The following pre-existing labels also serve as area indicators: `native_datafusion`,
`native_iceberg_compat`, `spark 4`, `spark sql tests`.
The following pre-existing labels also serve as area indicators: `spark 4`, `spark sql tests`.

## Triage Process

Expand Down Expand Up @@ -109,9 +108,8 @@ Periodically review open bugs to ensure priorities are still accurate:
crashes, because crashes are at least visible.
2. **User-reported over test-only.** A bug hit by a real user on a real workload takes priority
over one found only in test suites.
3. **Core path over experimental.** Bugs in the default scan mode (`native_comet`) or widely used
expressions take priority over bugs in experimental features like `native_datafusion` or
`native_iceberg_compat`.
3. **Core path over experimental.** Bugs in widely used expressions and operators take priority over
bugs in experimental features.
4. **Production safety over feature completeness.** Fixing a data corruption bug is more important
than adding support for a new expression.

Expand Down
Loading
Loading