Quantco · Marius Merkle (MariusMerkleQC) · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
@@ -7,6 +7,13 @@
 import polars as pl
 
 from diffly import compare_frames
+from diffly._conditions import condition_equal_columns
+from diffly._utils import (
+    ABS_TOL_DEFAULT,
+    ABS_TOL_TEMPORAL_DEFAULT,
+    REL_TOL_DEFAULT,
+    Side,
+)
 
 
 def test_summary_lazyframe_not_slower_than_dataframe() -> None:
@@ -74,3 +81,99 @@ def expensive_computation(col: pl.Expr) -> pl.Expr:
         f"({mean_time_lf:.3f}s vs {mean_time_df:.3f}s). "
         f"This suggests unnecessary re-collection of LazyFrames."
     )
+
+
+def test_element_wise_comparison_slower_than_eq_missing_for_list_columns() -> None:
+    """Confirm that comparing list columns with non-tolerance inner types via
+    eq_missing() is significantly faster than the element-wise
+    _compare_sequence_columns() path."""
+    n_rows = 500_000  # rows in the benchmark frame
+    list_len = 20  # each list cell holds list(range(list_len))
+    num_runs_measured = 10  # iterations that count toward the mean
+    num_runs_warmup = 2  # leading iterations discarded before averaging
+
+    col_left = f"val_{Side.LEFT}"  # presumably the per-side name derived from column="val" - TODO confirm
+    col_right = f"val_{Side.RIGHT}"
+    df = pl.DataFrame(
+        {
+            col_left: [list(range(list_len)) for _ in range(n_rows)],  # both sides hold equal lists in every row
+            col_right: [list(range(list_len)) for _ in range(n_rows)],
+        }
+    )
+
+    times_eq = []  # wall-clock seconds per run of the plain eq_missing baseline
+    times_cond = []  # wall-clock seconds per run of condition_equal_columns
+    for _ in range(num_runs_warmup + num_runs_measured):  # both paths are timed back-to-back in each iteration
+        start = time.perf_counter()
+        df.select(pl.col(col_left).eq_missing(pl.col(col_right))).to_series()  # baseline: one eq_missing over the whole column
+        times_eq.append(time.perf_counter() - start)
+
+        start = time.perf_counter()
+        df.select(  # NOTE(review): docstring names _compare_sequence_columns(); confirm this call reaches that path
+            condition_equal_columns(
+                column="val",
+                dtype_left=df.schema[col_left],  # dtypes are read back from the frame rather than hard-coded
+                dtype_right=df.schema[col_right],
+                max_list_length=list_len,  # equals the actual length of every list in the frame
+                abs_tol=ABS_TOL_DEFAULT,
+                rel_tol=REL_TOL_DEFAULT,
+                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
+            )
+        ).to_series()
+        times_cond.append(time.perf_counter() - start)
+
+    mean_time_eq = statistics.mean(times_eq[num_runs_warmup:])  # slice drops the warm-up timings
+    mean_time_cond = statistics.mean(times_cond[num_runs_warmup:])
+
+    ratio = mean_time_cond / mean_time_eq  # > 1 means condition_equal_columns took longer
+    assert ratio > 2.0, (  # fails unless the condition path is more than 2x slower than the baseline
+        f"Element-wise comparison was only {ratio:.1f}x slower than eq_missing "
+        f"({mean_time_cond:.3f}s vs {mean_time_eq:.3f}s). "
+        f"Expected at least 2x slowdown to justify the optimization."
+    )
+
+
+def test_eq_missing_not_slower_than_field_wise_for_struct_columns() -> None:
+    """Ensure that comparing struct columns with non-tolerance fields via eq_missing()
+    is not slower than the field-wise decomposition path."""
+    n_rows = 500_000  # rows in the benchmark frame
+    n_fields = 20  # fields per struct, named f0 .. f19
+    num_runs_measured = 10  # iterations that count toward the mean
+    num_runs_warmup = 2  # leading iterations discarded before averaging
+
+    col_left = f"val_{Side.LEFT}"  # presumably the per-side name derived from column="val" - TODO confirm
+    col_right = f"val_{Side.RIGHT}"
+    struct_data = [{f"f{i}": row + i for i in range(n_fields)} for row in range(n_rows)]  # field f{i} of a row holds row + i
+    df = pl.DataFrame({col_left: struct_data, col_right: struct_data})  # same data on both sides, so all rows compare equal
+
+    times_eq = []  # wall-clock seconds per run of the plain eq_missing baseline
+    times_cond = []  # wall-clock seconds per run of condition_equal_columns
+    for _ in range(num_runs_warmup + num_runs_measured):  # both paths are timed back-to-back in each iteration
+        start = time.perf_counter()
+        df.select(pl.col(col_left).eq_missing(pl.col(col_right))).to_series()  # baseline: one eq_missing over the struct column
+        times_eq.append(time.perf_counter() - start)
+
+        start = time.perf_counter()
+        df.select(
+            condition_equal_columns(
+                column="val",
+                dtype_left=df.schema[col_left],  # dtypes are read back from the frame rather than hard-coded
+                dtype_right=df.schema[col_right],
+                max_list_length=None,  # the frame contains struct columns only, so no list length is supplied
+                abs_tol=ABS_TOL_DEFAULT,
+                rel_tol=REL_TOL_DEFAULT,
+                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
+            )
+        ).to_series()
+        times_cond.append(time.perf_counter() - start)
+
+    mean_time_eq = statistics.mean(times_eq[num_runs_warmup:])  # slice drops the warm-up timings
+    mean_time_cond = statistics.mean(times_cond[num_runs_warmup:])
+
+    ratio = mean_time_cond / mean_time_eq  # > 1 means condition_equal_columns took longer
+    assert ratio < 1.25, (  # tolerates up to 25% overhead relative to the baseline
+        f"condition_equal_columns was {ratio:.1f}x slower than eq_missing "
+        f"({mean_time_cond:.3f}s vs {mean_time_eq:.3f}s). "
+        f"Expected comparable performance since struct<i64> fields should use "
+        f"eq_missing directly."
+    )