diff --git a/tests/test_performance.py b/tests/test_performance.py
index 79de4d8..b776028 100644
--- a/tests/test_performance.py
+++ b/tests/test_performance.py
@@ -7,6 +7,13 @@
 import polars as pl
 
 from diffly import compare_frames
+from diffly._conditions import condition_equal_columns
+from diffly._utils import (
+    ABS_TOL_DEFAULT,
+    ABS_TOL_TEMPORAL_DEFAULT,
+    REL_TOL_DEFAULT,
+    Side,
+)
 
 
 def test_summary_lazyframe_not_slower_than_dataframe() -> None:
@@ -74,3 +81,99 @@
         f"({mean_time_lf:.3f}s vs {mean_time_df:.3f}s). "
         f"This suggests unnecessary re-collection of LazyFrames."
     )
+
+
+def test_element_wise_comparison_slower_than_eq_missing_for_list_columns() -> None:
+    """Confirm that comparing list columns with non-tolerance inner types via
+    eq_missing() is significantly faster than the element-wise
+    _compare_sequence_columns() path."""
+    n_rows = 500_000
+    list_len = 20
+    num_runs_measured = 10
+    num_runs_warmup = 2
+
+    col_left = f"val_{Side.LEFT}"
+    col_right = f"val_{Side.RIGHT}"
+    df = pl.DataFrame(
+        {
+            col_left: [list(range(list_len)) for _ in range(n_rows)],
+            col_right: [list(range(list_len)) for _ in range(n_rows)],
+        }
+    )
+
+    times_eq = []
+    times_cond = []
+    for _ in range(num_runs_warmup + num_runs_measured):
+        start = time.perf_counter()
+        df.select(pl.col(col_left).eq_missing(pl.col(col_right))).to_series()
+        times_eq.append(time.perf_counter() - start)
+
+        start = time.perf_counter()
+        df.select(
+            condition_equal_columns(
+                column="val",
+                dtype_left=df.schema[col_left],
+                dtype_right=df.schema[col_right],
+                max_list_length=list_len,
+                abs_tol=ABS_TOL_DEFAULT,
+                rel_tol=REL_TOL_DEFAULT,
+                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
+            )
+        ).to_series()
+        times_cond.append(time.perf_counter() - start)
+
+    mean_time_eq = statistics.mean(times_eq[num_runs_warmup:])
+    mean_time_cond = statistics.mean(times_cond[num_runs_warmup:])
+
+    ratio = mean_time_cond / mean_time_eq
+    assert ratio > 2.0, (
+        f"Element-wise comparison was only {ratio:.1f}x slower than eq_missing "
+        f"({mean_time_cond:.3f}s vs {mean_time_eq:.3f}s). "
+        f"Expected at least 2x slowdown to justify the optimization."
+    )
+
+
+def test_eq_missing_not_slower_than_field_wise_for_struct_columns() -> None:
+    """Ensure that comparing struct columns with non-tolerance fields via eq_missing()
+    is not slower than the field-wise decomposition path."""
+    n_rows = 500_000
+    n_fields = 20
+    num_runs_measured = 10
+    num_runs_warmup = 2
+
+    col_left = f"val_{Side.LEFT}"
+    col_right = f"val_{Side.RIGHT}"
+    struct_data = [{f"f{i}": row + i for i in range(n_fields)} for row in range(n_rows)]
+    df = pl.DataFrame({col_left: struct_data, col_right: struct_data})
+
+    times_eq = []
+    times_cond = []
+    for _ in range(num_runs_warmup + num_runs_measured):
+        start = time.perf_counter()
+        df.select(pl.col(col_left).eq_missing(pl.col(col_right))).to_series()
+        times_eq.append(time.perf_counter() - start)
+
+        start = time.perf_counter()
+        df.select(
+            condition_equal_columns(
+                column="val",
+                dtype_left=df.schema[col_left],
+                dtype_right=df.schema[col_right],
+                max_list_length=None,
+                abs_tol=ABS_TOL_DEFAULT,
+                rel_tol=REL_TOL_DEFAULT,
+                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
+            )
+        ).to_series()
+        times_cond.append(time.perf_counter() - start)
+
+    mean_time_eq = statistics.mean(times_eq[num_runs_warmup:])
+    mean_time_cond = statistics.mean(times_cond[num_runs_warmup:])
+
+    ratio = mean_time_cond / mean_time_eq
+    assert ratio < 1.25, (
+        f"condition_equal_columns was {ratio:.1f}x slower than eq_missing "
+        f"({mean_time_cond:.3f}s vs {mean_time_eq:.3f}s). "
+        f"Expected comparable performance since struct fields should use "
+        f"eq_missing directly."
+    )