Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions tests/test_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
import polars as pl

from diffly import compare_frames
from diffly._conditions import condition_equal_columns
from diffly._utils import (
ABS_TOL_DEFAULT,
ABS_TOL_TEMPORAL_DEFAULT,
REL_TOL_DEFAULT,
Side,
)


def test_summary_lazyframe_not_slower_than_dataframe() -> None:
Expand Down Expand Up @@ -74,3 +81,99 @@ def expensive_computation(col: pl.Expr) -> pl.Expr:
f"({mean_time_lf:.3f}s vs {mean_time_df:.3f}s). "
f"This suggests unnecessary re-collection of LazyFrames."
)


def test_element_wise_comparison_slower_than_eq_missing_for_list_columns() -> None:
    """Check that eq_missing() clearly beats the element-wise list comparison.

    Two identical list columns with a non-tolerance inner type are compared
    twice per run: once with a plain ``eq_missing()`` expression and once with
    the expression built by ``condition_equal_columns()`` (the element-wise
    ``_compare_sequence_columns()`` path). After discarding the warm-up runs,
    the mean time of the latter must be more than twice that of the former.
    """
    row_count = 500_000
    items_per_list = 20
    warmup_runs = 2
    measured_runs = 10

    left_name = f"val_{Side.LEFT}"
    right_name = f"val_{Side.RIGHT}"
    frame = pl.DataFrame(
        {
            name: [list(range(items_per_list)) for _ in range(row_count)]
            for name in (left_name, right_name)
        }
    )

    # One (eq_missing, condition_equal_columns) pair of durations per run.
    samples: list[tuple[float, float]] = []
    for _ in range(warmup_runs + measured_runs):
        tick = time.perf_counter()
        frame.select(pl.col(left_name).eq_missing(pl.col(right_name))).to_series()
        eq_elapsed = time.perf_counter() - tick

        # Building the expression is deliberately part of the timed region.
        tick = time.perf_counter()
        frame.select(
            condition_equal_columns(
                column="val",
                dtype_left=frame.schema[left_name],
                dtype_right=frame.schema[right_name],
                max_list_length=items_per_list,
                abs_tol=ABS_TOL_DEFAULT,
                rel_tol=REL_TOL_DEFAULT,
                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
            )
        ).to_series()
        cond_elapsed = time.perf_counter() - tick

        samples.append((eq_elapsed, cond_elapsed))

    # Drop the warm-up runs before averaging.
    eq_durations, cond_durations = zip(*samples[warmup_runs:])
    eq_mean = statistics.mean(eq_durations)
    cond_mean = statistics.mean(cond_durations)

    slowdown = cond_mean / eq_mean
    failure_message = (
        f"Element-wise comparison was only {slowdown:.1f}x slower than eq_missing "
        f"({cond_mean:.3f}s vs {eq_mean:.3f}s). "
        "Expected at least 2x slowdown to justify the optimization."
    )
    assert slowdown > 2.0, failure_message


def test_eq_missing_not_slower_than_field_wise_for_struct_columns() -> None:
    """Check that struct columns compare about as fast as plain eq_missing().

    Two identical struct columns with non-tolerance fields are compared twice
    per run: once with a plain ``eq_missing()`` expression and once with the
    expression built by ``condition_equal_columns()``. After discarding the
    warm-up runs, the mean time of the latter must stay below 1.25x the mean
    time of the former.
    """
    row_count = 500_000
    field_count = 20
    warmup_runs = 2
    measured_runs = 10

    left_name = f"val_{Side.LEFT}"
    right_name = f"val_{Side.RIGHT}"
    # Each row is a struct with fields f0..f19 holding consecutive integers.
    rows = [
        {f"f{idx}": base + idx for idx in range(field_count)}
        for base in range(row_count)
    ]
    frame = pl.DataFrame({left_name: rows, right_name: rows})

    # One (eq_missing, condition_equal_columns) pair of durations per run.
    samples: list[tuple[float, float]] = []
    for _ in range(warmup_runs + measured_runs):
        tick = time.perf_counter()
        frame.select(pl.col(left_name).eq_missing(pl.col(right_name))).to_series()
        eq_elapsed = time.perf_counter() - tick

        # Building the expression is deliberately part of the timed region.
        tick = time.perf_counter()
        frame.select(
            condition_equal_columns(
                column="val",
                dtype_left=frame.schema[left_name],
                dtype_right=frame.schema[right_name],
                max_list_length=None,
                abs_tol=ABS_TOL_DEFAULT,
                rel_tol=REL_TOL_DEFAULT,
                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
            )
        ).to_series()
        cond_elapsed = time.perf_counter() - tick

        samples.append((eq_elapsed, cond_elapsed))

    # Drop the warm-up runs before averaging.
    eq_durations, cond_durations = zip(*samples[warmup_runs:])
    eq_mean = statistics.mean(eq_durations)
    cond_mean = statistics.mean(cond_durations)

    slowdown = cond_mean / eq_mean
    failure_message = (
        f"condition_equal_columns was {slowdown:.1f}x slower than eq_missing "
        f"({cond_mean:.3f}s vs {eq_mean:.3f}s). "
        "Expected comparable performance since struct<i64> fields should use "
        "eq_missing directly."
    )
    assert slowdown < 1.25, failure_message
Loading