From 460a2a6e05fb0dbd3454b99c1cf1d22cd22d8186 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Sun, 22 Mar 2026 19:21:15 +0000 Subject: [PATCH 1/9] Fix bootstrap CI for RD and RR to use paired per-iteration estimates instead of independent delta method Centre RR CI on the point estimate rather than mean Compute RR SE CI on log scale using bootstrap mean and SE of log(RR), then back-transform --- pySEQTarget/analysis/_risk_estimates.py | 123 +++++++++++++++++++----- pySEQTarget/analysis/_survival_pred.py | 27 ++++++ 2 files changed, 124 insertions(+), 26 deletions(-) diff --git a/pySEQTarget/analysis/_risk_estimates.py b/pySEQTarget/analysis/_risk_estimates.py index c5f5e4a..0199796 100644 --- a/pySEQTarget/analysis/_risk_estimates.py +++ b/pySEQTarget/analysis/_risk_estimates.py @@ -1,3 +1,5 @@ +import math + import polars as pl from scipy import stats @@ -5,7 +7,7 @@ def _compute_rd_rr(comp, has_bootstrap, z=None, group_cols=None): """ Compute Risk Difference and Risk Ratio from a comparison dataframe. - Consolidates the repeated calculation logic. + Fallback used when paired bootstrap data is unavailable (e.g. subgroups). """ if group_cols is None: group_cols = [] @@ -80,20 +82,30 @@ def _risk_estimates(self): group_cols = [self.subgroup_colname] if self.subgroup_colname else [] has_bootstrap = self.bootstrap_nboot > 0 + # Paired approach: compute RD_i and RR_i per bootstrap iteration then take + # SE or percentile of those — correct because arms share the same bootstrap + # samples, so pairing captures their correlation. Falls back to the + # independent delta method for subgroups (not yet supported) or when + # _boot_risks is unavailable. + use_paired = ( + has_bootstrap + and not group_cols + and hasattr(self, "_boot_risks") + and all(tx in self._boot_risks for tx in self.treatment_level) + ) + if has_bootstrap: alpha = 1 - self.bootstrap_CI z = stats.norm.ppf(1 - alpha / 2) else: z = None + alpha = None - # Pre-extract data for each treatment level once (avoid repeated filtering) risk_by_level = {} for tx in self.treatment_level: level_data = risk.filter(pl.col(self.treatment_col) == tx) - risk_by_level[tx] = { - "pred": level_data.select(group_cols + ["pred"]), - } - if has_bootstrap: + risk_by_level[tx] = {"pred": level_data.select(group_cols + ["pred"])} + if has_bootstrap and not use_paired: risk_by_level[tx]["SE"] = level_data.select(group_cols + ["SE"]) rd_comparisons = [] @@ -104,31 +116,90 @@ def _risk_estimates(self): if tx_x == tx_y: continue - # Use pre-extracted data instead of filtering again - risk_x = risk_by_level[tx_x]["pred"].rename({"pred": "risk_x"}) - risk_y = risk_by_level[tx_y]["pred"].rename({"pred": "risk_y"}) - - if group_cols: - comp = risk_x.join(risk_y, on=group_cols, how="left") + if use_paired: + boot_x = ( + self._boot_risks[tx_x] + .filter(pl.col("followup") == last_followup) + .select(["boot_idx", pl.col("risk").alias("risk_x")]) + ) + boot_y = ( + self._boot_risks[tx_y] + .filter(pl.col("followup") == last_followup) + .select(["boot_idx", pl.col("risk").alias("risk_y")]) + ) + paired = boot_x.join(boot_y, on="boot_idx").with_columns( + (pl.col("risk_x") - pl.col("risk_y")).alias("RD") + ) + + risk_x_val = float(risk_by_level[tx_x]["pred"]["pred"][0]) + risk_y_val = float(risk_by_level[tx_y]["pred"]["pred"][0]) + rd_point = risk_x_val - risk_y_val + rr_point = risk_x_val / risk_y_val if risk_y_val != 0 else float("inf") + + # Filter degenerate RR bootstrap values (risk_y == 0 or negative) + valid_rr = paired.filter( + (pl.col("risk_y") > 0) & (pl.col("risk_x") >= 0) + ).with_columns( + (pl.col("risk_x") / pl.col("risk_y")).alias("RR") + ) + + if self.bootstrap_CI_method == "percentile": + rd_lci = float(paired["RD"].quantile(alpha / 2)) + rd_uci = float(paired["RD"].quantile(1 - alpha / 2)) + rr_lci = float(valid_rr["RR"].quantile(alpha / 2)) + rr_uci = float(valid_rr["RR"].quantile(1 - alpha / 2)) + else: + rd_se = float(paired["RD"].std()) + rd_lci = rd_point - z * rd_se + rd_uci = rd_point + z * rd_se + log_rr_se = float(valid_rr["RR"].log().std()) + rr_lci = math.exp(math.log(rr_point) - z * log_rr_se) + rr_uci = math.exp(math.log(rr_point) + z * log_rr_se) + + rd_comp = pl.DataFrame( + { + "A_x": [tx_x], + "A_y": [tx_y], + "Risk Difference": [rd_point], + "RD 95% LCI": [rd_lci], + "RD 95% UCI": [rd_uci], + } + ) + rr_comp = pl.DataFrame( + { + "A_x": [tx_x], + "A_y": [tx_y], + "Risk Ratio": [rr_point], + "RR 95% LCI": [rr_lci], + "RR 95% UCI": [rr_uci], + } + ) else: - comp = risk_x.join(risk_y, how="cross") - - comp = comp.with_columns( - [pl.lit(tx_x).alias("A_x"), pl.lit(tx_y).alias("A_y")] - ) - - if has_bootstrap: - se_x = risk_by_level[tx_x]["SE"].rename({"SE": "se_x"}) - se_y = risk_by_level[tx_y]["SE"].rename({"SE": "se_y"}) + # Fall back to independent delta method + risk_x = risk_by_level[tx_x]["pred"].rename({"pred": "risk_x"}) + risk_y = risk_by_level[tx_y]["pred"].rename({"pred": "risk_y"}) if group_cols: - comp = comp.join(se_x, on=group_cols, how="left") - comp = comp.join(se_y, on=group_cols, how="left") + comp = risk_x.join(risk_y, on=group_cols, how="left") else: - comp = comp.join(se_x, how="cross") - comp = comp.join(se_y, how="cross") + comp = risk_x.join(risk_y, how="cross") + + comp = comp.with_columns( + [pl.lit(tx_x).alias("A_x"), pl.lit(tx_y).alias("A_y")] + ) + + if has_bootstrap: + se_x = risk_by_level[tx_x]["SE"].rename({"SE": "se_x"}) + se_y = risk_by_level[tx_y]["SE"].rename({"SE": "se_y"}) + if group_cols: + comp = comp.join(se_x, on=group_cols, how="left") + comp = comp.join(se_y, on=group_cols, how="left") + else: + comp = comp.join(se_x, how="cross") + comp = comp.join(se_y, how="cross") + + rd_comp, rr_comp = _compute_rd_rr(comp, has_bootstrap, z, group_cols) - rd_comp, rr_comp = _compute_rd_rr(comp, has_bootstrap, z, group_cols) rd_comparisons.append(rd_comp) rr_comparisons.append(rr_comp) diff --git a/pySEQTarget/analysis/_survival_pred.py b/pySEQTarget/analysis/_survival_pred.py index 8ec8dbd..428ee6e 100644 --- a/pySEQTarget/analysis/_survival_pred.py +++ b/pySEQTarget/analysis/_survival_pred.py @@ -4,6 +4,27 @@ from ._outcome_fit import _cast_categories +def _store_boot_risks(obj, treatment_val, TxDT, boot_cols, is_survival=False): + """Store per-bootstrap mean risks per followup for paired RD/RR CI computation.""" + if not boot_cols: + return + df = TxDT.select(["followup"] + boot_cols) + if is_survival: + df = df.with_columns([(1 - pl.col(c)).alias(c) for c in boot_cols]) + obj._boot_risks[treatment_val] = ( + df.unpivot( + index="followup", + on=boot_cols, + variable_name="_col", + value_name="risk", + ) + .with_columns( + pl.col("_col").str.extract(r"(\d+)$").cast(pl.Int32).alias("boot_idx") + ) + .drop("_col") + ) + + def _get_outcome_predictions(self, TxDT, idx=None): data = _cast_categories(self, TxDT.to_pandas()) predictions = {"outcome": []} @@ -27,6 +48,8 @@ def _pred_risk(self): isinstance(self.outcome_model[0], list) if self.outcome_model else False ) + self._boot_risks = {} + if not has_subgroups: return _calculate_risk(self, self.DT, idx=None, val=None) @@ -138,6 +161,8 @@ def _calculate_risk(self, data, idx=None, val=None): ) main_col = "surv" boot_cols = [col for col in surv_names if col != "surv"] + if val is None: + _store_boot_risks(self, treatment_val, TxDT, boot_cols, is_survival=True) else: TxDT = ( TxDT.with_columns( @@ -153,6 +178,8 @@ def _calculate_risk(self, data, idx=None, val=None): ) main_col = "pred_outcome" boot_cols = [col for col in outcome_names if col != "pred_outcome"] + if val is None: + _store_boot_risks(self, treatment_val, TxDT, boot_cols) if boot_cols: risk = ( From f872f594f07f4af5eca2a7194dd64441a74a7735 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Sun, 22 Mar 2026 19:25:35 +0000 Subject: [PATCH 2/9] Bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 059b744..e0e73dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pySEQTarget" -version = "0.12.6" +version = "0.12.7" description = "Sequentially Nested Target Trial Emulation" readme = "README.md" license = {text = "MIT"} From 6ec9cffb2be9eae3f54f904e52b4907adf7b97c5 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:07:52 +0000 Subject: [PATCH 3/9] Document list field defaults in SEQopts docstrings to fix factory display on ReadTheDocs site --- pySEQTarget/SEQopts.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pySEQTarget/SEQopts.py b/pySEQTarget/SEQopts.py index dc51d14..e2072d7 100644 --- a/pySEQTarget/SEQopts.py +++ b/pySEQTarget/SEQopts.py @@ -34,8 +34,8 @@ class SEQopts: :type denominator: Optional[str] or None :param excused: Boolean to allow excused conditions when method is censoring :type excused: bool - :param excused_colnames: Column names (at the same length of treatment_level) specifying excused conditions - :type excused_colnames: List[str] or [] + :param excused_colnames: Column names (at the same length of treatment_level) specifying excused conditions, default ``[]`` + :type excused_colnames: List[str] :param followup_class: Boolean to force followup values to be treated as classes :type followup_class: bool :param followup_include: Boolean to force regular followup values into model covariates @@ -65,9 +65,9 @@ class SEQopts: :type offload_dir: str :param parallel: Boolean to run model fitting in parallel :type parallel: bool - :param plot_colors: List of colors for KM plots, if applicable + :param plot_colors: List of colors for KM plots, if applicable, default ``["#F8766D", "#00BFC4", "#555555"]`` :type plot_colors: List[str] - :param plot_labels: List of length treat_level to specify treatment labeling + :param plot_labels: List of length treat_level to specify treatment labeling, default ``[]`` :type plot_labels: List[str] :param plot_title: Plot title :type plot_title: str @@ -83,14 +83,14 @@ class SEQopts: :type selection_random: bool :param subgroup_colname: Column name for subgroups to share the same weighting but different outcome model fits :type subgroup_colname: str - :param treatment_level: List of eligible treatment levels within treatment_col + :param treatment_level: List of eligible treatment levels within treatment_col, default ``[0, 1]`` :type treatment_level: List[int] :param trial_include: Boolean to force trial values into model covariates :type trial_include: bool :param visit_colname: Column name specifying visit number :type visit_colname: str :param weight_eligible_colnames: List of column names of length - treatment_level to identify which rows are eligible for weight fitting + treatment_level to identify which rows are eligible for weight fitting, default ``[]`` :type weight_eligible_colnames: List[str] :param weight_fit_method: The fitting method to be used ["newton", "bfgs", "lbfgs", "nm"], default "newton" :type weight_fit_method: str From e9f2f5080724fc6f421456efb7c6d6e39f811b39 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:30:12 +0000 Subject: [PATCH 4/9] Resolve ncores default at post_init to avoid misleading ncores=1 on ReadTheDocs --- pySEQTarget/SEQopts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pySEQTarget/SEQopts.py b/pySEQTarget/SEQopts.py index e2072d7..6dea2ea 100644 --- a/pySEQTarget/SEQopts.py +++ b/pySEQTarget/SEQopts.py @@ -54,7 +54,7 @@ class SEQopts: :type indicator_squared: str :param km_curves: Boolean to create survival, risk, and incidence (if applicable) estimates :type km_curves: bool - :param ncores: Number of cores to use if running in parallel + :param ncores: Number of cores to use if running in parallel, default ``max(1, cpu_count() - 1)`` :type ncores: int :param numerator: Override to specify the outcome patsy formula for numerator models; "1" or "" indicate intercept only model @@ -130,7 +130,7 @@ class SEQopts: indicator_baseline: str = "_bas" indicator_squared: str = "_sq" km_curves: bool = False - ncores: int = max(1, multiprocessing.cpu_count() - 1) + ncores: Optional[int] = None numerator: Optional[str] = None offload: bool = False offload_dir: str = "_seq_models" @@ -220,6 +220,8 @@ def _normalize_formulas(self): setattr(self, i, "".join(attr.split())) def __post_init__(self): + if self.ncores is None: + self.ncores = max(1, multiprocessing.cpu_count() - 1) self._validate_bools() self._validate_ranges() self._validate_choices() From 73f3549d822b73ca08df2af08a6cdfd659bdf23d Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:33:28 +0000 Subject: [PATCH 5/9] Fix RST docstring formatting in SEQoutput summary and retrieve_data methods --- pySEQTarget/SEQoutput.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pySEQTarget/SEQoutput.py b/pySEQTarget/SEQoutput.py index 5587441..cdc11eb 100644 --- a/pySEQTarget/SEQoutput.py +++ b/pySEQTarget/SEQoutput.py @@ -76,6 +76,7 @@ def summary( ) -> List: """ Returns a list of model summaries of either the numerator, denominator, outcome, or competing event models + :param type: Indicator for which model list you would like returned :type type: str """ @@ -108,6 +109,7 @@ def retrieve_data( ) -> pl.DataFrame: """ Getter for data stored within ``SEQoutput`` + :param type: Data which you would like to access, ['km_data', 'hazard', 'risk_ratio', 'risk_difference', 'unique_outcomes', 'nonunique_outcomes', 'unique_switches', 'nonunique_switches'] From 2a80fc6a4f9555ea079caed170b2e78a2f729789 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:36:26 +0000 Subject: [PATCH 6/9] Update weight_preexpansion to True in advanced models vignette to match new default --- docs/vignettes/more_advanced_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vignettes/more_advanced_models.md b/docs/vignettes/more_advanced_models.md index f526304..59ca0de 100644 --- a/docs/vignettes/more_advanced_models.md +++ b/docs/vignettes/more_advanced_models.md @@ -25,7 +25,7 @@ my_options = SEQopts( weighted = True, # enables the weighting weight_lag_condition=False, # turn off lag condition when weighting for adherance weight_p99 = True, # bounds weights by the 1st and 99th percentile - weight_preexpansion = False # weights are predicted using post-expansion data as a stabilizer + weight_preexpansion = True # weights are predicted using pre-expansion data ) ``` From 06bc564583c3d601bf8d4e3406d507e921cc0ebb Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:39:00 +0000 Subject: [PATCH 7/9] Add note in vignette that bootstrap_nboot=20 is for demonstration only --- docs/vignettes/more_advanced_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vignettes/more_advanced_models.md b/docs/vignettes/more_advanced_models.md index 59ca0de..ed9fd5a 100644 --- a/docs/vignettes/more_advanced_models.md +++ b/docs/vignettes/more_advanced_models.md @@ -15,7 +15,7 @@ from pySEQTarget.data import load_data data = load_data("SEQdata_LTFU") my_options = SEQopts( - bootstrap_nboot = 20, # 20 bootstrap iterations + bootstrap_nboot = 20, # 20 bootstrap iterations (for demonstration only — use 500+ in practice) cense_colname = "LTFU", # control for losses-to-followup as a censor excused = True, # allow excused treatment swapping excused_colnames = ["excusedZero", "excusedOne"], From c025b0fa92f899a8b226216e4099bbdd33e56312 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:46:15 +0000 Subject: [PATCH 8/9] Fix link --- docs/vignettes/exploring_results.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vignettes/exploring_results.md b/docs/vignettes/exploring_results.md index 1f89b8e..0984e98 100644 --- a/docs/vignettes/exploring_results.md +++ b/docs/vignettes/exploring_results.md @@ -1,6 +1,6 @@ # Exploring Results -Recall our previous example, {doc}`~vignettes/more_advanced_models`, where we finalized and collected our results with +Recall our previous example, {doc}`More Advanced Analysis `, where we finalized and collected our results with ```python my_output = my_analysis.collect() From 90b50d942cf355abca2820200c72e2c5d14beb98 Mon Sep 17 00:00:00 2001 From: Tom Palmer Date: Mon, 23 Mar 2026 16:46:30 +0000 Subject: [PATCH 9/9] File ends with a newline --- docs/vignettes/exploring_results.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vignettes/exploring_results.md b/docs/vignettes/exploring_results.md index 0984e98..f78ad06 100644 --- a/docs/vignettes/exploring_results.md +++ b/docs/vignettes/exploring_results.md @@ -185,4 +185,4 @@ Because we have an excused-censoring analysis, we are also provided with informa | 1 | False | 1 | 1256 | | 1 | False | 0 | 527107 | | 1 | True | 0 | 18508 | -| 0 | False | 0 | 91300 | \ No newline at end of file +| 0 | False | 0 | 91300 |