diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index f22920b4..10e4da39 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -238,7 +238,7 @@ def time_execution_with_cuda_event( # Warm ups for _ in range(num_warmup): kernel_fn(*args) - torch.cuda.synchronize(device=device) + torch.cuda.synchronize(device=device) # note: empty_cache() below only releases PyTorch's CUDA caching allocator; it does not necessarily clear the device's L2 cache torch.cuda.empty_cache() @@ -465,7 +465,7 @@ def time_execution_with_host_time( # Warm ups for _ in range(num_warmup): kernel_fn(*args) - torch.cuda.synchronize(device=device) + torch.cuda.synchronize(device=device) print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") elapsed_times = [] @@ -534,7 +534,7 @@ def time_execution_with_nsight_python( # Warm ups for _ in range(num_warmup): kernel_fn(*args) - torch.cuda.synchronize(device=device) + torch.cuda.synchronize(device=device) # Clear cache for cold start torch.cuda.empty_cache()