From dad92d89747b808502a0ca70a5fb20a7da296c8b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 26 Mar 2026 15:37:02 -0400 Subject: [PATCH] Solid particle implementation for E-L solver --- docs/documentation/case.md | 52 +- docs/documentation/equations.md | 40 + docs/module_categories.json | 2 + src/common/m_boundary_common.fpp | 529 ++++- src/common/m_constants.fpp | 12 +- src/common/m_derived_types.fpp | 141 +- src/common/m_helper.fpp | 111 +- src/common/m_helper_basic.fpp | 42 +- src/common/m_mpi_common.fpp | 518 +++- src/post_process/m_global_parameters.fpp | 136 +- src/post_process/m_mpi_proxy.fpp | 53 +- src/post_process/m_start_up.fpp | 98 +- src/pre_process/m_global_parameters.fpp | 207 +- src/pre_process/m_mpi_proxy.fpp | 33 +- src/pre_process/m_start_up.fpp | 152 +- src/simulation/m_bubbles.fpp | 300 ++- src/simulation/m_bubbles_EE.fpp | 37 +- src/simulation/m_bubbles_EL.fpp | 1347 +++++++---- src/simulation/m_bubbles_EL_kernels.fpp | 623 +++-- src/simulation/m_compute_levelset.fpp | 87 +- src/simulation/m_data_output.fpp | 264 +- src/simulation/m_global_parameters.fpp | 229 +- src/simulation/m_ib_patches.fpp | 218 +- src/simulation/m_ibm.fpp | 140 +- src/simulation/m_mpi_proxy.fpp | 1231 +++++++++- src/simulation/m_particles_EL.fpp | 2645 +++++++++++++++++++++ src/simulation/m_particles_EL_kernels.fpp | 904 +++++++ src/simulation/m_rhs.fpp | 165 +- src/simulation/m_sim_helpers.fpp | 109 +- src/simulation/m_start_up.fpp | 140 +- src/simulation/m_time_steppers.fpp | 74 +- toolchain/mfc/params/definitions.py | 35 +- toolchain/mfc/params/descriptions.py | 8 + 33 files changed, 9195 insertions(+), 1487 deletions(-) create mode 100644 src/simulation/m_particles_EL.fpp create mode 100644 src/simulation/m_particles_EL_kernels.fpp diff --git a/docs/documentation/case.md b/docs/documentation/case.md index ecdeb786d4..1ae3851b33 100644 --- a/docs/documentation/case.md +++ b/docs/documentation/case.md @@ -266,6 +266,16 @@ Setup: Only requires specifying 
`init_dir` and filename pattern via `zeros_defau Implementation: All variables and file handling are managed in `src/common/include/ExtrusionHardcodedIC.fpp` with no manual grid configuration needed. Usage: Ideal for initializing simulations from lower-dimensional solutions, enabling users to add perturbations or modifications to the base extruded fields for flow instability studies. +The following parameters support hardcoded initial conditions that read interface data from files: + +| Parameter | Type | Description | +| ---: | :---: | :--- | +| `interface_file` | String | Path to interface geometry data file | +| `normFac` | Real | Interface normalization factor | +| `normMag` | Real | Interface normal magnitude | +| `g0_ic` | Real | Initial gas volume fraction for interfacial IC | +| `p0_ic` | Real | Initial pressure for interfacial IC | + #### Parameter Descriptions - `num_patches` defines the total number of patches defined in the domain. @@ -788,7 +798,7 @@ Details of the transducer acoustic source model can be found in \cite Maeda17. 
| ---: | :----: | :--- | | `bubbles_euler` | Logical | Ensemble-averaged bubble modeling | | `bubbles_lagrange` | Logical | Volume-averaged bubble modeling | -| `bubble_model` | Integer | [1] Gilmore; [2] Keller--Miksis; [3] Rayleigh-Plesset | +| `bubble_model` | Integer | [0] Particle; [1] Gilmore; [2] Keller--Miksis; [3] Rayleigh-Plesset | | `Ca` | Real | Cavitation number | | `Web` | Real | Weber number | | `Re_inv` | Real | Inverse Reynolds number | @@ -892,6 +902,13 @@ When ``polytropic = 'F'``, the gas compression is modeled as non-polytropic due | `epsilonb` | Real | Standard deviation scaling for the gaussian function | | `charwidth` | Real | Domain virtual depth (z direction, for 2D simulations) | | `valmaxvoid` | Real | Maximum void fraction permitted | +| `drag_model` | Integer | Drag model for bubble dynamics | +| `vel_model` | Integer | Velocity model for bubble interface | +| `charNz` | Integer | Characteristic size parameter | +| `input_path` | String | Path to bubble input file (default: `./input`) | +| `pressure_force` | Logical | Enable pressure gradient force | +| `gravity_force` | Logical | Enable gravitational force | +| `write_void_evol` | Logical | Write void fraction evolution data | - `nBubs_glb` Total number of bubbles. Their initial conditions need to be specified in the ./input/lag_bubbles.dat file. See the example cases for additional information. @@ -905,6 +922,39 @@ When ``polytropic = 'F'``, the gas compression is modeled as non-polytropic due - `massTransfer_model` Activates the mass transfer model at the bubble's interface based on (\cite Preston07). +#### 9.3 Lagrangian Solid Particle Model + +| Parameter | Type | Description | +| ---: | :---: | :--- | +| `particles_lagrange` | Logical | Lagrangian solid particle model switch | +| `nParticles_glb` | Integer | Global number of particles | +| `solver_approach` | Integer | 1: One-way coupling, 2: Two-way coupling | +| `smooth_type` | Integer | Smoothing function. 
1: Gaussian, 2: Delta 3x3 | +| `stokes_drag` | Integer | Stokes drag model flag | +| `qs_drag_model` | Integer | Quasi-steady drag model (0: off, 1: Parmar, 2: Modified Parmar, 3: Osnes, 4: Gidaspow) | +| `added_mass_model` | Integer | Added mass model (0: off, >0: active) | +| `interpolation_order` | Integer | Polynomial order for barycentric field interpolation | +| `collision_force` | Logical | Enable soft-sphere DEM particle-particle collisions | +| `pressure_force` | Logical | Enable pressure gradient force on particles | +| `gravity_force` | Logical | Enable gravitational force on particles | +| `write_void_evol` | Logical | Write void fraction evolution data | +| `epsilonb` | Real | Standard deviation scaling for the Gaussian kernel | +| `valmaxvoid` | Real | Maximum void fraction permitted | +| `particle_pp%%rho0ref_particle` | Real | Reference particle material density | +| `particle_pp%%cp_particle` | Real | Particle specific heat capacity | + +- `particles_lagrange` activates the Euler-Lagrange solid particle solver. Particle initial conditions are read from `./input/lag_particles.dat`. The solver tracks non-deformable spherical particles in a compressible carrier flow using volume-averaged source terms (\cite Maeda18). + +- `nParticles_glb` specifies the total number of particles across all MPI ranks. Their initial positions, velocities, and radii must be specified in the input file. + +- `solver_approach` specifies the coupling method: [1] one-way coupling where particles are advected by the flow but do not influence it, [2] two-way coupling where particle forces are projected back onto the Eulerian grid as source terms. + +- `qs_drag_model` selects the quasi-steady drag correlation: [1] Parmar et al. (2010) with Sangani volume fraction correction, [2] Modified Parmar with Osnes et al. (2023) volume fraction correction, [3] Osnes et al. (2023) full correlation with Loth et al. 
(2021) rarefied regime, [4] Gidaspow (1994) correlation for dense particle suspensions. + +- `collision_force` activates soft-sphere DEM collisions using a spring-dashpot contact model with Hertzian stiffness. Collision forces between particles on different MPI ranks are communicated via non-blocking point-to-point messaging. + +- `interpolation_order` sets the order of the barycentric Lagrange polynomial used to interpolate Eulerian field quantities (pressure, velocity, density) to particle positions. Must be even; the interpolation stencil uses `N/2` points in each direction. + ### 10. Velocity Field Setup {#sec-velocity-field-setup} | Parameter | Type | Description | diff --git a/docs/documentation/equations.md b/docs/documentation/equations.md index 6bb17aa9de..b69a29adf2 100644 --- a/docs/documentation/equations.md +++ b/docs/documentation/equations.md @@ -514,6 +514,46 @@ with \f$\sigma = \varepsilon_b \max(\Delta x^{1/3}_\text{cell},\;R_\text{bubble} Each bubble is tracked individually with Keller-Miksis dynamics and 4th-order adaptive Runge-Kutta time integration. +### 6.3 Euler-Lagrange Solid Particles (`particles_lagrange = .true.`) + +**Source:** `src/simulation/m_particles_EL.fpp`, `src/simulation/m_particles_EL_kernels.fpp` + +The Euler-Lagrange particle solver tracks non-deformable solid particles in a compressible carrier flow. +The volume-averaged carrier flow equations use the same source term framework as the bubble model (Section 6.2), +with the particle volume fraction \f$\alpha_p\f$ replacing the bubble void fraction \f$\alpha\f$. + +**Particle volume fraction via Gaussian kernel:** + +\f[\alpha_p(\mathbf{x}) = \sum_n V_{p,n}\,\delta_\sigma(\mathbf{x} - \mathbf{x}_n)\f] + +where \f$V_{p,n} = \frac{4}{3}\pi R_n^3\f$ is the volume of particle \f$n\f$ and \f$\delta_\sigma\f$ is the Gaussian regularization kernel from Section 6.2. 
+ +**Particle equation of motion:** + +\f[m_p \frac{d\mathbf{u}_p}{dt} = \mathbf{F}_\text{drag} + \mathbf{F}_\text{pressure} + \mathbf{F}_\text{AM} + \mathbf{F}_\text{gravity} + \mathbf{F}_\text{collision}\f] + +**Quasi-steady drag force:** + +\f[\mathbf{F}_\text{drag} = \beta\,(\mathbf{u}_f - \mathbf{u}_p)\f] + +where \f$\beta\f$ is a drag coefficient computed from one of several correlations selected via `qs_drag_model`: +- Parmar et al. (2010): Re and Ma corrections with Sangani et al. (1991) volume fraction correction +- Modified Parmar: Re and Ma corrections with Osnes et al. (2023) volume fraction correction +- Osnes et al. (2023): comprehensive correlation for compressible flow through random particle suspensions +- Gidaspow (1994): Ergun/Wen-Yu correlation for dense suspensions + +**Pressure gradient force** (`pressure_force = .true.`): + +\f[\mathbf{F}_\text{pressure} = -V_p\,\nabla p\f] + +**Added mass force** (`added_mass_model > 0`): + +\f[\mathbf{F}_\text{AM} = C_\text{AM}\,\rho_f\,V_p\left(\frac{D\mathbf{u}_f}{Dt} - \frac{d\mathbf{u}_p}{dt}\right)\f] + +**Collision force** (`collision_force = .true.`): + +Soft-sphere DEM model with Hertzian contact stiffness and viscous damping. + --- ## 7. 
Fluid-Structure Interaction diff --git a/docs/module_categories.json b/docs/module_categories.json index d1da0ed5ed..6d64a0e42f 100644 --- a/docs/module_categories.json +++ b/docs/module_categories.json @@ -19,6 +19,8 @@ "m_bubbles_EE", "m_bubbles_EL", "m_bubbles_EL_kernels", + "m_particles_EL", + "m_particles_EL_kernels", "m_qbmm", "m_hyperelastic", "m_hypoelastic", diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp index 918b0daea5..eb3b32a342 100644 --- a/src/common/m_boundary_common.fpp +++ b/src/common/m_boundary_common.fpp @@ -20,13 +20,16 @@ module m_boundary_common type(scalar_field), dimension(:,:), allocatable :: bc_buffers $:GPU_DECLARE(create='[bc_buffers]') + type(int_bounds_info), dimension(3) :: beta_bc_bounds + $:GPU_DECLARE(create='[beta_bc_bounds]') + #ifdef MFC_MPI integer, dimension(1:3,1:2) :: MPI_BC_TYPE_TYPE integer, dimension(1:3,1:2) :: MPI_BC_BUFFER_TYPE #endif - private; public :: s_initialize_boundary_common_module, s_populate_variables_buffers, s_create_mpi_types, & - & s_populate_capillary_buffers, s_populate_F_igr_buffers, s_write_serial_boundary_condition_files, & + private; public :: s_initialize_boundary_common_module, s_populate_variables_buffers, s_populate_beta_buffers, & + & s_create_mpi_types, s_populate_capillary_buffers, s_populate_F_igr_buffers, s_write_serial_boundary_condition_files, & & s_write_parallel_boundary_condition_files, s_read_serial_boundary_condition_files, & & s_read_parallel_boundary_condition_files, s_assign_default_bc_type, s_populate_grid_variables_buffers, & & s_finalize_boundary_common_module @@ -39,7 +42,7 @@ module m_boundary_common contains - !> Allocate and set up boundary condition buffer arrays for all coordinate directions. + !> @brief Allocates and sets up boundary condition buffer arrays for all coordinate directions. impure subroutine s_initialize_boundary_common_module() integer :: i, j @@ -68,9 +71,26 @@ contains end do end if + if (bubbles_lagrange .or. 
particles_lagrange) then + beta_bc_bounds(1)%beg = -mapcells - 1 + beta_bc_bounds(1)%end = m + mapcells + 1 + ! n > 0 always for bubbles_lagrange + beta_bc_bounds(2)%beg = -mapcells - 1 + beta_bc_bounds(2)%end = n + mapcells + 1 + if (p == 0) then + beta_bc_bounds(3)%beg = 0 + beta_bc_bounds(3)%end = 0 + else + beta_bc_bounds(3)%beg = -mapcells - 1 + beta_bc_bounds(3)%end = p + mapcells + 1 + end if + end if + $:GPU_UPDATE(device='[beta_bc_bounds]') + end subroutine s_initialize_boundary_common_module - !> Populate the buffers of the primitive variables based on the selected boundary conditions. + !> The purpose of this procedure is to populate the buffers of the primitive variables, depending on the selected boundary + !! conditions. impure subroutine s_populate_variables_buffers(bc_type, q_prim_vf, pb_in, mv_in) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf @@ -78,7 +98,7 @@ contains type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type integer :: k, l - ! BC type codes defined in m_constants.fpp; non-negative values are MPI boundaries + ! Population of Buffers in x-direction if (bc_x%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 1, -1, sys_size, pb_in, mv_in) @@ -87,7 +107,7 @@ contains do l = 0, p do k = 0, n select case (int(bc_type(1, 1)%sf(0, k, l))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) ! Ghost-cell extrap. BC at end call s_ghost_cell_extrapolation(q_prim_vf, 1, -1, k, l) case (BC_REFLECTIVE) call s_symmetry(q_prim_vf, 1, -1, k, l, pb_in, mv_in) @@ -268,10 +288,11 @@ contains $:END_GPU_PARALLEL_LOOP() end if #:endif + ! END: Population of Buffers in z-direction end subroutine s_populate_variables_buffers - !> Fill ghost cells by copying the nearest boundary cell value along the specified direction. + !> @brief Fills ghost cells by copying the nearest boundary cell value along the specified direction. 
subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True) @@ -281,13 +302,13 @@ contains integer :: j, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then ! bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(-j, k, l) = q_prim_vf(i)%sf(0, k, l) end do end do - else !< bc_x%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(m + j, k, l) = q_prim_vf(i)%sf(m, k, l) @@ -295,13 +316,13 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, -j, l) = q_prim_vf(i)%sf(k, 0, l) end do end do - else !< bc_y%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, n + j, l) = q_prim_vf(i)%sf(k, n, l) @@ -315,7 +336,7 @@ contains q_prim_vf(i)%sf(k, l, -j) = q_prim_vf(i)%sf(k, l, 0) end do end do - else !< bc_z%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, l, p + j) = q_prim_vf(i)%sf(k, l, p) @@ -326,7 +347,8 @@ contains end subroutine s_ghost_cell_extrapolation - !> Apply reflective (symmetry) boundary conditions by mirroring primitive variables and flipping the normal velocity component. + !> @brief Applies reflective (symmetry) boundary conditions by mirroring primitive variables and flipping the normal velocity + !! component. 
subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in) $:GPU_ROUTINE(parallelism='[seq]') @@ -337,7 +359,7 @@ contains integer :: j, q, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then !< bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do j = 1, buff_size do i = 1, contxe q_prim_vf(i)%sf(-j, k, l) = q_prim_vf(i)%sf(j - 1, k, l) @@ -371,7 +393,7 @@ contains end do end do end if - else !< bc_x%end + else do j = 1, buff_size do i = 1, contxe q_prim_vf(i)%sf(m + j, k, l) = q_prim_vf(i)%sf(m - (j - 1), k, l) @@ -406,7 +428,7 @@ contains end if end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do j = 1, buff_size do i = 1, momxb q_prim_vf(i)%sf(k, -j, l) = q_prim_vf(i)%sf(k, j - 1, l) @@ -440,7 +462,7 @@ contains end do end do end if - else !< bc_y%end + else do j = 1, buff_size do i = 1, momxb q_prim_vf(i)%sf(k, n + j, l) = q_prim_vf(i)%sf(k, n - (j - 1), l) @@ -510,7 +532,7 @@ contains end do end do end if - else !< bc_z%end + else do j = 1, buff_size do i = 1, momxb + 1 q_prim_vf(i)%sf(k, l, p + j) = q_prim_vf(i)%sf(k, l, p - (j - 1)) @@ -549,7 +571,7 @@ contains end subroutine s_symmetry - !> Apply periodic boundary conditions by copying values from the opposite domain boundary. + !> @brief Applies periodic boundary conditions by copying values from the opposite domain boundary. 
subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in) $:GPU_ROUTINE(parallelism='[seq]') @@ -560,7 +582,7 @@ contains integer :: j, q, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then !< bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(-j, k, l) = q_prim_vf(i)%sf(m - (j - 1), k, l) @@ -577,7 +599,7 @@ contains end do end do end if - else !< bc_x%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(m + j, k, l) = q_prim_vf(i)%sf(j - 1, k, l) @@ -596,7 +618,7 @@ contains end if end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, -j, l) = q_prim_vf(i)%sf(k, n - (j - 1), l) @@ -613,7 +635,7 @@ contains end do end do end if - else !< bc_y%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, n + j, l) = q_prim_vf(i)%sf(k, j - 1, l) @@ -649,7 +671,7 @@ contains end do end do end if - else !< bc_z%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, l, p + j) = q_prim_vf(i)%sf(k, l, j - 1) @@ -671,7 +693,8 @@ contains end subroutine s_periodic - !> Apply axis boundary conditions for cylindrical coordinates by reflecting values across the axis with azimuthal phase shift. + !> @brief Applies axis boundary conditions for cylindrical coordinates by reflecting values across the axis with azimuthal phase + !! shift. subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l) $:GPU_ROUTINE(parallelism='[seq]') @@ -721,7 +744,7 @@ contains end subroutine s_axis - !> Apply slip wall boundary conditions by extrapolating scalars and reflecting the wall-normal velocity component. + !> @brief Applies slip wall boundary conditions by extrapolating scalars and reflecting the wall-normal velocity component. 
subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', cray_inline=True) @@ -731,7 +754,7 @@ contains integer :: j, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then !< bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -741,7 +764,7 @@ contains end if end do end do - else !< bc_x%end + else do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -753,7 +776,7 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size if (i == momxb + 1) then @@ -763,7 +786,7 @@ contains end if end do end do - else !< bc_y%end + else do i = 1, sys_size do j = 1, buff_size if (i == momxb + 1) then @@ -785,7 +808,7 @@ contains end if end do end do - else !< bc_z%end + else do i = 1, sys_size do j = 1, buff_size if (i == momxe) then @@ -800,7 +823,7 @@ contains end subroutine s_slip_wall - !> Apply no-slip wall boundary conditions by reflecting and negating all velocity components at the wall. + !> @brief Applies no-slip wall boundary conditions by reflecting and negating all velocity components at the wall. 
subroutine s_no_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', cray_inline=True) @@ -811,7 +834,7 @@ contains integer :: j, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then !< bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -825,7 +848,7 @@ contains end if end do end do - else !< bc_x%end + else do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -841,7 +864,7 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -855,7 +878,7 @@ contains end if end do end do - else !< bc_y%end + else do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -885,7 +908,7 @@ contains end if end do end do - else !< bc_z%end + else do i = 1, sys_size do j = 1, buff_size if (i == momxb) then @@ -904,7 +927,7 @@ contains end subroutine s_no_slip_wall - !> Apply Dirichlet boundary conditions by prescribing ghost cell values from stored boundary buffers. + !> @brief Applies Dirichlet boundary conditions by prescribing ghost cell values from stored boundary buffers. subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', cray_inline=True) @@ -915,13 +938,13 @@ contains #ifdef MFC_SIMULATION if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then ! 
bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(-j, k, l) = bc_buffers(1, 1)%sf(i, k, l) end do end do - else !< bc_x%end + else do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(m + j, k, l) = bc_buffers(1, 2)%sf(i, k, l) @@ -930,13 +953,13 @@ contains end if else if (bc_dir == 2) then !< y-direction #:if not MFC_CASE_OPTIMIZATION or num_dims > 1 - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, -j, l) = bc_buffers(2, 1)%sf(k, i, l) end do end do - else !< bc_y%end + else !< bc_z%end do i = 1, sys_size do j = 1, buff_size q_prim_vf(i)%sf(k, n + j, l) = bc_buffers(2, 2)%sf(k, i, l) @@ -967,7 +990,7 @@ contains end subroutine s_dirichlet - !> Extrapolate QBMM bubble pressure and mass-vapor variables into ghost cells by copying boundary values. + !> @brief Extrapolates QBMM bubble pressure and mass-vapor variables into ghost cells by copying boundary values. subroutine s_qbmm_extrapolation(bc_dir, bc_loc, k, l, pb_in, mv_in) $:GPU_ROUTINE(parallelism='[seq]') @@ -977,7 +1000,7 @@ contains integer :: j, q, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then ! 
bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, nb do q = 1, nnode do j = 1, buff_size @@ -986,7 +1009,7 @@ contains end do end do end do - else !< bc_x%end + else do i = 1, nb do q = 1, nnode do j = 1, buff_size @@ -997,7 +1020,7 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, nb do q = 1, nnode do j = 1, buff_size @@ -1006,7 +1029,7 @@ contains end do end do end do - else !< bc_y%end + else do i = 1, nb do q = 1, nnode do j = 1, buff_size @@ -1026,7 +1049,7 @@ contains end do end do end do - else !< bc_z%end + else do i = 1, nb do q = 1, nnode do j = 1, buff_size @@ -1040,7 +1063,329 @@ contains end subroutine s_qbmm_extrapolation - !> Populate ghost cell buffers for the color function and its divergence used in capillary surface tension. + impure subroutine s_populate_beta_buffers(q_beta, bc_type, nvar) + + type(scalar_field), dimension(:), intent(inout) :: q_beta + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer, intent(in) :: nvar + integer :: k, l + + !> x-direction + + if (bc_x%beg >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 1, -1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l, k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + select case (bc_x%beg) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 1, -1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 1, -1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (bc_x%end >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 1, 1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l, k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + select case (bc_x%end) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 1, 1, k, l, nvar) + 
case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 1, 1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + !> y-direction + if (bc_y%beg >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 2, -1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l, k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_y%beg) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 2, -1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 2, -1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (bc_y%end >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 2, 1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l, k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_y%end) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 2, 1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 2, 1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (num_dims == 2) return + + #:if not MFC_CASE_OPTIMIZATION or num_dims > 2 + !> z-direction + if (bc_z%beg >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 3, -1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l, k]', collapse=2) + do l = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_type(3, 1)%sf(k, l, 0)) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 3, -1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 3, -1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (bc_z%end >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 3, 1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l, k]', collapse=2) + do l = 
beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_type(3, 2)%sf(k, l, 0)) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 3, 1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 3, 1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + #:endif + + end subroutine s_populate_beta_buffers + + subroutine s_beta_periodic(q_beta, bc_dir, bc_loc, k, l, nvar) + + $:GPU_ROUTINE(function_name='s_beta_periodic', parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(num_dims + 1), intent(inout) :: q_beta + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer, intent(in) :: nvar + integer :: j, i + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = -mapCells - 1, mapCells + q_beta(beta_vars(i))%sf(j, k, l) = q_beta(beta_vars(i))%sf(j, k, l) + q_beta(beta_vars(i))%sf(m + j + 1, & + & k, l) + end do + end do + else + do i = 1, nvar + do j = -mapcells, mapcells + 1 + q_beta(beta_vars(i))%sf(m + j, k, l) = q_beta(beta_vars(i))%sf(j - 1, k, l) + end do + end do + end if + else if (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = -mapcells - 1, mapcells + q_beta(beta_vars(i))%sf(k, j, l) = q_beta(beta_vars(i))%sf(k, j, l) + q_beta(beta_vars(i))%sf(k, & + & n + j + 1, l) + end do + end do + else + do i = 1, nvar + do j = -mapcells, mapcells + 1 + q_beta(beta_vars(i))%sf(k, n + j, l) = q_beta(beta_vars(i))%sf(k, j - 1, l) + end do + end do + end if + else if (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = -mapcells - 1, mapcells + q_beta(beta_vars(i))%sf(k, l, j) = q_beta(beta_vars(i))%sf(k, l, j) + q_beta(beta_vars(i))%sf(k, l, & + & p + j + 1) + end do + end do + else + do i = 1, nvar + do j = -mapcells, mapcells + 1 + q_beta(beta_vars(i))%sf(k, l, p + j) = 
q_beta(beta_vars(i))%sf(k, l, j - 1) + end do + end do + end if + end if + + end subroutine s_beta_periodic + + subroutine s_beta_extrapolation(q_beta, bc_dir, bc_loc, k, l, nvar) + + $:GPU_ROUTINE(function_name='s_beta_extrapolation', parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(num_dims + 1), intent(inout) :: q_beta + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer, intent(in) :: nvar + integer :: j, i + + ! Set beta in buffer regions equal to zero + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(-j, k, l) = 0._wp + end do + end do + else + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(m + j, k, l) = 0._wp + end do + end do + end if + else if (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, -j, l) = 0._wp + end do + end do + else + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, n + j, l) = 0._wp + end do + end do + end if + else if (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, l, -j) = 0._wp + end do + end do + else + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, l, p + j) = 0._wp + end do + end do + end if + end if + + end subroutine s_beta_extrapolation + + subroutine s_beta_reflective(q_beta, bc_dir, bc_loc, k, l, nvar) + + $:GPU_ROUTINE(function_name='s_beta_reflective', parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(num_dims + 1), intent(inout) :: q_beta + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer, intent(in) :: nvar + integer :: j, i + + ! Reflective BC for void fraction: 1) Fold ghost-cell contributions back onto their mirror interior cells 2) Set ghost cells + ! 
= mirror of (now-folded) interior values + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(j - 1, k, l) = q_beta(beta_vars(i))%sf(j - 1, k, l) + q_beta(beta_vars(i))%sf(-j, & + & k, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(-j, k, l) = q_beta(beta_vars(i))%sf(j - 1, k, l) + end do + end do + else + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(m - (j - 1), k, l) = q_beta(beta_vars(i))%sf(m - (j - 1), k, & + & l) + q_beta(beta_vars(i))%sf(m + j, k, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(m + j, k, l) = q_beta(beta_vars(i))%sf(m - (j - 1), k, l) + end do + end do + end if + else if (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, j - 1, l) = q_beta(beta_vars(i))%sf(k, j - 1, l) + q_beta(beta_vars(i))%sf(k, & + & -j, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, -j, l) = q_beta(beta_vars(i))%sf(k, j - 1, l) + end do + end do + else + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, n - (j - 1), l) = q_beta(beta_vars(i))%sf(k, n - (j - 1), & + & l) + q_beta(beta_vars(i))%sf(k, n + j, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, n + j, l) = q_beta(beta_vars(i))%sf(k, n - (j - 1), l) + end do + end do + end if + else if (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, j - 1) = q_beta(beta_vars(i))%sf(k, l, j - 1) + q_beta(beta_vars(i))%sf(k, & + & l, -j) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, -j) = q_beta(beta_vars(i))%sf(k, l, j - 1) + end do + end do + else + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, p - (j - 1)) = q_beta(beta_vars(i))%sf(k, l, & + & p - (j - 1)) + q_beta(beta_vars(i))%sf(k, l, p + j) + end do 
+ do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, p + j) = q_beta(beta_vars(i))%sf(k, l, p - (j - 1)) + end do + end do + end if + end if + + end subroutine s_beta_reflective + + !> @brief Populates ghost cell buffers for the color function and its divergence used in capillary surface tension. impure subroutine s_populate_capillary_buffers(c_divs, bc_type) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs @@ -1175,7 +1520,7 @@ contains end subroutine s_populate_capillary_buffers - !> Apply periodic boundary conditions to the color function and its divergence fields. + !> @brief Applies periodic boundary conditions to the color function and its divergence fields. subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_periodic', parallelism='[seq]', cray_inline=True) @@ -1185,13 +1530,13 @@ contains integer :: j, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then ! bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(-j, k, l) = c_divs(i)%sf(m - (j - 1), k, l) end do end do - else !< bc_x%end + else do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(m + j, k, l) = c_divs(i)%sf(j - 1, k, l) @@ -1199,13 +1544,13 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(k, -j, l) = c_divs(i)%sf(k, n - (j - 1), l) end do end do - else !< bc_y%end + else do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(k, n + j, l) = c_divs(i)%sf(k, j - 1, l) @@ -1219,7 +1564,7 @@ contains c_divs(i)%sf(k, l, -j) = c_divs(i)%sf(k, l, p - (j - 1)) end do end do - else !< bc_z%end + else do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(k, l, p + j) = c_divs(i)%sf(k, l, j - 1) @@ -1230,7 +1575,7 @@ contains end subroutine s_color_function_periodic - !> Apply reflective boundary conditions to 
the color function and its divergence fields. + !> @brief Applies reflective boundary conditions to the color function and its divergence fields. subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_reflective', parallelism='[seq]', cray_inline=True) @@ -1240,7 +1585,7 @@ contains integer :: j, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then ! bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, num_dims + 1 do j = 1, buff_size if (i == bc_dir) then @@ -1250,7 +1595,7 @@ contains end if end do end do - else !< bc_x%end + else do i = 1, num_dims + 1 do j = 1, buff_size if (i == bc_dir) then @@ -1262,7 +1607,7 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, num_dims + 1 do j = 1, buff_size if (i == bc_dir) then @@ -1272,7 +1617,7 @@ contains end if end do end do - else !< bc_y%end + else do i = 1, num_dims + 1 do j = 1, buff_size if (i == bc_dir) then @@ -1294,7 +1639,7 @@ contains end if end do end do - else !< bc_z%end + else do i = 1, num_dims + 1 do j = 1, buff_size if (i == bc_dir) then @@ -1309,7 +1654,7 @@ contains end subroutine s_color_function_reflective - !> Extrapolate the color function and its divergence into ghost cells by copying boundary values. + !> @brief Extrapolates the color function and its divergence into ghost cells by copying boundary values. subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', parallelism='[seq]', cray_inline=True) @@ -1319,13 +1664,13 @@ contains integer :: j, i if (bc_dir == 1) then !< x-direction - if (bc_loc == -1) then ! 
bc_x%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(-j, k, l) = c_divs(i)%sf(0, k, l) end do end do - else !< bc_x%end + else do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(m + j, k, l) = c_divs(i)%sf(m, k, l) @@ -1333,13 +1678,13 @@ contains end do end if else if (bc_dir == 2) then !< y-direction - if (bc_loc == -1) then !< bc_y%beg + if (bc_loc == -1) then !< bc_z%beg do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(k, -j, l) = c_divs(i)%sf(k, 0, l) end do end do - else !< bc_y%end + else do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(k, n + j, l) = c_divs(i)%sf(k, n, l) @@ -1353,7 +1698,7 @@ contains c_divs(i)%sf(k, l, -j) = c_divs(i)%sf(k, l, 0) end do end do - else !< bc_z%end + else do i = 1, num_dims + 1 do j = 1, buff_size c_divs(i)%sf(k, l, p + j) = c_divs(i)%sf(k, l, p) @@ -1364,7 +1709,7 @@ contains end subroutine s_color_function_ghost_cell_extrapolation - !> Populate ghost cell buffers for the Jacobian scalar field used in the IGR elliptic solver. + !> @brief Populates ghost cell buffers for the Jacobian scalar field used in the IGR elliptic solver. impure subroutine s_populate_F_igr_buffers(bc_type, jac_sf) type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type @@ -1531,7 +1876,7 @@ contains end subroutine s_populate_F_igr_buffers - !> Create MPI derived datatypes for boundary condition type arrays and buffer arrays used in parallel I/O. + !> @brief Creates MPI derived datatypes for boundary condition type arrays and buffer arrays used in parallel I/O. impure subroutine s_create_mpi_types(bc_type) type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type @@ -1566,7 +1911,7 @@ contains end subroutine s_create_mpi_types - !> Write boundary condition type and buffer data to serial (unformatted) restart files. + !> @brief Writes boundary condition type and buffer data to serial (unformatted) restart files. 
subroutine s_write_serial_boundary_condition_files(q_prim_vf, bc_type, step_dirpath, old_grid_in) type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf @@ -1605,7 +1950,7 @@ contains end subroutine s_write_serial_boundary_condition_files - !> Write boundary condition type and buffer data to per-rank parallel files using MPI I/O. + !> @brief Writes boundary condition type and buffer data to per-rank parallel files using MPI I/O. subroutine s_write_parallel_boundary_condition_files(q_prim_vf, bc_type) type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf @@ -1670,7 +2015,7 @@ contains end subroutine s_write_parallel_boundary_condition_files - !> Read boundary condition type and buffer data from serial (unformatted) restart files. + !> @brief Reads boundary condition type and buffer data from serial (unformatted) restart files. subroutine s_read_serial_boundary_condition_files(step_dirpath, bc_type) character(LEN=*), intent(in) :: step_dirpath @@ -1715,7 +2060,7 @@ contains end subroutine s_read_serial_boundary_condition_files - !> Read boundary condition type and buffer data from per-rank parallel files using MPI I/O. + !> @brief Reads boundary condition type and buffer data from per-rank parallel files using MPI I/O. subroutine s_read_parallel_boundary_condition_files(bc_type) type(integer_field), dimension(1:num_dims,1:2), intent(inout) :: bc_type @@ -1780,7 +2125,7 @@ contains end subroutine s_read_parallel_boundary_condition_files - !> Pack primitive variable boundary slices into bc_buffers arrays for serialization. + !> @brief Packs primitive variable boundary slices into bc_buffers arrays for serialization. subroutine s_pack_boundary_condition_buffers(q_prim_vf) type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf @@ -1823,7 +2168,7 @@ contains end subroutine s_pack_boundary_condition_buffers - !> Initialize the per-cell boundary condition type arrays with the global default BC values. 
+ !> @brief Initializes the per-cell boundary condition type arrays with the global default BC values. subroutine s_assign_default_bc_type(bc_type) type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type @@ -1849,8 +2194,8 @@ contains end subroutine s_assign_default_bc_type - !> Populate the buffers of the grid variables, which are constituted of the cell-boundary locations and cell-width - !! distributions, based on the boundary conditions. + !> The purpose of this subroutine is to populate the buffers of the grid variables, which are constituted of the cell- boundary + !! locations and cell-width distributions, based on the boundary conditions. subroutine s_populate_grid_variables_buffers integer :: i @@ -1858,10 +2203,33 @@ contains #ifdef MFC_SIMULATION ! Required for compatibility between codes type(int_bounds_info) :: offset_x, offset_y, offset_z - offset_x%beg = buff_size; offset_x%end = buff_size offset_y%beg = buff_size; offset_y%end = buff_size offset_z%beg = buff_size; offset_z%end = buff_size + +#ifdef MFC_MPI + ! Populate global domain boundaries with stretched grids + call s_mpi_allreduce_min(x_cb(-1), glb_bounds(1)%beg) + call s_mpi_allreduce_max(x_cb(m), glb_bounds(1)%end) + + if (n > 0) then + call s_mpi_allreduce_min(y_cb(-1), glb_bounds(2)%beg) + call s_mpi_allreduce_max(y_cb(n), glb_bounds(2)%end) + if (p > 0) then + call s_mpi_allreduce_min(z_cb(-1), glb_bounds(3)%beg) + call s_mpi_allreduce_max(z_cb(p), glb_bounds(3)%end) + end if + end if +#else + glb_bounds(1)%beg = x_cb(-1); glb_bounds(1)%end = x_cb(m) + if (n > 0) then + glb_bounds(2)%beg = y_cb(-1); glb_bounds(2)%end = y_cb(n) + if (p > 0) then + glb_bounds(3)%beg = z_cb(-1); glb_bounds(3)%end = z_cb(p) + end if + end if +#endif + $:GPU_UPDATE(device='[glb_bounds]') #endif #ifndef MFC_PRE_PROCESS @@ -1918,6 +2286,7 @@ contains do i = 1, buff_size x_cc(m + i) = x_cc(m + (i - 1)) + (dx(m + (i - 1)) + dx(m + i))/2._wp end do + ! END: Population of Buffers in x-direction ! 
Population of Buffers in y-direction @@ -1974,6 +2343,7 @@ contains do i = 1, buff_size y_cc(n + i) = y_cc(n + (i - 1)) + (dy(n + (i - 1)) + dy(n + i))/2._wp end do + ! END: Population of Buffers in y-direction ! Population of Buffers in z-direction @@ -2030,11 +2400,12 @@ contains do i = 1, buff_size z_cc(p + i) = z_cc(p + (i - 1)) + (dz(p + (i - 1)) + dz(p + i))/2._wp end do + ! END: Population of Buffers in z-direction #endif end subroutine s_populate_grid_variables_buffers - !> Deallocate boundary condition buffer arrays allocated during module initialization. + !> @brief Deallocates boundary condition buffer arrays allocated during module initialization. subroutine s_finalize_boundary_common_module() if (bc_io) then diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index 8e01059691..3f9d136dce 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -42,13 +42,16 @@ module m_constants real(wp), parameter :: broadband_spectral_level_constant = 20._wp !> The spectral level constant to correct the magnitude at each frequency to ensure the source is overall broadband real(wp), parameter :: broadband_spectral_level_growth_rate = 10._wp + ! Reconstruction Types integer, parameter :: WENO_TYPE = 1 !< Using WENO for reconstruction type integer, parameter :: MUSCL_TYPE = 2 !< Using MUSCL for reconstruction type + ! Interface Compression real(wp), parameter :: dflt_ic_eps = 1e-4_wp !< Ensure compression is only applied to surface cells in THINC real(wp), parameter :: dflt_ic_beta = 1.6_wp !< Sharpness parameter's default value used in THINC real(wp), parameter :: moncon_cutoff = 1e-8_wp !< Monotonicity constraint's limiter to prevent extremas in THINC + ! 
Chemistry real(wp), parameter :: dflt_T_guess = 1200._wp !< Default guess for temperature (when a previous value is not available) @@ -58,14 +61,16 @@ module m_constants real(wp), parameter :: threshold_vector_zero = 1.e-10_wp !< Threshold to treat the component of a vector to be zero real(wp), parameter :: threshold_edge_zero = 1.e-10_wp !< Threshold to treat two edges to be overlapped real(wp), parameter :: initial_distance_buffer = 1.e12_wp !< Initialized levelset distance for the shortest path pair algorithm + ! Lagrange bubbles constants - integer, parameter :: mapCells = 3 !< Number of cells around the bubble where the smoothening function will have effect + integer, parameter :: mapCells = 3 !< Number of cells around the bubble/particle where the smoothening function will have effect real(wp), parameter :: R_uni = 8314._wp !< Universal gas constant - J/kmol/K - integer, parameter :: lag_io_vars = 21 !< Number of variables per particle for MPI_IO + integer, parameter :: lag_io_vars = 21 !< Number of variables per particle for MPI_IO ! Strang Splitting constants real(wp), parameter :: dflt_adap_dt_tol = 1.e-4_wp !< Default tolerance for adaptive step size integer, parameter :: dflt_adap_dt_max_iters = 100 !< Default max iteration for adaptive step size + ! Constants of the algorithm described by Heirer, E. Hairer, S. P.Norsett, G. Wanner, Solving Ordinary Differential Equations I, ! Chapter II.4 to choose the initial time step size for the adaptive time stepping routine real(wp), parameter :: threshold_first_guess = 1.e-5_wp !< Threshold for initial step size estimate @@ -75,10 +80,9 @@ module m_constants real(wp), parameter :: small_guess = 1.e-6_wp !< Minimum initial step size ! Relativity - !> Max Newton-Raphson iterations for relativistic primitive recovery integer, parameter :: relativity_cons_to_prim_max_iter = 100 - ! Linear congruential pseudo-random number generator parameters + ! 
Pseudo-random number generator integer, parameter :: modulus = 2**30 - 1 !< PRNG modulus integer, parameter :: multiplier = 1664525 !< PRNG multiplier integer, parameter :: increment = 1013904223 !< PRNG increment diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 0c00de90d7..d7dae2840e 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -187,9 +187,15 @@ module m_derived_types type ic_patch_parameters integer :: geometry !< Type of geometry for the patch + + !> Location of the geometric center, i.e. the centroid, of the patch. It is specified through its x-, y- and z-coordinates, + !! respectively. real(wp) :: x_centroid, y_centroid, z_centroid !< Geometric center coordinates of the patch - real(wp) :: length_x, length_y, length_z !< Dimensions of the patch. x,y,z Lengths. - real(wp) :: radius !< Dimensions of the patch. radius. + real(wp) :: length_x, length_y, length_z !< Dimensions of the patch. x,y,z Lengths. + real(wp) :: radius !< Dimensions of the patch. radius. + + !> Vector indicating the various radii for the elliptical and ellipsoidal patch geometries. It is specified through its x-, + !! y-, and z-components respectively. real(wp), dimension(3) :: radii !< Elliptical/ellipsoidal patch radii in x, y, z real(wp) :: epsilon, beta !< The isentropic vortex parameters for the amplitude of the disturbance and domain of influence. real(wp), dimension(2:9) :: a !< Used by hardcoded IC and as temporary variables. @@ -200,53 +206,68 @@ module m_derived_types logical :: modal_clip_r_to_min !< When true, clip boundary radius: R(theta) = max(R(theta), modal_r_min) (Non-exp form only) real(wp) :: modal_r_min !< Minimum boundary radius when modal_clip_r_to_min is true (Non-exp form only) logical :: modal_use_exp_form !< When true, boundary = radius*exp(Fourier series) + ! 
Geometry 14 (3D spherical harmonic): sph_har_coeff(l,m) for real Y_lm real(wp), dimension(0:max_sph_harm_degree,-max_sph_harm_degree:max_sph_harm_degree) :: sph_har_coeff + + !> Normal vector indicating the orientation of the patch. It is specified through its x-, y- and z-components, respectively. real(wp), dimension(3) :: normal !< Patch orientation normal vector (x, y, z) + + !> List of permissions that indicate to the current patch which preceding patches it is allowed to overwrite when it is in + !! process of being laid out in the domain logical, dimension(0:num_patches_max - 1) :: alter_patch !< Overwrite permissions for preceding patches - logical :: smoothen !< Whether patch boundaries are smoothed across cells + + !> Permission indicating to the current patch whether its boundaries will be smoothed out across a few cells or whether they + !! are to remain sharp + logical :: smoothen !< Whether patch boundaries are smoothed across cells integer :: smooth_patch_id !< Identity (id) of the patch with which current patch is to get smoothed - real(wp) :: smooth_coeff !< Smoothing stencil size coefficient + + !> Smoothing coefficient (coeff) for the size of the stencil of cells across which boundaries of the current patch will be + !! 
smeared out + real(wp) :: smooth_coeff real(wp), dimension(num_fluids_max) :: alpha_rho - real(wp) :: rho - real(wp), dimension(3) :: vel - real(wp) :: pres + real(wp) :: rho + real(wp), dimension(3) :: vel + real(wp) :: pres real(wp), dimension(num_fluids_max) :: alpha - real(wp) :: gamma - real(wp) :: pi_inf - real(wp) :: cv - real(wp) :: qv - real(wp) :: qvp !< Reference entropy per unit mass (SGEOS) - real(wp) :: Bx, By, Bz !< Magnetic field components; B%x is not used for 1D - real(wp), dimension(6) :: tau_e !< Elastic stresses added to primitive variables if hypoelasticity = True - real(wp) :: R0 !< Bubble size - real(wp) :: V0 !< Bubble velocity - real(wp) :: p0 !< Bubble size - real(wp) :: m0 !< Bubble velocity - integer :: hcid !< Hardcoded initial condition ID - real(wp) :: cf_val !< Color function value - real(wp) :: Y(1:num_species) !< Species mass fractions + real(wp) :: gamma + real(wp) :: pi_inf + real(wp) :: cv + real(wp) :: qv + !> Primitive variables associated with the patch. In order, these include the partial densities, density, velocity, + !! pressure, volume fractions, specific heat ratio function and the liquid stiffness function. + real(wp) :: qvp !< reference entropy per unit mass for SGEOS, q' (see Le Metayer (2004)) + real(wp) :: Bx, By, Bz !< Magnetic field components; B%x is not used for 1D + real(wp), dimension(6) :: tau_e !< Elastic stresses added to primitive variables if hypoelasticity = True + real(wp) :: R0 !< Bubble size + real(wp) :: V0 !< Bubble velocity + real(wp) :: p0 !< Bubble size + real(wp) :: m0 !< Bubble velocity + integer :: hcid + !! id for hard coded initial condition - ! STL or OBJ model input parameter - character(LEN=pathlen_max) :: model_filepath !< Path the STL file relative to case_dir. - real(wp), dimension(1:3) :: model_translate !< Translation of the STL object. - real(wp), dimension(1:3) :: model_scale !< Scale factor for the STL object. 
- !> Angle to rotate the STL object along each cartesian coordinate axis, in radians. + real(wp) :: cf_val !< Color function value + real(wp) :: Y(1:num_species) !< Species mass fractions STL or OBJ model input parameter + character(LEN=pathlen_max) :: model_filepath !< Path the STL file relative to case_dir. + real(wp), dimension(1:3) :: model_translate !< Translation of the STL object. + real(wp), dimension(1:3) :: model_scale !< Scale factor for the STL object. real(wp), dimension(1:3) :: model_rotate - integer :: model_spc !< Number of samples per cell to use when discretizing the STL object. - real(wp) :: model_threshold !< Threshold to turn on smoothen STL patch. + integer :: model_spc !< Number of samples per cell to use when discretizing the STL object. + real(wp) :: model_threshold !< Threshold to turn on smoothen STL patch. Patch conditions for moving imersed boundaries end type ic_patch_parameters type ib_patch_parameters - integer :: geometry !< Type of geometry for the patch + integer :: geometry !< Type of geometry for the patch + + !> Location of the geometric center, i.e. the centroid, of the patch. It is specified through its x-, y- and z-coordinates, + !! respectively. real(wp) :: x_centroid, y_centroid, z_centroid !< Geometric center coordinates of the patch !> Centroid locations of intermediate steps in the time_stepper module - real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid - real(wp), dimension(1:3) :: centroid_offset !< offset of center of mass from computed cell center for odd-shaped IBs - real(wp), dimension(1:3) :: angles - real(wp), dimension(1:3) :: step_angles - !> matrix that converts from IB reference frame to fluid reference frame + real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid + real(wp), dimension(1:3) :: centroid_offset ! 
offset of center of mass from computed cell center for odd-shaped IBs + real(wp), dimension(1:3) :: angles + real(wp), dimension(1:3) :: step_angles real(wp), dimension(1:3,1:3) :: rotation_matrix !> matrix that converts from fluid reference frame to IB reference frame real(wp), dimension(1:3,1:3) :: rotation_matrix_inverse @@ -256,11 +277,10 @@ module m_derived_types real(wp) :: theta logical :: slip - ! STL or OBJ model input parameter - character(LEN=pathlen_max) :: model_filepath !< Path the STL file relative to case_dir. - real(wp), dimension(1:3) :: model_translate !< Translation of the STL object. - real(wp), dimension(1:3) :: model_scale !< Scale factor for the STL object. - !> Angle to rotate the STL object along each cartesian coordinate axis, in radians. + !! STL or OBJ model input parameter + character(LEN=pathlen_max) :: model_filepath !< Path the STL file relative to case_dir. + real(wp), dimension(1:3) :: model_translate !< Translation of the STL object. + real(wp), dimension(1:3) :: model_scale !< Scale factor for the STL object. real(wp), dimension(1:3) :: model_rotate integer :: model_spc !< Number of samples per cell to use when discretizing the STL object. real(wp) :: model_threshold !< Threshold to turn on smoothen STL patch. 
Patch conditions for moving imersed boundaries @@ -309,6 +329,12 @@ module m_derived_types real(wp) :: R_g !< gas constant of gas (bubble) end type subgrid_bubble_physical_parameters + !> Derived type annexing the physical parameters required for sub-grid particle models + type subgrid_particle_physical_parameters + real(wp) :: rho0ref_particle !< reference density + real(wp) :: cp_particle !< solid particle specific heat + end type subgrid_particle_physical_parameters + type mpi_io_airfoil_ib_var integer, dimension(2) :: view type(vec3_dt), allocatable, dimension(:) :: var @@ -396,18 +422,31 @@ module m_derived_types !> Lagrangian bubble parameters type bubbles_lagrange_parameters - integer :: solver_approach !< 1: One-way coupling, 2: two-way coupling - integer :: cluster_type !< Cluster model to find p_inf - logical :: pressure_corrector !< Cell pressure correction term - integer :: smooth_type !< Smoothing function. 1: Gaussian, 2:Delta 3x3 - logical :: heatTransfer_model !< Activate HEAT transfer model at the bubble-liquid interface - logical :: massTransfer_model !< Activate MASS transfer model at the bubble-liquid interface - logical :: write_bubbles !< Write files to track the bubble evolution each time step - logical :: write_bubbles_stats !< Write the maximum and minimum radius of each bubble - integer :: nBubs_glb !< Global number of bubbles - real(wp) :: epsilonb !< Standard deviation scaling for the gaussian function - real(wp) :: charwidth !< Domain virtual depth (z direction, for 2D simulations) - real(wp) :: valmaxvoid !< Maximum void fraction permitted + integer :: solver_approach !< 1: One-way coupling, 2: two-way coupling + integer :: cluster_type !< Cluster model to find p_inf + logical :: pressure_corrector !< Cell pressure correction term + integer :: smooth_type !< Smoothing function. 
1: Gaussian, 2:Delta 3x3 + logical :: heatTransfer_model !< Activate HEAT transfer model at the bubble-liquid interface + logical :: massTransfer_model !< Activate MASS transfer model at the bubble-liquid interface + logical :: write_void_evol !< Write files to track evolution of void fraction at each time step + logical :: write_bubbles !< Write files to track the bubble evolution each time step + logical :: write_bubbles_stats !< Write the maximum and minimum radius of each bubble + integer :: nBubs_glb !< Global number of bubbles + integer :: nParticles_glb !< Global number of particles + integer :: vel_model !< Particle velocity model + integer :: drag_model !< Particle drag model + logical :: pressure_force !< Include pressure force translational motion + logical :: gravity_force !< Include gravity force in translational motion + integer :: qs_drag_model !< Particle QS drag model + integer :: stokes_drag !< Particle stokes drag + integer :: added_mass_model !< Particle added mass model + integer :: interpolation_order !< Fluid-to-Particle barycentric interpolation order + logical :: collision_force !< Include collision forces + character(LEN=pathlen_max) :: input_path !< Path to lag_bubbles.dat + real(wp) :: epsilonb !< Standard deviation scaling for the gaussian function + real(wp) :: charwidth !< Domain virtual depth (z direction, for 2D simulations) + integer :: charNz !< Number of grid cells in characteristic depth + real(wp) :: valmaxvoid !< Maximum void fraction permitted end type bubbles_lagrange_parameters !> Max and min number of cells in a direction of each combination of x-,y-, and z- diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp index 4074530a8f..7741b9bc19 100644 --- a/src/common/m_helper.fpp +++ b/src/common/m_helper.fpp @@ -18,11 +18,15 @@ module m_helper public :: s_comp_n_from_prim, s_comp_n_from_cons, s_initialize_bubbles_model, s_initialize_nonpoly, s_simpson, s_transcoeff, & & s_int_to_str, s_transform_vec, 
s_transform_triangle, s_transform_model, s_swap, f_cross, f_create_transform_matrix, & & f_create_bbox, s_print_2D_array, f_xor, f_logical_to_int, associated_legendre, real_ylm, double_factorial, factorial, & - & f_cut_on, f_cut_off, s_downsample_data, s_upsample_data + & f_cut_on, f_cut_off, s_downsample_data, s_upsample_data, s_initialize_particles_model contains !> Computes the bubble number density n from the primitive variables + !! @param vftmp is the void fraction + !! @param Rtmp is the bubble radii + !! @param ntmp is the output number bubble density + !! @param weights is the quadrature weights subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights) $:GPU_ROUTINE(parallelism='[seq]') @@ -37,7 +41,7 @@ contains end subroutine s_comp_n_from_prim - !> Compute the bubble number density from the conservative void fraction and weighted bubble radii. + !> @brief Computes the bubble number density from the conservative void fraction and weighted bubble radii. subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights) $:GPU_ROUTINE(parallelism='[seq]') @@ -52,7 +56,7 @@ contains end subroutine s_comp_n_from_cons - !> Print a 2D real array to standard output, optionally dividing each element by a given scalar. + !> @brief Prints a 2D real array to standard output, optionally dividing each element by a given scalar. impure subroutine s_print_2D_array(A, div) real(wp), dimension(:,:), intent(in) :: A @@ -82,7 +86,14 @@ contains end subroutine s_print_2D_array - !> Initialize bubble model arrays for Euler or Lagrangian bubbles with polytropic or non-polytropic gas. + impure subroutine s_initialize_particles_model() + + rho0ref_particle = particle_pp%rho0ref_particle + cp_particle = particle_pp%cp_particle + + end subroutine s_initialize_particles_model + + !> bubbles_euler + polytropic bubbles_euler + non-polytropic bubbles_lagrange + non-polytropic impure subroutine s_initialize_bubbles_model() ! 
Allocate memory @@ -112,7 +123,7 @@ contains end subroutine s_initialize_bubbles_model - !> Set bubble physical parameters and nondimensional numbers from the input configuration. + !> impure subroutine s_initialize_bubble_vars() R0ref = bub_pp%R0ref; p0ref = bub_pp%p0ref @@ -128,6 +139,7 @@ contains R_v = bub_pp%R_v; R_g = bub_pp%R_g Tw = bub_pp%T0ref end if + if (bubbles_lagrange) then cp_v = bub_pp%cp_v; cp_g = bub_pp%cp_g k_vl = bub_pp%k_v; k_gl = bub_pp%k_g @@ -171,18 +183,19 @@ contains integer :: ir real(wp), dimension(nb) :: chi_vw0, cp_m0, k_m0, rho_m0, x_vw, omegaN, rhol0 real(wp), parameter :: k_poly = 1._wp !< polytropic index used to compute isothermal natural frequency - ! Chapman-Enskog transport coefficients for vapor-gas mixture, Ando JAS (2010) + + ! phi_vg & phi_gv (phi_gg = phi_vv = 1) (Eq. 2.22 in Ando 2010) phi_vg = (1._wp + sqrt(mu_v/mu_g)*(M_g/M_v)**(0.25_wp))**2/(sqrt(8._wp)*sqrt(1._wp + M_v/M_g)) phi_gv = (1._wp + sqrt(mu_g/mu_v)*(M_v/M_g)**(0.25_wp))**2/(sqrt(8._wp)*sqrt(1._wp + M_g/M_v)) - ! Initial internal bubble pressure (Euler number + Laplace pressure) + ! internal bubble pressure pb0 = Eu + 2._wp/Web/R0 - ! Vapor mass fraction at bubble wall, Ando JAS (2010) + ! mass fraction of vapor (Eq. 2.19 in Ando 2010) chi_vw0 = 1._wp/(1._wp + R_v/R_g*(pb0/pv - 1._wp)) - ! Mixture specific heat from mass-weighted vapor/gas contributions + ! specific heat for gas/vapor mixture cp_m0 = chi_vw0*R_v*gam_v/(gam_v - 1._wp) + (1._wp - chi_vw0)*R_g*gam_g/(gam_g - 1._wp) ! mole fraction of vapor (Eq. 2.23 in Ando 2010) @@ -203,7 +216,7 @@ contains ! Peclet numbers Pe_T(:) = rho_m0*cp_m0(:)/k_m0(:) - ! Bubble natural frequency, Ando JAS (2010) + ! natural frequencies (Eq. 
B.1) omegaN(:) = sqrt(3._wp*k_poly*Ca + 2._wp*(3._wp*k_poly - 1._wp)/(Web*R0))/R0/sqrt(rho0ref) do ir = 1, nb call s_transcoeff(omegaN(ir)*R0(ir), Pe_T(ir)*R0(ir), Re_trans_T(ir), Im_trans_T(ir)) @@ -214,6 +227,10 @@ contains end subroutine s_initialize_nonpoly !> Computes the transfer coefficient for the non-polytropic bubble compression process + !! @param omega natural frequencies + !! @param peclet Peclet number + !! @param Re_trans Real part of the transport coefficients + !! @param Im_trans Imaginary part of the transport coefficients elemental subroutine s_transcoeff(omega, peclet, Re_trans, Im_trans) real(wp), intent(in) :: omega, peclet @@ -232,7 +249,7 @@ contains end subroutine s_transcoeff - !> Convert an integer to its trimmed string representation. + !> @brief Converts an integer to its trimmed string representation. elemental subroutine s_int_to_str(i, res) integer, intent(in) :: i @@ -283,7 +300,10 @@ contains end subroutine s_simpson - !> Compute the cross product of two vectors. + !> This procedure computes the cross product of two vectors. + !! @param a First vector. + !! @param b Second vector. + !! @return The cross product of the two vectors. pure function f_cross(a, b) result(c) $:GPU_ROUTINE(parallelism='[seq]') @@ -297,7 +317,9 @@ contains end function f_cross - !> Swap two real numbers. + !> This procedure swaps two real numbers. + !! @param lhs Left-hand side. + !! @param rhs Right-hand side. elemental subroutine s_swap(lhs, rhs) real(wp), intent(inout) :: lhs, rhs @@ -309,7 +331,10 @@ contains end subroutine s_swap - !> Create a transformation matrix. + !> This procedure creates a transformation matrix. + !! @param param Parameters for the transformation. + !! @param center Optional center point for the transformation. + !! @return Transformation matrix. 
function f_create_transform_matrix(param, center) result(out_matrix) type(ic_model_parameters), intent(in) :: param @@ -347,7 +372,9 @@ contains end function f_create_transform_matrix - !> Transform a vector by a matrix. + !> This procedure transforms a vector by a matrix. + !! @param vec Vector to transform. + !! @param matrix Transformation matrix. subroutine s_transform_vec(vec, matrix) real(wp), dimension(1:3), intent(inout) :: vec @@ -359,7 +386,10 @@ contains end subroutine s_transform_vec - !> Transform a triangle by a matrix, one vertex at a time. + !> This procedure transforms a triangle by a matrix, one vertex at a time. + !! @param triangle Triangle to transform. + !! @param matrix Transformation matrix. + !! @param matrix_n Normal transformation matrix. subroutine s_transform_triangle(triangle, matrix, matrix_n) type(t_triangle), intent(inout) :: triangle @@ -374,7 +404,10 @@ contains end subroutine s_transform_triangle - !> Transform a model by a matrix, one triangle at a time. + !> This procedure transforms a model by a matrix, one triangle at a time. + !! @param model Model to transform. + !! @param matrix Transformation matrix. + !! @param matrix_n Normal transformation matrix. subroutine s_transform_model(model, matrix, matrix_n) type(t_model), intent(inout) :: model @@ -387,7 +420,9 @@ contains end subroutine s_transform_model - !> Create a bounding box for a model. + !> This procedure creates a bounding box for a model. + !! @param model Model to create bounding box for. + !! @return Bounding box. function f_create_bbox(model) result(bbox) type(t_model), intent(in) :: model @@ -412,7 +447,10 @@ contains end function f_create_bbox - !> Perform XOR on lhs and rhs. + !> This procedure performs xor on lhs and rhs. + !! @param lhs logical input. + !! @param rhs other logical input. + !! @return xored result. 
elemental function f_xor(lhs, rhs) result(res) logical, intent(in) :: lhs, rhs @@ -422,7 +460,9 @@ contains end function f_xor - !> Convert a logical to 1 or 0. + !> This procedure converts logical to 1 or 0. + !! @param predicate A Logical argument. + !! @return 1 if .true., 0 if .false.. elemental function f_logical_to_int(predicate) result(int) logical, intent(in) :: predicate @@ -468,6 +508,7 @@ contains !! @param x argument (typically cos(theta)), should be in [-1,1] !! @param l degree (>= 0) !! @param m_order order (0 <= m_order <= l) + !! @return result_P P_l^m(x) recursive function associated_legendre(x, l, m_order) result(result_P) integer, intent(in) :: l, m_order @@ -502,7 +543,9 @@ contains end function associated_legendre - !> Calculate the double factorial of an integer + !> This function calculates the double factorial value of an integer + !! @param n_in is the input integer + !! @return R is the double factorial value of n elemental function double_factorial(n_in) result(R_result) integer, intent(in) :: n_in @@ -514,7 +557,9 @@ contains end function double_factorial - !> Calculate the factorial of an integer + !> The following function calculates the factorial value of an integer + !! @param n_in is the input integer + !! @return R is the factorial value of n elemental function factorial(n_in) result(R_result) integer, intent(in) :: n_in @@ -526,8 +571,11 @@ contains end function factorial - !> Calculate a smooth cut-on function that is zero for x values smaller than zero and goes to one, for generating smooth initial - !! conditions + !> This function calculates a smooth cut-on function that is zero for x values smaller than zero and goes to one. It can be used + !! for generating smooth initial conditions + !! @param x is the input value + !! @param eps is the smoothing parameter + !! 
@return fx is the cut-on function evaluated at x function f_cut_on(x, eps) result(fx) real(wp), intent(in) :: x, eps @@ -537,8 +585,11 @@ contains end function f_cut_on - !> Calculate a smooth cut-off function that is one for x values smaller than zero and goes to zero, for generating smooth - !! initial conditions + !> This function calculates a smooth cut-off function that is one for x values smaller than zero and goes to zero. It can be + !! used for generating smooth initial conditions + !! @param x is the input value + !! @param eps is the smoothing parameter + !! @return fx is the cut-ff function evaluated at x function f_cut_off(x, eps) result(fx) real(wp), intent(in) :: x, eps @@ -548,7 +599,9 @@ contains end function f_cut_off - !> Helper function for f_cut_on and f_cut_off + !> This function is a helper function for the functions f_cut_on and f_cut_off + !! @param x is the input value + !! @return gx is the result function f_gx(x) result(gx) real(wp), intent(in) :: x @@ -562,7 +615,7 @@ contains end function f_gx - !> Downsample conservative variable fields by a factor of 3 in each direction using volume averaging. + !> @brief Downsamples conservative variable fields by a factor of 3 in each direction using volume averaging. subroutine s_downsample_data(q_cons_vf, q_cons_temp, m_ds, n_ds, p_ds, m_glb_ds, n_glb_ds, p_glb_ds) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf, q_cons_temp @@ -604,7 +657,7 @@ contains end subroutine s_downsample_data - !> Upsample conservative variable fields from a coarsened grid back to the original resolution using interpolation. + !> @brief Upsamples conservative variable fields from a coarsened grid back to the original resolution using interpolation. 
subroutine s_upsample_data(q_cons_vf, q_cons_temp) type(scalar_field), intent(inout), dimension(sys_size) :: q_cons_vf, q_cons_temp diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp index 7208a451da..0138a5da35 100644 --- a/src/common/m_helper_basic.fpp +++ b/src/common/m_helper_basic.fpp @@ -17,8 +17,11 @@ module m_helper_basic contains - !> Check if two floating point numbers of wp are within tolerance. + !> This procedure checks if two floating point numbers of wp are within tolerance. + !! @param a First number. + !! @param b Second number. !! @param tol_input Relative error (default = 1.e-10_wp). + !! @return Result of the comparison. logical elemental function f_approx_equal(a, b, tol_input) result(res) $:GPU_ROUTINE(parallelism='[seq]') @@ -42,8 +45,11 @@ contains end function f_approx_equal - !> Check if a wp value approximately matches any element of an array within tolerance. + !> This procedure checks if the point numbers of wp belongs to another array are within tolerance. + !! @param a First number. + !! @param b Array that contains several point numbers. !! @param tol_input Relative error (default = 1e-10_wp). + !! @return Result of the comparison. logical function f_approx_in_array(a, b, tol_input) result(res) $:GPU_ROUTINE(parallelism='[seq]') @@ -71,6 +77,7 @@ contains end function f_approx_in_array !> Checks if a real(wp) variable is of default value. + !! @param var Variable to check. logical elemental function f_is_default(var) result(res) $:GPU_ROUTINE(parallelism='[seq]') @@ -81,15 +88,23 @@ contains end function f_is_default !> Checks if ALL elements of a real(wp) array are of default value. + !! @param var_array Array to check. logical function f_all_default(var_array) result(res) real(wp), intent(in) :: var_array(:) res = all(f_is_default(var_array)) + ! logical :: res_array(size(var_array)) integer :: i + + ! do i = 1, size(var_array) res_array(i) = f_is_default(var_array(i)) end do + + ! 
res = all(res_array) + end function f_all_default !> Checks if a real(wp) variable is an integer. + !! @param var Variable to check. logical elemental function f_is_integer(var) result(res) $:GPU_ROUTINE(parallelism='[seq]') @@ -99,20 +114,19 @@ contains end function f_is_integer - !> Compute ghost-cell buffer size and set interior/buffered coordinate index bounds. subroutine s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, igr_order, buff_size, idwint, idwbuff, viscous, & - - & bubbles_lagrange, m, n, p, num_dims, igr, ib) + & bubbles_lagrange, particles_lagrange, m, n, p, num_dims, igr, ib, fd_number) integer, intent(in) :: recon_type, weno_polyn, muscl_polyn - integer, intent(in) :: m, n, p, num_dims, igr_order + integer, intent(in) :: m, n, p, num_dims, igr_order, fd_number integer, intent(inout) :: buff_size type(int_bounds_info), dimension(3), intent(inout) :: idwint, idwbuff - logical, intent(in) :: viscous, bubbles_lagrange + logical, intent(in) :: viscous, bubbles_lagrange, particles_lagrange logical, intent(in) :: igr logical, intent(in) :: ib - ! Determine ghost cell buffer size for boundary conditions + ! Determining the number of cells that are needed in order to store sufficient boundary conditions data as to iterate the + ! solution in the physical computational domain from one time-step iteration to the next one if (igr) then buff_size = (igr_order - 1)/2 + 2 @@ -128,7 +142,12 @@ contains ! Correction for smearing function in the lagrangian subgrid bubble model if (bubbles_lagrange) then - buff_size = max(buff_size, 6) + buff_size = max(buff_size + fd_number, mapCells + 1 + fd_number) + end if + + ! Correction for smearing function in the lagrangian subgrid particle model + if (particles_lagrange) then + buff_size = max(buff_size + fd_number, mapCells + 1 + fd_number) end if if (ib) then @@ -150,7 +169,10 @@ contains end subroutine s_configure_coordinate_bounds !> Updates the min and max number of cells in each set of axes - !! 
@param bounds Min and max values to update + !! @param bounds Min ans max values to update + !! @param m Number of cells in x-axis + !! @param n Number of cells in y-axis + !! @param p Number of cells in z-axis elemental subroutine s_update_cell_bounds(bounds, m, n, p) type(cell_num_bounds), intent(out) :: bounds diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 8a719f5758..643646c400 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -22,9 +22,20 @@ module m_mpi_common integer, private :: v_size $:GPU_DECLARE(create='[v_size]') + !! Generic flags used to identify and report MPI errors + !> This variable is utilized to pack and send the buffer of the cell-average primitive variables, for a single computational + !! domain boundary at the time, to the relevant neighboring processor. real(wp), private, allocatable, dimension(:) :: buff_send !< Primitive variable send buffer for halo exchange + + !> buff_recv is utilized to receive and unpack the buffer of the cell- average primitive variables, for a single computational + !! domain boundary at the time, from the relevant neighboring processor. real(wp), private, allocatable, dimension(:) :: buff_recv !< Primitive variable receive buffer for halo exchange + type(int_bounds_info) :: comm_coords(3) + integer :: comm_size(3) + $:GPU_DECLARE(create='[comm_coords, comm_size]') + !! Variables for EL bubbles communication + #ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[buff_send, buff_recv]') #endif @@ -34,9 +45,12 @@ module m_mpi_common contains - !> Initialize the module. + !> The computation of parameters, the allocation of memory, the association of pointers and/or the execution of any other + !! procedures that are necessary to setup the module. impure subroutine s_initialize_mpi_common_module + integer :: beta_v_size, beta_comm_size_1, beta_comm_size_2, beta_comm_size_3, beta_halo_size + #ifdef MFC_MPI ! Allocating buff_send/recv and. 
Please note that for the sake of simplicity, both variables are provided sufficient storage ! to hold the largest buffer in the computational domain. @@ -51,13 +65,31 @@ contains if (p > 0) then halo_size = nint(-1._wp + 1._wp*buff_size*(v_size)*(m + 2*buff_size + 1)*(n + 2*buff_size + 1)*(p + 2*buff_size & & + 1)/(cells_bounds%mnp_min + 2*buff_size + 1)) - else + else ! PBC at the end halo_size = -1 + buff_size*(v_size)*(cells_bounds%mn_max + 2*buff_size + 1) end if else halo_size = -1 + buff_size*(v_size) end if + if (bubbles_lagrange .or. particles_lagrange) then + beta_v_size = size(beta_vars) + beta_comm_size_1 = m + 2*mapCells + 3 + beta_comm_size_2 = merge(n + 2*mapCells + 3, 1, n > 0) + beta_comm_size_3 = merge(p + 2*mapCells + 3, 1, p > 0) + if (n > 0) then + if (p > 0) then + beta_halo_size = 2*(mapCells + 1)*beta_v_size*max(beta_comm_size_2*beta_comm_size_3, & + & beta_comm_size_1*beta_comm_size_3, beta_comm_size_1*beta_comm_size_2) - 1 + else ! PBC at the end only + beta_halo_size = 2*(mapCells + 1)*beta_v_size*max(beta_comm_size_2, beta_comm_size_1) - 1 + end if + else ! PBC at the end + beta_halo_size = 2*(mapCells + 1)*beta_v_size - 1 + end if + halo_size = max(halo_size, beta_halo_size) + end if + $:GPU_UPDATE(device='[halo_size, v_size]') #ifndef __NVCOMPILER_GPU_UNIFIED_MEM @@ -71,30 +103,39 @@ contains end subroutine s_initialize_mpi_common_module - !> Initialize the MPI execution environment and query the number of processors and local rank. + !> The subroutine initializes the MPI execution environment and queries both the number of processors which will be available + !! for the job and the local processor rank. impure subroutine s_mpi_initialize #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Initializing the MPI environment call MPI_INIT(ierr) + ! Checking whether the MPI environment has been properly initialized if (ierr /= MPI_SUCCESS) then print '(A)', 'Unable to initialize MPI environment. 
Exiting.' call MPI_ABORT(MPI_COMM_WORLD, 1, ierr) end if + ! Querying the number of processors available for the job call MPI_COMM_SIZE(MPI_COMM_WORLD, num_procs, ierr) + ! Querying the rank of the local processor call MPI_COMM_RANK(MPI_COMM_WORLD, proc_rank, ierr) #else + ! Serial run only has 1 processor num_procs = 1 + ! Local processor rank is 0 proc_rank = 0 #endif end subroutine s_mpi_initialize - !> Set up MPI I/O data views and variable pointers for parallel file output. + !! @param q_cons_vf Conservative variables + !! @param ib_markers track if a cell is within the immersed boundary + !! @param beta Eulerian void fraction from lagrangian bubbles impure subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, beta) type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf @@ -104,8 +145,11 @@ contains integer, dimension(1) :: airfoil_glb, airfoil_loc, airfoil_start #ifdef MFC_MPI + ! Generic loop iterator integer :: i, j integer :: ierr !< Generic flag used to identify and report MPI errors + + ! Altered system size for the lagrangian subgrid bubble model integer :: alt_sys if (present(beta)) then @@ -176,7 +220,7 @@ contains end subroutine s_initialize_mpi_data - !> Set up MPI I/O data views for downsampled (coarsened) parallel file output. + !! @param q_cons_vf Conservative variables subroutine s_initialize_mpi_data_ds(q_cons_vf) type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf @@ -184,6 +228,7 @@ contains integer, dimension(3) :: sf_start_idx #ifdef MFC_MPI + ! Generic loop iterator integer :: i, j, q, k, l, m_ds, n_ds, p_ds, ierr sf_start_idx = (/0, 0, 0/) @@ -222,7 +267,7 @@ contains end subroutine s_initialize_mpi_data_ds - !> Gather variable-length real vectors from all MPI ranks onto the root process. + !> @brief Gathers variable-length real vectors from all MPI ranks onto the root process. 
impure subroutine s_mpi_gather_data(my_vector, counts, gathered_vector, root) integer, intent(in) :: counts !< Array of vector lengths for each process @@ -252,7 +297,7 @@ contains end subroutine s_mpi_gather_data - !> Gather per-rank time step wall-clock times onto rank 0 for performance reporting. + !> @brief Gathers per-rank time step wall-clock times onto rank 0 for performance reporting. impure subroutine mpi_bcast_time_step_values(proc_time, time_avg) real(wp), dimension(0:num_procs - 1), intent(inout) :: proc_time @@ -266,7 +311,7 @@ contains end subroutine mpi_bcast_time_step_values - !> Print a case file error with the prohibited condition and message, then abort execution. + !> @brief Prints a case file error with the prohibited condition and message, then aborts execution. impure subroutine s_prohibit_abort(condition, message) character(len=*), intent(in) :: condition, message @@ -286,40 +331,82 @@ contains !! performed by sifting through the local extrema of each stability criterion. Note that each of the local extrema is from a !! single process, within its assigned section of the computational domain. Finally, note that the global extrema values are !! only bookkeept on the rank 0 processor. - impure subroutine s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, vcfl_max_loc, Rc_min_loc, icfl_max_glb, vcfl_max_glb, & - - & Rc_min_glb) + !! @param icfl_max_loc Local maximum ICFL stability criterion + !! @param vcfl_max_loc Local maximum VCFL stability criterion + !! @param Rc_min_loc Local minimum Rc stability criterion + !! @param icfl_max_glb Global maximum ICFL stability criterion + !! @param vcfl_max_glb Global maximum VCFL stability criterion + !! 
@param Rc_min_glb Global minimum Rc stability criterion + impure subroutine s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, vcfl_max_loc, Rc_min_loc, bubs_loc, icfl_max_glb, & + & vcfl_max_glb, Rc_min_glb, bubs_glb) real(wp), intent(in) :: icfl_max_loc real(wp), intent(in) :: vcfl_max_loc real(wp), intent(in) :: Rc_min_loc + integer, intent(in) :: bubs_loc real(wp), intent(out) :: icfl_max_glb real(wp), intent(out) :: vcfl_max_glb real(wp), intent(out) :: Rc_min_glb + integer, intent(out) :: bubs_glb #ifdef MFC_SIMULATION #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + bubs_glb = 0 + + ! Reducing local extrema of ICFL, VCFL, CCFL and Rc numbers to their global extrema and bookkeeping the results on the rank + ! 0 processor call MPI_REDUCE(icfl_max_loc, icfl_max_glb, 1, mpi_p, MPI_MAX, 0, MPI_COMM_WORLD, ierr) if (viscous) then call MPI_REDUCE(vcfl_max_loc, vcfl_max_glb, 1, mpi_p, MPI_MAX, 0, MPI_COMM_WORLD, ierr) call MPI_REDUCE(Rc_min_loc, Rc_min_glb, 1, mpi_p, MPI_MIN, 0, MPI_COMM_WORLD, ierr) end if + + if (bubbles_lagrange) then + call MPI_REDUCE(bubs_loc, bubs_glb, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) + end if #else icfl_max_glb = icfl_max_loc + bubs_glb = 0 if (viscous) then vcfl_max_glb = vcfl_max_loc Rc_min_glb = Rc_min_loc end if + + if (bubbles_lagrange) bubs_glb = bubs_loc #endif #endif end subroutine s_mpi_reduce_stability_criteria_extrema - !> Reduce a local real value to its global sum across all MPI ranks. + !> The following subroutine takes the inputted variable and determines its sum on the entire computational domain. + ! ! @param var_loc holds the local value to be reduced among all the processors in communicator. On output, the variable holds + ! the sum, reduced amongst all of the local values. 
+ subroutine s_mpi_reduce_int_sum(var_loc, sum) + + integer, intent(in) :: var_loc + integer, intent(out) :: sum + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + ! Performing reduction procedure and eventually storing its result into the variable that was initially inputted into the + ! subroutine + call MPI_REDUCE(var_loc, sum, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) +#else + sum = var_loc +#endif + + end subroutine s_mpi_reduce_int_sum + + !> The following subroutine takes the input local variable from all processors and reduces to the sum of all values. The reduced + !! variable is recorded back onto the original local variable on each processor. + ! ! @param var_loc Some variable containing the local value which should be reduced amongst all the processors in the + ! communicator. + !! @param var_glb The globally reduced value impure subroutine s_mpi_allreduce_sum(var_loc, var_glb) real(wp), intent(in) :: var_loc @@ -328,12 +415,14 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Performing the reduction procedure call MPI_ALLREDUCE(var_loc, var_glb, 1, mpi_p, MPI_SUM, MPI_COMM_WORLD, ierr) #endif end subroutine s_mpi_allreduce_sum - !> Reduce an array of vectors to their global sums across all MPI ranks. + !> This subroutine follows the behavior of the s_mpi_allreduce_sum subroutine + !> with the additional feature that it reduces an array of vectors. impure subroutine s_mpi_allreduce_vectors_sum(var_loc, var_glb, num_vectors, vector_length) integer, intent(in) :: num_vectors, vector_length @@ -343,6 +432,7 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! 
Performing the reduction procedure if (loc(var_loc) == loc(var_glb)) then call MPI_Allreduce(MPI_IN_PLACE, var_glb, num_vectors*vector_length, mpi_p, MPI_SUM, MPI_COMM_WORLD, ierr) else @@ -354,7 +444,11 @@ contains end subroutine s_mpi_allreduce_vectors_sum - !> Reduce a local integer value to its global sum across all MPI ranks. + !> The following subroutine takes the input local variable from all processors and reduces to the sum of all values. The reduced + !! variable is recorded back onto the original local variable on each processor. + ! ! @param var_loc Some variable containing the local value which should be reduced amongst all the processors in the + ! communicator. + !! @param var_glb The globally reduced value impure subroutine s_mpi_allreduce_integer_sum(var_loc, var_glb) integer, intent(in) :: var_loc @@ -363,6 +457,7 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Performing the reduction procedure call MPI_ALLREDUCE(var_loc, var_glb, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) #else var_glb = var_loc @@ -370,7 +465,11 @@ contains end subroutine s_mpi_allreduce_integer_sum - !> Reduce a local real value to its global minimum across all MPI ranks. + !> The following subroutine takes the input local variable from all processors and reduces to the minimum of all values. The + !! reduced variable is recorded back onto the original local variable on each processor. + ! ! @param var_loc Some variable containing the local value which should be reduced amongst all the processors in the + ! communicator. + !! @param var_glb The globally reduced value impure subroutine s_mpi_allreduce_min(var_loc, var_glb) real(wp), intent(in) :: var_loc @@ -379,12 +478,17 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! 
Performing the reduction procedure call MPI_ALLREDUCE(var_loc, var_glb, 1, mpi_p, MPI_MIN, MPI_COMM_WORLD, ierr) #endif end subroutine s_mpi_allreduce_min - !> Reduce a local real value to its global maximum across all MPI ranks. + !> The following subroutine takes the input local variable from all processors and reduces to the maximum of all values. The + !! reduced variable is recorded back onto the original local variable on each processor. + ! ! @param var_loc Some variable containing the local value which should be reduced amongst all the processors in the + ! communicator. + !! @param var_glb The globally reduced value impure subroutine s_mpi_allreduce_max(var_loc, var_glb) real(wp), intent(in) :: var_loc @@ -393,20 +497,28 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Performing the reduction procedure call MPI_ALLREDUCE(var_loc, var_glb, 1, mpi_p, MPI_MAX, MPI_COMM_WORLD, ierr) #endif end subroutine s_mpi_allreduce_max - !> Reduce a local real value to its global minimum across all ranks + !> The following subroutine takes the inputted variable and determines its minimum value on the entire computational domain. The + !! result is stored back into inputted variable. + ! ! @param var_loc holds the local value to be reduced among all the processors in communicator. On output, the variable holds + ! the minimum value, reduced amongst all of the local values. impure subroutine s_mpi_reduce_min(var_loc) real(wp), intent(inout) :: var_loc #ifdef MFC_MPI - integer :: ierr !< Generic flag used to identify and report MPI errors + integer :: ierr !< Generic flag used to identify and report MPI errors + + ! Temporary storage variable that holds the reduced minimum value real(wp) :: var_glb + ! Performing reduction procedure and eventually storing its result into the variable that was initially inputted into the + ! 
subroutine call MPI_REDUCE(var_loc, var_glb, 1, mpi_p, MPI_MIN, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(var_glb, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) @@ -416,15 +528,25 @@ contains end subroutine s_mpi_reduce_min - !> Reduce a 2-element variable to its global maximum value with the owning processor rank (MPI_MAXLOC). - !> Reduce a local value to its global maximum with location (rank) across all ranks + !> The following subroutine takes the first element of the 2-element inputted variable and determines its maximum value on the + !! entire computational domain. The result is stored back into the first element of the variable while the rank of the processor + !! that is in charge of the sub- domain containing the maximum is stored into the second element of the variable. + ! ! @param var_loc On input, this variable holds the local value and processor rank, which are to be reduced among all the + ! processors in communicator. On output, this variable holds the maximum value, reduced amongst all of the local values, and the + ! process rank to which the value belongs. impure subroutine s_mpi_reduce_maxloc(var_loc) real(wp), dimension(2), intent(inout) :: var_loc #ifdef MFC_MPI - integer :: ierr !< Generic flag used to identify and report MPI errors + integer :: ierr !< Generic flag used to identify and report MPI errors + + !> Temporary storage variable that holds the reduced maximum value and the rank of the processor with which the value is + !! associated real(wp), dimension(2) :: var_glb !< Reduced (max value, rank) pair + + ! Performing reduction procedure and eventually storing its result into the variable that was initially inputted into the + ! subroutine call MPI_REDUCE(var_loc, var_glb, 1, mpi_2p, MPI_MAXLOC, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(var_glb, 1, mpi_2p, 0, MPI_COMM_WORLD, ierr) @@ -435,6 +557,8 @@ contains end subroutine s_mpi_reduce_maxloc !> The subroutine terminates the MPI execution environment. + !! 
@param prnt error message to be printed + !! @param code optional exit code impure subroutine s_mpi_abort(prnt, code) character(len=*), intent(in), optional :: prnt @@ -456,6 +580,7 @@ contains stop 1 end if #else + ! Terminating the MPI environment if (present(code)) then call MPI_ABORT(MPI_COMM_WORLD, code, ierr) else @@ -471,6 +596,7 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Calling MPI_BARRIER call MPI_BARRIER(MPI_COMM_WORLD, ierr) #endif @@ -482,6 +608,7 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Finalizing the MPI environment call MPI_FINALIZE(ierr) #endif @@ -489,6 +616,12 @@ contains !> The goal of this procedure is to populate the buffers of the cell-average conservative variables by communicating with the !! neighboring processors. + !! @param q_comm Cell-average conservative variables + !! @param mpi_dir MPI communication coordinate direction + !! @param pbc_loc Processor boundary condition (PBC) location + !! @param nVar Number of variables to communicate + !! @param pb_in Optional internal bubble pressure + !! @param mv_in Optional bubble mass velocity subroutine s_mpi_sendrecv_variables_buffers(q_comm, mpi_dir, pbc_loc, nVar, pb_in, mv_in) type(scalar_field), dimension(1:), intent(inout) :: q_comm @@ -695,7 +828,7 @@ contains #:endif end if #:endfor - call nvtxEndRange ! Packbuf + call nvtxEndRange ! Send/Recv #ifdef MFC_SIMULATION @@ -720,7 +853,7 @@ contains call MPI_SENDRECV(buff_send, buffer_count, mpi_p, dst_proc, send_tag, buff_recv, buffer_count, mpi_p, & & src_proc, recv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA + call nvtxEndRange call nvtxStartRange("RHS-COMM-HOST2DEV") $:GPU_UPDATE(device='[buff_recv]') @@ -842,6 +975,7 @@ contains $:END_GPU_PARALLEL_LOOP() end if #:else + ! 
Unpacking buffer from bc_z%beg $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do i = 1, nVar do l = -buff_size, -1 @@ -903,19 +1037,265 @@ contains end subroutine s_mpi_sendrecv_variables_buffers - !> Decompose the computational domain among processors by balancing cells per rank in each coordinate direction. + !> The goal of this procedure is to populate the buffers of the cell-average conservative variables by communicating with the + !! neighboring processors. + !! @param q_cons_vf Cell-average conservative variables + !! @param mpi_dir MPI communication coordinate direction + !! @param pbc_loc Processor boundary condition (PBC) location + subroutine s_mpi_reduce_beta_variables_buffers(q_comm, mpi_dir, pbc_loc, nVar) + + type(scalar_field), dimension(1:), intent(inout) :: q_comm + integer, intent(in) :: mpi_dir, pbc_loc, nVar + integer :: i, j, k, l, r, q !< Generic loop iterators + integer :: lb_size + integer :: buffer_counts(1:3), buffer_count + type(int_bounds_info) :: boundary_conditions(1:3) + integer :: beg_end(1:2), grid_dims(1:3) + integer :: dst_proc, src_proc, recv_tag, send_tag + logical :: beg_end_geq_0, qbmm_comm, replace_buff + integer :: pack_offset, unpack_offset + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + call nvtxStartRange("BETA-COMM-PACKBUF") + + ! Set bounds for each dimension Always include the full buffer range for each existing dimension. The Gaussian smearing + ! kernel writes to buffer cells even at physical boundaries, and these contributions must be communicated to neighbors in + ! other directions via ADD operations. + comm_coords(1)%beg = -mapcells - 1 + comm_coords(1)%end = m + mapcells + 1 + comm_coords(2)%beg = merge(-mapcells - 1, 0, n > 0) + comm_coords(2)%end = merge(n + mapcells + 1, n, n > 0) + comm_coords(3)%beg = merge(-mapcells - 1, 0, p > 0) + comm_coords(3)%end = merge(p + mapcells + 1, p, p > 0) + + ! 
Compute sizes + comm_size(1) = comm_coords(1)%end - comm_coords(1)%beg + 1 + comm_size(2) = comm_coords(2)%end - comm_coords(2)%beg + 1 + comm_size(3) = comm_coords(3)%end - comm_coords(3)%beg + 1 + + ! Buffer counts using the conditional sizes + v_size = nVar + lb_size = 2*(mapcells + 1) ! Size of the buffer region for beta variables (-mapcells - 1, mapcells) + buffer_counts = (/lb_size*v_size*comm_size(2)*comm_size(3), lb_size*v_size*comm_size(1)*comm_size(3), & + & lb_size*v_size*comm_size(1)*comm_size(2)/) + + $:GPU_UPDATE(device='[v_size, comm_coords, comm_size]') + + buffer_count = buffer_counts(mpi_dir) + boundary_conditions = (/bc_x, bc_y, bc_z/) + beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/) + beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0 + + ! Implements: pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg] -1 (=0) 1 + ! -> [0,0] [1,0] | 0 1 [0,0] [end,beg] +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end] +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] + ! [beg,end] + + send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1)) + recv_tag = f_logical_to_int(pbc_loc == 1) + + dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0))) + src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1)) + + grid_dims = (/m, n, p/) + + pack_offset = 0 + if (f_xor(pbc_loc == 1, beg_end_geq_0)) then + pack_offset = grid_dims(mpi_dir) + 1 + end if + + unpack_offset = 0 + if (pbc_loc == 1) then + unpack_offset = grid_dims(mpi_dir) + 1 + end if + + replace_buff = .false. + if (pbc_loc == 1 .and. beg_end_geq_0) replace_buff = .true. + + ! 
Pack Buffer to Send + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = -mapcells - 1, mapcells + do i = 1, v_size + r = (i - 1) + v_size*((j + mapcells + 1) + lb_size*((k - comm_coords(2)%beg) + comm_size(2) & + & *(l - comm_coords(3)%beg))) + buff_send(r) = real(q_comm(beta_vars(i))%sf(j + pack_offset, k, l), kind=wp) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, v_size + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = -mapcells - 1, mapcells + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*((j - comm_coords(1)%beg) + comm_size(1)*((k + mapcells + 1) & + & + lb_size*(l - comm_coords(3)%beg))) + buff_send(r) = real(q_comm(beta_vars(i))%sf(j, k + pack_offset, l), kind=wp) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, v_size + do l = -mapcells - 1, mapcells + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*((j - comm_coords(1)%beg) + comm_size(1)*((k - comm_coords(2)%beg) & + & + comm_size(2)*(l + mapcells + 1))) + buff_send(r) = real(q_comm(beta_vars(i))%sf(j, k, l + pack_offset), kind=wp) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange + + ! Send/Recv +#ifdef MFC_SIMULATION + #:for rdma_mpi in [False, True] + if (rdma_mpi .eqv. ${'.true.' 
if rdma_mpi else '.false.'}$) then + #:if rdma_mpi + #:call GPU_HOST_DATA(use_device_addr='[buff_send, buff_recv]') + call nvtxStartRange("BETA-COMM-SENDRECV-RDMA") + + call MPI_SENDRECV(buff_send, buffer_count, mpi_p, dst_proc, send_tag, buff_recv, buffer_count, mpi_p, & + & src_proc, recv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + + call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA + #:endcall GPU_HOST_DATA + $:GPU_WAIT() + #:else + call nvtxStartRange("BETA-COMM-DEV2HOST") + $:GPU_UPDATE(host='[buff_send]') + call nvtxEndRange + call nvtxStartRange("BETA-COMM-SENDRECV-NO-RDMA") + + call MPI_SENDRECV(buff_send, buffer_count, mpi_p, dst_proc, send_tag, buff_recv, buffer_count, mpi_p, & + & src_proc, recv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + + call nvtxEndRange + + call nvtxStartRange("BETA-COMM-HOST2DEV") + $:GPU_UPDATE(device='[buff_recv]') + call nvtxEndRange + #:endif + end if + #:endfor +#else + call MPI_SENDRECV(buff_send, buffer_count, mpi_p, dst_proc, send_tag, buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) +#endif + + ! 
Unpack Received Buffer + call nvtxStartRange("BETA-COMM-UNPACKBUF") + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]',copyin='[replace_buff]') + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = -mapcells - 1, mapcells + do i = 1, v_size + r = (i - 1) + v_size*((j + mapcells + 1) + lb_size*((k - comm_coords(2)%beg) + comm_size(2) & + & *(l - comm_coords(3)%beg))) + if (replace_buff) then + q_comm(beta_vars(i))%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) + else + q_comm(beta_vars(i))%sf(j + unpack_offset, k, & + & l) = q_comm(beta_vars(i))%sf(j + unpack_offset, k, l) + real(buff_recv(r), & + & kind=stp) + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]',copyin='[replace_buff]') + do i = 1, v_size + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = -mapcells - 1, mapcells + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*((j - comm_coords(1)%beg) + comm_size(1)*((k + mapcells + 1) & + & + lb_size*(l - comm_coords(3)%beg))) + if (replace_buff) then + q_comm(beta_vars(i))%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) + else + q_comm(beta_vars(i))%sf(j, k + unpack_offset, l) = q_comm(beta_vars(i))%sf(j, & + & k + unpack_offset, l) + real(buff_recv(r), kind=stp) + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]',copyin='[replace_buff]') + do i = 1, v_size + do l = -mapcells - 1, mapcells + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*((j - comm_coords(1)%beg) + comm_size(1)*((k - comm_coords(2)%beg) & + & + comm_size(2)*(l + mapcells + 1))) + if (replace_buff) then + q_comm(beta_vars(i))%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) + 
else + q_comm(beta_vars(i))%sf(j, k, l + unpack_offset) = q_comm(beta_vars(i))%sf(j, k, & + & l + unpack_offset) + real(buff_recv(r), kind=stp) + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange +#endif + + end subroutine s_mpi_reduce_beta_variables_buffers + + !> The purpose of this procedure is to optimally decompose the computational domain among the available processors. This is + !! performed by attempting to award each processor, in each of the coordinate directions, approximately the same number of + !! cells, and then recomputing the affected global parameters. subroutine s_mpi_decompose_computational_domain #ifdef MFC_MPI integer :: num_procs_x, num_procs_y, num_procs_z !< Optimal number of processors in the x-, y- and z-directions + !> Non-optimal number of processors in the x-, y- and z-directions real(wp) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z real(wp) :: fct_min !< Processor factorization (fct) minimization parameter integer :: MPI_COMM_CART !< Cartesian processor topology communicator - integer :: rem_cells !< Remaining cells after distribution among processors - integer :: recon_order !< WENO or MUSCL reconstruction order - integer :: i, j !< Generic loop iterators - integer :: ierr !< Generic flag used to identify and report MPI errors + + !> Remaining number of cells, in a particular coordinate direction, after the majority is divided up among the available + !! processors + integer :: rem_cells + integer :: recon_order !< WENO or MUSCL reconstruction order + integer :: i, j, k !< Generic loop iterators + integer :: ierr !< Generic flag used to identify and report MPI errors + + ! temp array to store neighbor rank coordinates + integer, dimension(1:num_dims) :: neighbor_coords + + ! 
Zeroing out communication needs for moving EL bubbles/particles + nidx(1)%beg = 0; nidx(1)%end = 0 + nidx(2)%beg = 0; nidx(2)%end = 0 + nidx(3)%beg = 0; nidx(3)%end = 0 if (recon_type == WENO_TYPE) then recon_order = weno_order @@ -964,9 +1344,11 @@ contains end if end if end do - else + else ! PBC at the end only if (cyl_coord .and. p > 0) then - ! Pencil blocking for cylindrical coordinates (Fourier filter near axis) + ! Implement pencil processor blocking if using cylindrical coordinates so that all cells in azimuthal + ! direction are stored on a single processor. This is necessary for efficient application of Fourier filter + ! near axis. ! Initial values of the processor factorization optimization num_procs_x = 1 @@ -1048,6 +1430,7 @@ contains ! Finding the Cartesian coordinates of the local process call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, proc_coords, ierr) + ! END: 3D Cartesian Processor Topology ! Global Parameters for z-direction @@ -1069,6 +1452,7 @@ contains proc_coords(3) = proc_coords(3) - 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%beg, ierr) proc_coords(3) = proc_coords(3) + 1 + nidx(3)%beg = -1 end if ! Boundary condition at the end @@ -1076,20 +1460,21 @@ contains proc_coords(3) = proc_coords(3) + 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_z%end, ierr) proc_coords(3) = proc_coords(3) - 1 + nidx(3)%end = 1 end if #ifdef MFC_POST_PROCESS ! Ghost zone at the beginning if (proc_coords(3) > 0 .and. format == 1) then offset_z%beg = 2 - else + else ! PBC at the end only offset_z%beg = 0 end if ! Ghost zone at the end if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then offset_z%end = 2 - else + else ! PBC at the end only offset_z%end = 0 end if #endif @@ -1101,7 +1486,7 @@ contains else start_idx(3) = (p + 1)*proc_coords(3) + rem_cells end if - else + else ! PBC at the end only #ifdef MFC_PRE_PROCESS if (old_grid .neqv. .true.) 
then dz = (z_domain%end - z_domain%beg)/real(p_glb + 1, wp) @@ -1119,7 +1504,7 @@ contains end if ! 2D Cartesian Processor Topology - else + else ! PBC at the end ! Initial estimate of optimal processor topology num_procs_x = 1 num_procs_y = num_procs @@ -1160,6 +1545,7 @@ contains ! Finding the Cartesian coordinates of the local process call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, proc_coords, ierr) end if + ! END: 2D Cartesian Processor Topology ! Global Parameters for y-direction @@ -1181,6 +1567,7 @@ contains proc_coords(2) = proc_coords(2) - 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%beg, ierr) proc_coords(2) = proc_coords(2) + 1 + nidx(2)%beg = -1 end if ! Boundary condition at the end @@ -1188,20 +1575,21 @@ contains proc_coords(2) = proc_coords(2) + 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_y%end, ierr) proc_coords(2) = proc_coords(2) - 1 + nidx(2)%end = 1 end if #ifdef MFC_POST_PROCESS ! Ghost zone at the beginning if (proc_coords(2) > 0 .and. format == 1) then offset_y%beg = 2 - else + else ! PBC at the end offset_y%beg = 0 end if ! Ghost zone at the end if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then offset_y%end = 2 - else + else ! PBC at the end offset_y%end = 0 end if #endif @@ -1210,10 +1598,10 @@ contains if (parallel_io) then if (proc_coords(2) < rem_cells) then start_idx(2) = (n + 1)*proc_coords(2) - else + else ! PBC at the end only start_idx(2) = (n + 1)*proc_coords(2) + rem_cells end if - else + else ! PBC at the end #ifdef MFC_PRE_PROCESS if (old_grid .neqv. .true.) then dy = (y_domain%end - y_domain%beg)/real(n_glb + 1, wp) @@ -1264,6 +1652,7 @@ contains proc_coords(1) = proc_coords(1) - 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr) proc_coords(1) = proc_coords(1) + 1 + nidx(1)%beg = -1 end if ! 
Boundary condition at the end @@ -1271,6 +1660,7 @@ contains proc_coords(1) = proc_coords(1) + 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr) proc_coords(1) = proc_coords(1) - 1 + nidx(1)%end = 1 end if #ifdef MFC_POST_PROCESS @@ -1293,7 +1683,7 @@ contains if (parallel_io) then if (proc_coords(1) < rem_cells) then start_idx(1) = (m + 1)*proc_coords(1) - else + else ! PBC at the end start_idx(1) = (m + 1)*proc_coords(1) + rem_cells end if else @@ -1304,13 +1694,27 @@ contains if (proc_coords(1) < rem_cells) then x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1)) x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1) - (num_procs_x - rem_cells)) - else + else ! PBC at the end only x_domain%beg = x_domain%beg + dx*real((m + 1)*proc_coords(1) + rem_cells) x_domain%end = x_domain%end - dx*real((m + 1)*(num_procs_x - proc_coords(1) - 1)) end if end if #endif end if + + @:ALLOCATE(neighbor_ranks(nidx(1)%beg:nidx(1)%end, nidx(2)%beg:nidx(2)%end, nidx(3)%beg:nidx(3)%end)) + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + if (abs(i) + abs(j) + abs(k) > 0) then + neighbor_coords(1) = proc_coords(1) + i + if (num_dims > 1) neighbor_coords(2) = proc_coords(2) + j + if (num_dims > 2) neighbor_coords(3) = proc_coords(3) + k + call MPI_CART_RANK(MPI_COMM_CART, neighbor_coords, neighbor_ranks(i, j, k), ierr) + end if + end do + end do + end do #endif end subroutine s_mpi_decompose_computational_domain @@ -1318,6 +1722,8 @@ contains !> The goal of this procedure is to populate the buffers of the grid variables by communicating with the neighboring processors. !! Note that only the buffers of the cell-width distributions are handled in such a way. This is because the buffers of !! cell-boundary locations may be calculated directly from those of the cell-width distributions. + !! @param mpi_dir MPI communication coordinate direction + !! 
@param pbc_loc Processor boundary condition (PBC) location #ifndef MFC_PRE_PROCESS subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc) @@ -1327,64 +1733,90 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! MPI Communication in x-direction if (mpi_dir == 1) then if (pbc_loc == -1) then ! PBC at the beginning if (bc_x%end >= 0) then ! PBC at the beginning and end + + ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(-buff_size), buff_size, mpi_p, & & bc_x%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the beginning only + else ! PBC at the end only + ! Send/receive buffer to/from bc_x%beg/bc_x%beg call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(-buff_size), buff_size, mpi_p, bc_x%beg, 0, & & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if else ! PBC at the end if (bc_x%beg >= 0) then ! PBC at the end and beginning + + ! Send/receive buffer to/from bc_x%beg/bc_x%end call MPI_SENDRECV(dx(0), buff_size, mpi_p, bc_x%beg, 1, dx(m + 1), buff_size, mpi_p, bc_x%end, 1, & & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) else ! PBC at the end only + ! Send/receive buffer to/from bc_x%end/bc_x%end call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, mpi_p, bc_x%end, 0, dx(m + 1), buff_size, mpi_p, & & bc_x%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if end if + ! END: MPI Communication in x-direction + + ! MPI Communication in y-direction else if (mpi_dir == 2) then if (pbc_loc == -1) then ! PBC at the beginning if (bc_y%end >= 0) then ! PBC at the beginning and end + + ! Send/receive buffer to/from bc_y%end/bc_y%beg call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(-buff_size), buff_size, mpi_p, & & bc_y%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the beginning only + else ! PBC at the end only + ! 
Send/receive buffer to/from bc_y%beg/bc_y%beg call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(-buff_size), buff_size, mpi_p, bc_y%beg, 0, & & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if else ! PBC at the end if (bc_y%beg >= 0) then ! PBC at the end and beginning + + ! Send/receive buffer to/from bc_y%beg/bc_y%end call MPI_SENDRECV(dy(0), buff_size, mpi_p, bc_y%beg, 1, dy(n + 1), buff_size, mpi_p, bc_y%end, 1, & & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) else ! PBC at the end only + ! Send/receive buffer to/from bc_y%end/bc_y%end call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, mpi_p, bc_y%end, 0, dy(n + 1), buff_size, mpi_p, & & bc_y%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if end if + ! END: MPI Communication in y-direction + + ! MPI Communication in z-direction else if (pbc_loc == -1) then ! PBC at the beginning if (bc_z%end >= 0) then ! PBC at the beginning and end + + ! Send/receive buffer to/from bc_z%end/bc_z%beg call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(-buff_size), buff_size, mpi_p, & & bc_z%beg, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - else ! PBC at the beginning only + else ! PBC at the end only + ! Send/receive buffer to/from bc_z%beg/bc_z%beg call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(-buff_size), buff_size, mpi_p, bc_z%beg, 0, & & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if else ! PBC at the end if (bc_z%beg >= 0) then ! PBC at the end and beginning + + ! Send/receive buffer to/from bc_z%beg/bc_z%end call MPI_SENDRECV(dz(0), buff_size, mpi_p, bc_z%beg, 1, dz(p + 1), buff_size, mpi_p, bc_z%end, 1, & & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) else ! PBC at the end only + ! Send/receive buffer to/from bc_z%end/bc_z%end call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, mpi_p, bc_z%end, 0, dz(p + 1), buff_size, mpi_p, & & bc_z%end, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) end if end if end if + ! 
END: MPI Communication in z-direction #endif end subroutine s_mpi_sendrecv_grid_variables_buffers diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp index 0530c31081..b25fb651d5 100644 --- a/src/post_process/m_global_parameters.fpp +++ b/src/post_process/m_global_parameters.fpp @@ -26,6 +26,7 @@ module m_global_parameters ! Computational Domain Parameters integer :: proc_rank !< Rank of the local processor + !> @name Number of cells in the x-, y- and z-coordinate directions !> @{ integer :: m, m_root @@ -50,6 +51,7 @@ module m_global_parameters integer :: num_dims !< Number of spatial dimensions integer :: num_vels !< Number of velocity components (different from num_dims for mhd) + !> @name Cell-boundary locations in the x-, y- and z-coordinate directions !> @{ real(wp), allocatable, dimension(:) :: x_cb, x_root_cb, y_cb, z_cb @@ -66,10 +68,14 @@ module m_global_parameters real(wp), allocatable, dimension(:) :: dx, dy, dz !> @} - integer :: buff_size !< Number of ghost cells for boundary condition storage - integer :: t_step_start !< First time-step directory - integer :: t_step_stop !< Last time-step directory - integer :: t_step_save !< Interval between consecutive time-step directory + !> Number of cells in buffer region. For the variables which feature a buffer region, this region is used to store information + !! outside the computational domain based on the boundary conditions. + integer :: buff_size + integer, allocatable :: beta_vars(:) !< Indices of variables to communicate for bubble/particle coupling + integer :: t_step_start !< First time-step directory + integer :: t_step_stop !< Last time-step directory + integer :: t_step_save !< Interval between consecutive time-step directory + !> @name IO options for adaptive time-stepping !> @{ logical :: cfl_adap_dt, cfl_const_dt, cfl_dt @@ -80,7 +86,8 @@ module m_global_parameters integer :: n_start !> @} - ! 
NOTE: m_root, x_root_cb, x_root_cc = defragmented grid (1D only; equals m, x_cb, x_cc in serial) + ! NOTE: The variables m_root, x_root_cb and x_root_cc contain the grid data of the defragmented computational domain. They are + ! only used in 1D. For serial simulations, they are equal to m, x_cb and x_cc, respectively. !> @name Simulation Algorithm Parameters !> @{ @@ -110,6 +117,7 @@ module m_global_parameters !> @} integer :: avg_state !< Average state evaluation method + !> @name Annotations of the structure, i.e. the organization, of the state vectors !> @{ type(int_bounds_info) :: cont_idx !< Indexes of first & last continuity eqns. @@ -135,7 +143,8 @@ module m_global_parameters ! Cell Indices for the (local) interior points (O-m, O-n, 0-p). Stands for "InDices With BUFFer". type(int_bounds_info) :: idwint(1:3) - ! Cell indices (InDices With BUFFer): includes buffer in simulation only + ! Cell Indices for the entire (local) domain. In simulation, this includes the buffer region. idwbuff and idwint are the same + ! otherwise. Stands for "InDices With BUFFer". type(int_bounds_info) :: idwbuff(1:3) integer :: num_bc_patches logical :: bc_io @@ -144,16 +153,23 @@ module m_global_parameters type(int_bounds_info) :: bc_x, bc_y, bc_z !> @} - integer :: shear_num !< Number of shear stress components - integer, dimension(3) :: shear_indices !< Indices of the stress components that represent shear stress - integer :: shear_BC_flip_num !< Number of shear stress components to reflect for boundary conditions - integer, dimension(3, 2) :: shear_BC_flip_indices !< Shear stress BC reflection indices (1:3, 1:shear_BC_flip_num) - logical :: parallel_io !< Format of the data files - logical :: sim_data - logical :: file_per_process !< output format - integer, allocatable, dimension(:) :: proc_coords !< Processor coordinates in MPI_CART_COMM + integer :: shear_num !! 
Number of shear stress components + integer, dimension(3) :: shear_indices !< Indices of the stress components that represent shear stress + integer :: shear_BC_flip_num !< Number of shear stress components to reflect for boundary conditions + !> Indices of shear stress components to reflect for boundary conditions. Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, + !! [indices]) + integer, dimension(3, 2) :: shear_BC_flip_indices + logical :: parallel_io !< Format of the data files + logical :: sim_data + logical :: file_per_process !< output format + integer, allocatable, dimension(:) :: proc_coords !< Processor coordinates in MPI_CART_COMM + type(int_bounds_info), dimension(3) :: nidx + integer, allocatable, dimension(:,:,:) :: neighbor_ranks + !! Neighbor processor ranks + integer, allocatable, dimension(:) :: start_idx !< Starting cell-center index of local processor in global grid - integer :: num_ibs !< Number of immersed boundaries + integer :: num_ibs !< Number of immersed boundaries + #ifdef MFC_MPI type(mpi_io_var), public :: MPI_IO_DATA type(mpi_io_ib_var), public :: MPI_IO_IB_DATA @@ -168,10 +184,17 @@ module m_global_parameters integer :: mpi_info_int !> @} + !> Database of the physical parameters of each of the fluids that is present in the flow. These include the stiffened gas + !! equation of state parameters, and the Reynolds numbers. type(physical_parameters), dimension(num_fluids_max) :: fluid_pp !< Stiffened gas EOS parameters and Reynolds numbers per fluid + ! Subgrid Bubble Parameters type(subgrid_bubble_physical_parameters) :: bub_pp - real(wp), allocatable, dimension(:) :: adv !< Advection variables + + ! Subgrid Particle Parameters + type(subgrid_particle_physical_parameters) :: particle_pp + real(wp), allocatable, dimension(:) :: adv !< Advection variables + ! 
Formatted Database File(s) Structure Parameters integer :: format !< Format of the database file(s) @@ -180,6 +203,7 @@ module m_global_parameters logical :: output_partial_domain !< Specify portion of domain to output for post-processing type(bounds_info) :: x_output, y_output, z_output !< Portion of domain to output for post-processing type(int_bounds_info) :: x_output_idx, y_output_idx, z_output_idx !< Indices of domain to output for post-processing + !> @name Size of the ghost zone layer in the x-, y- and z-coordinate directions. The definition of the ghost zone layers is only !! necessary when using the Silo database file format in multidimensions. These zones provide VisIt with the subdomain !! connectivity information that it requires in order to produce smooth plots. @@ -201,6 +225,8 @@ module m_global_parameters logical :: E_wrt logical, dimension(num_fluids_max) :: alpha_rho_e_wrt logical :: fft_wrt + !> AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional + !! is false logical :: dummy !< AMDFlang workaround for case-optimization + GPU-kernel bug logical :: pres_wrt logical, dimension(num_fluids_max) :: alpha_wrt @@ -240,9 +266,19 @@ module m_global_parameters logical :: lag_betaC_wrt !> @} + !> Amplitude coefficients of the numerical Schlieren function that are used to adjust the intensity of numerical Schlieren + !! renderings for individual fluids. This enables waves and interfaces of varying strengths and in all of the fluids to be made + !! simultaneously visible on a single plot. 
real(wp), dimension(num_fluids_max) :: schlieren_alpha !< Per-fluid Schlieren intensity amplitude coefficients - integer :: fd_order !< Finite-difference order for vorticity and Schlieren derivatives - integer :: fd_number !< Finite-difference half-stencil size: MAX(1, fd_order/2) + + !> The order of the finite-difference (fd) approximations of the first-order derivatives that need to be evaluated when + !! vorticity and/or the numerical Schlieren function are to be outputted to the formatted database file(s). + integer :: fd_order + + !> The finite-difference number is given by MAX(1, fd_order/2). Essentially, it is a measure of the half-size of the + !! finite-difference stencil for the selected order of accuracy. + integer :: fd_number + !> @name Reference parameters for Tait EOS !> @{ real(wp) :: rhoref, pref @@ -271,6 +307,11 @@ module m_global_parameters integer :: nmom !> @} + !> @name Particle modeling variables and parameters + !> @{ + real(wp) :: cp_particle, rho0ref_particle + !> @} + !> @name surface tension coefficient !> @{ real(wp) :: sigma @@ -292,6 +333,7 @@ module m_global_parameters !> @name Lagrangian bubbles !> @{ logical :: bubbles_lagrange + logical :: particles_lagrange !> @} real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) @@ -303,7 +345,8 @@ contains !! parameters once they are read from the input file. impure subroutine s_assign_default_values_to_user_inputs - integer :: i !< Generic loop iterator + integer :: i + ! Logistics case_dir = '.' @@ -398,6 +441,10 @@ contains bub_pp%R_v = dflt_real; R_v = dflt_real bub_pp%R_g = dflt_real; R_g = dflt_real + ! Subgrid particle parameters + particle_pp%rho0ref_particle = dflt_real + particle_pp%cp_particle = dflt_real + ! Formatted database file(s) structure parameters format = dflt_int @@ -477,6 +524,7 @@ contains ! Lagrangian bubbles modeling bubbles_lagrange = .false. + particles_lagrange = .false. ! IBM num_ibs = dflt_int @@ -511,11 +559,11 @@ contains ! 
Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the gamma/pi_inf model - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = cont_idx%beg - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation adv_idx%beg = E_idx + 1 adv_idx%end = adv_idx%beg + 1 gamma_idx = adv_idx%beg @@ -526,17 +574,17 @@ contains else if (model_eqns == 2) then ! Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the volume fraction model - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = num_fluids - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation if (igr) then ! Volume fractions are stored in the indices immediately following the energy equation. IGR tracks a total of (N-1) ! volume fractions for N fluids, hence the "-1" in adv_idx%end. If num_fluids = 1 then adv_idx%end < adv_idx%beg, ! which skips all loops over the volume fractions since there is no volume fraction to track - adv_idx%beg = E_idx + 1 ! Alpha for fluid 1 + adv_idx%beg = E_idx + 1 adv_idx%end = E_idx + num_fluids - 1 else ! Volume fractions are stored in the indices immediately following the energy equation. WENO/MUSCL + Riemann tracks @@ -606,10 +654,9 @@ contains end if end if - if (bubbles_lagrange) then - beta_idx = sys_size + 1 - sys_size = beta_idx - end if + ! if (bubbles_lagrange) then beta_idx = sys_size + 1 sys_size = beta_idx end if + + ! 
if (particles_lagrange) then beta_idx = sys_size + 1 sys_size = beta_idx end if if (mhd) then B_idx%beg = sys_size + 1 @@ -625,11 +672,11 @@ contains else if (model_eqns == 3) then ! Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the volume fraction model - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = num_fluids - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation adv_idx%beg = E_idx + 1 adv_idx%end = E_idx + num_fluids internalEnergies_idx%beg = adv_idx%end + 1 @@ -733,6 +780,16 @@ contains sys_size = c_idx end if + if (bubbles_lagrange) then + beta_idx = sys_size + 1 + sys_size = beta_idx + end if + + if (particles_lagrange) then + beta_idx = sys_size + 1 + sys_size = beta_idx + end if + if (cont_damage) then damage_idx = sys_size + 1 sys_size = damage_idx @@ -748,6 +805,14 @@ contains end if end if + if (bubbles_lagrange) then + allocate (beta_vars(1:3)) + beta_vars(1:3) = [1, 2, 5] + else if (particles_lagrange) then + allocate (beta_vars(1:8)) + beta_vars(1:8) = [1, 2, 3, 4, 5, 6, 7, 8] + end if + if (chemistry) then species_idx%beg = sys_size + 1 species_idx%end = sys_size + num_species @@ -889,7 +954,7 @@ contains grid_geometry = 1 else if (cyl_coord .and. p == 0) then ! Axisymmetric cylindrical grid grid_geometry = 2 - else ! 
Fully 3D cylindrical grid + else grid_geometry = 3 end if @@ -968,6 +1033,9 @@ contains if (ib) MPI_IO_IB_DATA%var%sf => null() #endif + if (allocated(neighbor_ranks)) deallocate (neighbor_ranks) + if (allocated(beta_vars)) deallocate (beta_vars) + end subroutine s_finalize_global_parameters_module end module m_global_parameters diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp index c91f140d39..a4d37f464c 100644 --- a/src/post_process/m_mpi_proxy.fpp +++ b/src/post_process/m_mpi_proxy.fpp @@ -31,10 +31,10 @@ contains #ifdef MFC_MPI integer :: i !< Generic loop iterator integer :: ierr !< Generic flag used to identify and report MPI errors + ! Allocating and configuring the receive counts and the displacement vector variables used in variable-gather communication ! procedures. Note that these are only needed for either multidimensional runs that utilize the Silo database file format or ! for 1D simulations. - if ((format == 1 .and. n > 0) .or. n == 0) then allocate (recvcounts(0:num_procs - 1)) allocate (displs(0:num_procs - 1)) @@ -64,8 +64,8 @@ contains #ifdef MFC_MPI integer :: i !< Generic loop iterator integer :: ierr !< Generic flag used to identify and report MPI errors - ! Logistics + ! 
Logistics call MPI_BCAST(case_dir, len(case_dir), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) #:for VAR in [ 'm', 'n', 'p', 'm_glb', 'n_glb', 'p_glb', & @@ -87,11 +87,22 @@ contains & 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', & & 'surface_tension', 'hyperelasticity', 'bubbles_lagrange', & & 'output_partial_domain', 'relativity', 'cont_damage', 'bc_io', & - & 'down_sample','fft_wrt', 'hyper_cleaning', 'ib_state_wrt'] + & 'down_sample','fft_wrt', 'hyper_cleaning', 'ib_state_wrt', & + & 'particles_lagrange' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor if (bubbles_lagrange) then + #:for VAR in ['lag_header', 'lag_txt_wrt', 'lag_db_wrt', 'lag_id_wrt', & + & 'lag_pos_wrt', 'lag_pos_prev_wrt', 'lag_vel_wrt', 'lag_rad_wrt', & + & 'lag_rvel_wrt', 'lag_r0_wrt', 'lag_rmax_wrt', 'lag_rmin_wrt', & + & 'lag_dphidt_wrt', 'lag_pres_wrt', 'lag_mv_wrt', 'lag_mg_wrt', & + & 'lag_betaT_wrt', 'lag_betaC_wrt'] + call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + + if (particles_lagrange) then #:for VAR in ['lag_header', 'lag_txt_wrt', 'lag_db_wrt', 'lag_id_wrt', & & 'lag_pos_wrt', 'lag_pos_prev_wrt', 'lag_vel_wrt', 'lag_rad_wrt', & & 'lag_rvel_wrt', 'lag_r0_wrt', 'lag_rmax_wrt', 'lag_rmin_wrt', & @@ -127,6 +138,13 @@ contains #:endfor end if + ! Subgrid particle parameters + if (particles_lagrange) then + #:for VAR in ['rho0ref_particle','cp_particle'] + call MPI_BCAST(particle_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + #:for VAR in [ 'pref', 'rhoref', 'R0ref', 'poly_sigma', 'Web', 'Ca', & & 'Re_inv', 'Bx0', 'sigma', 't_save', 't_stop', & & 'x_output%beg', 'x_output%end', 'y_output%beg', & @@ -138,7 +156,10 @@ contains end subroutine s_mpi_bcast_user_inputs - !> Gather spatial extents from all ranks for Silo database metadata + !> This subroutine gathers the Silo database metadata for the spatial extents in order to boost the performance of the + !! multidimensional visualization. + ! 
! @param spatial_extents Spatial extents for each processor's sub-domain. First dimension corresponds to the minimum and + ! maximum values, respectively, while the second dimension corresponds to the processor rank. impure subroutine s_mpi_gather_spatial_extents(spatial_extents) real(wp), dimension(1:,0:), intent(inout) :: spatial_extents @@ -148,7 +169,6 @@ contains real(wp) :: ext_temp(0:num_procs - 1) ! Simulation is 3D - if (p > 0) then if (grid_geometry == 3) then ! Minimum spatial extent in the r-direction @@ -229,14 +249,14 @@ contains end subroutine s_mpi_gather_spatial_extents - !> Collect the sub-domain cell-boundary or cell-center location data from all processors and put back together the grid of the - !! entire computational domain on the rank 0 processor. This is only done for 1D simulations. + !> This subroutine collects the sub-domain cell-boundary or cell-center locations data from all of the processors and puts back + !! together the grid of the entire computational domain on the rank 0 processor. This is only done for 1D simulations. impure subroutine s_mpi_defragment_1d_grid_variable #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors - ! Silo-HDF5 database format + ! Silo-HDF5 database format if (format == 1) then call MPI_GATHERV(x_cc(0), m + 1, mpi_p, x_root_cc(0), recvcounts, displs, mpi_p, 0, MPI_COMM_WORLD, ierr) @@ -250,8 +270,11 @@ contains end subroutine s_mpi_defragment_1d_grid_variable - !> Gather the Silo database metadata for the flow variable's extents to boost performance of the multidimensional visualization. - !! @param q_sf Flow variable on a single computational sub-domain + !> This subroutine gathers the Silo database metadata for the flow variable's extents as to boost performance of the + !! multidimensional visualization. + !! @param q_sf Flow variable defined on a single computational sub-domain + ! ! @param data_extents The flow variable extents on each of the processor's sub-domain. 
First dimension of array corresponds to + ! the former's minimum and maximum values, respectively, while second dimension corresponds to each processor's rank. impure subroutine s_mpi_gather_data_extents(q_sf, data_extents) real(wp), dimension(:,:,:), intent(in) :: q_sf @@ -283,10 +306,10 @@ contains end subroutine s_mpi_gather_data_extents - !> Gather the sub-domain flow variable data from all processors and reassemble it for the entire computational domain on the - !! rank 0 processor. This is only done for 1D simulations. - !! @param q_sf Flow variable on a single computational sub-domain - !! @param q_root_sf Flow variable on the entire computational domain + !> This subroutine gathers the sub-domain flow variable data from all of the processors and puts it back together for the entire + !! computational domain on the rank 0 processor. This is only done for 1D simulations. + !! @param q_sf Flow variable defined on a single computational sub-domain + !! @param q_root_sf Flow variable defined on the entire computational domain impure subroutine s_mpi_defragment_1d_flow_variable(q_sf, q_root_sf) real(wp), dimension(0:m), intent(in) :: q_sf @@ -294,9 +317,9 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors + ! Gathering the sub-domain flow variable data from all the processes and putting it back together for the entire ! computational domain on the process with rank 0 - call MPI_GATHERV(q_sf(0), m + 1, mpi_p, q_root_sf(0), recvcounts, displs, mpi_p, 0, MPI_COMM_WORLD, ierr) #endif diff --git a/src/post_process/m_start_up.fpp b/src/post_process/m_start_up.fpp index c65e4cf7cf..7efb841780 100644 --- a/src/post_process/m_start_up.fpp +++ b/src/post_process/m_start_up.fpp @@ -8,6 +8,8 @@ module m_start_up + ! 
Dependencies + use, intrinsic :: iso_c_binding use m_derived_types @@ -55,9 +57,15 @@ contains impure subroutine s_read_input_file character(LEN=name_len) :: file_loc - logical :: file_check - integer :: iostatus - character(len=1000) :: line + + !> Generic logical used for the purpose of asserting whether a file is or is not present in the designated location + logical :: file_check + integer :: iostatus + !! Integer to check iostat of file read + + character(len=1000) :: line + + ! Namelist for all of the parameters to be inputted by the user namelist /user_inputs/ case_dir, m, n, p, t_step_start, t_step_stop, t_step_save, model_eqns, num_fluids, mpp_lim, & & weno_order, bc_x, bc_y, bc_z, fluid_pp, bub_pp, format, precision, output_partial_domain, x_output, y_output, & @@ -70,11 +78,14 @@ contains & bubbles_lagrange, sim_data, hyperelasticity, Bx0, relativity, cont_damage, hyper_cleaning, num_bc_patches, igr, & & igr_order, down_sample, recon_type, muscl_order, lag_header, lag_txt_wrt, lag_db_wrt, lag_id_wrt, lag_pos_wrt, & & lag_pos_prev_wrt, lag_vel_wrt, lag_rad_wrt, lag_rvel_wrt, lag_r0_wrt, lag_rmax_wrt, lag_rmin_wrt, lag_dphidt_wrt, & - & lag_pres_wrt, lag_mv_wrt, lag_mg_wrt, lag_betaT_wrt, lag_betaC_wrt, alpha_rho_e_wrt, ib_state_wrt + & lag_pres_wrt, lag_mv_wrt, lag_mg_wrt, lag_betaT_wrt, lag_betaC_wrt, alpha_rho_e_wrt, ib_state_wrt, & + & particles_lagrange, particle_pp + ! Inquiring the status of the post_process.inp file file_loc = 'post_process.inp' inquire (FILE=trim(file_loc), EXIST=file_check) + ! Checking whether the input file is there. If it is, the input file is read. If not, the program is terminated. if (file_check) then open (1, FILE=trim(file_loc), form='formatted', STATUS='old', ACTION='read') read (1, NML=user_inputs, iostat=iostatus) @@ -96,6 +107,7 @@ contains p = int((p + 1)/3) - 1 end if + ! 
Store m,n,p into global m,n,p m_glb = m n_glb = n p_glb = p @@ -120,12 +132,15 @@ contains character(LEN=len_trim(case_dir)) :: file_loc logical :: dir_check + ! Checking the existence of the case folder + case_dir = adjustl(case_dir) file_loc = trim(case_dir) // '/.' call my_inquire(file_loc, dir_check) + ! Constraint on the location of the case directory if (dir_check .neqv. .true.) then call s_mpi_abort('Unsupported choice for the value of ' // 'case_dir. Exiting.') end if @@ -135,7 +150,7 @@ contains end subroutine s_check_input_file - !> Load grid and conservative data for a time step, fill ghost-cell buffers, and convert to primitive variables. + !> @brief Load grid and conservative data for a time step, fill ghost-cell buffers, and convert to primitive variables. impure subroutine s_perform_time_step(t_step) integer, intent(inout) :: t_step @@ -152,20 +167,24 @@ contains end if end if + ! Populating the grid and conservative variables call s_read_data_files(t_step) + ! Populating the buffer regions of the grid and conservative variables if (buff_size > 0) then call s_populate_grid_variables_buffers() call s_populate_variables_buffers(bc_type, q_cons_vf) end if + ! Initialize the Temperature cache. if (chemistry) call s_compute_q_T_sf(q_T_sf, q_cons_vf, idwbuff) + ! Converting the conservative variables to the primitive ones call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwbuff) end subroutine s_perform_time_step - !> Derive requested flow quantities from primitive variables and write them to the formatted database files. + !> @brief Derive requested flow quantities from primitive variables and write them to the formatted database files. impure subroutine s_save_data(t_step, varname, pres, c, H) integer, intent(inout) :: t_step @@ -200,6 +219,7 @@ contains z_end = offset_z%end + p end if + ! Opening a new formatted database file call s_open_formatted_database_file(t_step) if (sim_data .and. 
proc_rank == 0) then @@ -212,20 +232,25 @@ contains call s_write_energy_data_file(q_prim_vf, q_cons_vf) end if + ! Adding the grid to the formatted database file call s_write_grid_to_formatted_database_file(t_step) + ! Computing centered finite-difference coefficients in x-direction if (omega_wrt(2) .or. omega_wrt(3) .or. qm_wrt .or. liutex_wrt .or. schlieren_wrt) then call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x, buff_size, fd_number, fd_order, offset_x) end if + ! Computing centered finite-difference coefficients in y-direction if (omega_wrt(1) .or. omega_wrt(3) .or. qm_wrt .or. liutex_wrt .or. (n > 0 .and. schlieren_wrt)) then call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y, buff_size, fd_number, fd_order, offset_y) end if + ! Computing centered finite-difference coefficients in z-direction if (omega_wrt(1) .or. omega_wrt(2) .or. qm_wrt .or. liutex_wrt .or. (p > 0 .and. schlieren_wrt)) then call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z, buff_size, fd_number, fd_order, offset_z) end if + ! Adding the partial densities to the formatted database file if ((model_eqns == 2) .or. (model_eqns == 3) .or. (model_eqns == 4)) then do i = 1, num_fluids if (alpha_rho_wrt(i) .or. (cons_vars_wrt .or. prim_vars_wrt)) then @@ -242,6 +267,7 @@ contains end do end if + ! Adding the density to the formatted database file if ((rho_wrt .or. (model_eqns == 1 .and. (cons_vars_wrt .or. prim_vars_wrt))) .and. (.not. relativity)) then q_sf(:,:,:) = rho_sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) write (varname, '(A)') 'rho' @@ -267,6 +293,7 @@ contains varname(:) = ' ' end if + ! Adding the momentum to the formatted database file do i = 1, E_idx - mom_idx%beg if (mom_wrt(i) .or. cons_vars_wrt) then q_sf(:,:,:) = q_cons_vf(i + cont_idx%end)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) @@ -277,6 +304,7 @@ contains end if end do + ! Adding the velocity to the formatted database file do i = 1, E_idx - mom_idx%beg if (vel_wrt(i) .or. 
prim_vars_wrt) then q_sf(:,:,:) = q_prim_vf(i + cont_idx%end)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) @@ -287,6 +315,7 @@ contains end if end do + ! Adding the species' concentrations to the formatted database file if (chemistry) then do i = 1, num_species if (chem_wrt_Y(i) .or. prim_vars_wrt) then @@ -307,6 +336,7 @@ contains end if end if + ! Adding the flux limiter function to the formatted database file do i = 1, E_idx - mom_idx%beg if (flux_wrt(i)) then call s_derive_flux_limiter(i, q_prim_vf, q_sf) @@ -318,6 +348,7 @@ contains end if end do + ! Adding the energy to the formatted database file if (E_wrt .or. cons_vars_wrt) then q_sf(:,:,:) = q_cons_vf(E_idx)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) write (varname, '(A)') 'E' @@ -326,6 +357,7 @@ contains varname(:) = ' ' end if + ! Adding the individual energies to the formatted database file if (model_eqns == 3) then do i = 1, num_fluids if (alpha_rho_e_wrt(i) .or. cons_vars_wrt) then @@ -338,6 +370,7 @@ contains end do end if + ! Adding Energy cascade FFT if (fft_wrt) then do l = 0, p do k = 0, n @@ -443,6 +476,7 @@ contains end do end if + ! Adding the magnetic field to the formatted database file if (mhd .and. prim_vars_wrt) then do i = B_idx%beg, B_idx%end q_sf(:,:,:) = q_prim_vf(i)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) @@ -470,6 +504,7 @@ contains end do end if + ! Adding the elastic shear stresses to the formatted database file if (elasticity) then do i = 1, stress_idx%end - stress_idx%beg + 1 if (prim_vars_wrt) then @@ -508,6 +543,7 @@ contains varname(:) = ' ' end if + ! Adding the pressure to the formatted database file if (pres_wrt .or. prim_vars_wrt) then q_sf(:,:,:) = q_prim_vf(E_idx)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) write (varname, '(A)') 'pres' @@ -516,6 +552,7 @@ contains varname(:) = ' ' end if + ! Adding the volume fraction(s) to the formatted database file if (((model_eqns == 2) .and. (bubbles_euler .neqv. .true.)) .or. 
(model_eqns == 3)) then do i = 1, num_fluids - 1 if (alpha_wrt(i) .or. (cons_vars_wrt .or. prim_vars_wrt)) then @@ -549,6 +586,7 @@ contains end if end if + ! Adding specific heat ratio function to formatted database file if (gamma_wrt .or. (model_eqns == 1 .and. (cons_vars_wrt .or. prim_vars_wrt))) then q_sf(:,:,:) = gamma_sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) write (varname, '(A)') 'gamma' @@ -557,6 +595,7 @@ contains varname(:) = ' ' end if + ! Adding the specific heat ratio to the formatted database file if (heat_ratio_wrt) then call s_derive_specific_heat_ratio(q_sf) @@ -566,6 +605,7 @@ contains varname(:) = ' ' end if + ! Adding liquid stiffness function to formatted database file if (pi_inf_wrt .or. (model_eqns == 1 .and. (cons_vars_wrt .or. prim_vars_wrt))) then q_sf(:,:,:) = pi_inf_sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) write (varname, '(A)') 'pi_inf' @@ -574,6 +614,7 @@ contains varname(:) = ' ' end if + ! Adding the liquid stiffness to the formatted database file if (pres_inf_wrt) then call s_derive_liquid_stiffness(q_sf) @@ -583,6 +624,7 @@ contains varname(:) = ' ' end if + ! Adding the sound speed to the formatted database file if (c_wrt) then do k = -offset_z%beg, p + offset_z%end do j = -offset_y%beg, n + offset_y%end @@ -609,6 +651,7 @@ contains varname(:) = ' ' end if + ! Adding the vorticity to the formatted database file do i = 1, 3 if (omega_wrt(i)) then call s_derive_vorticity_component(i, q_prim_vf, q_sf) @@ -627,6 +670,7 @@ contains call s_write_variable_to_formatted_database_file(varname, t_step) end if + ! Adding Q_M to the formatted database file if (p > 0 .and. qm_wrt) then call s_derive_qm(q_prim_vf, q_sf) @@ -636,9 +680,12 @@ contains varname(:) = ' ' end if + ! Adding Liutex magnitude to the formatted database file if (liutex_wrt) then + ! Compute Liutex vector and its magnitude call s_derive_liutex(q_prim_vf, liutex_mag, liutex_axis) + ! 
Liutex magnitude q_sf = liutex_mag write (varname, '(A)') 'liutex_mag' @@ -646,6 +693,7 @@ contains varname(:) = ' ' + ! Liutex axis do i = 1, 3 q_sf = liutex_axis(:,:,:,i) @@ -656,6 +704,7 @@ contains end do end if + ! Adding numerical Schlieren function to formatted database file if (schlieren_wrt) then call s_derive_numerical_schlieren_function(q_cons_vf, q_sf) @@ -665,6 +714,7 @@ contains varname(:) = ' ' end if + ! Adding the color function to formatted database file if (cf_wrt) then q_sf(:,:,:) = q_cons_vf(c_idx)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) write (varname, '(A,I0)') 'color_function' @@ -672,6 +722,7 @@ contains varname(:) = ' ' end if + ! Adding the volume fraction(s) to the formatted database file if (bubbles_euler) then do i = adv_idx%beg, adv_idx%end q_sf(:,:,:) = q_cons_vf(i)%sf(x_beg:x_end,y_beg:y_end,z_beg:z_end) @@ -681,6 +732,7 @@ contains end do end if + ! Adding the bubble variables to the formatted database file if (bubbles_euler) then ! nR do i = 1, nb @@ -724,8 +776,9 @@ contains end if end if - if (bubbles_lagrange) then - ! Void fraction field + ! Adding the lagrangian subgrid variables to the formatted database file + if (bubbles_lagrange .or. particles_lagrange) then + !! Void fraction field q_sf(:,:,:) = 1._wp - q_cons_vf(beta_idx)%sf(-offset_x%beg:m + offset_x%end,-offset_y%beg:n + offset_y%end, & & -offset_z%beg:p + offset_z%end) write (varname, '(A)') 'voidFraction' @@ -741,11 +794,12 @@ contains call s_close_energy_data_file() end if + ! Closing the formatted database file call s_close_formatted_database_file() end subroutine s_save_data - !> Transpose 3-D complex data from x-pencil to y-pencil layout via MPI_Alltoall. + !> @brief Transpose 3-D complex data from x-pencil to y-pencil layout via MPI_Alltoall. 
subroutine s_mpi_transpose_x2y complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:) @@ -787,7 +841,7 @@ contains end subroutine s_mpi_transpose_x2y - !> Transpose 3-D complex data from y-pencil to z-pencil layout via MPI_Alltoall. + !> @brief Transpose 3-D complex data from y-pencil to z-pencil layout via MPI_Alltoall. subroutine s_mpi_transpose_y2z complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:) @@ -830,15 +884,19 @@ contains end subroutine s_mpi_transpose_y2z - !> Initialize all post-process sub-modules, set up I/O pointers, and prepare FFTW plans and MPI communicators. + !> @brief Initialize all post-process sub-modules, set up I/O pointers, and prepare FFTW plans and MPI communicators. impure subroutine s_initialize_modules + ! Computation of parameters, allocation procedures, and/or any other tasks needed to properly setup the modules integer :: size_n(1), inembed(1), onembed(1) call s_initialize_global_parameters_module() if (bubbles_euler .or. bubbles_lagrange) then call s_initialize_bubbles_model() end if + if (particles_lagrange) then + call s_initialize_particles_model() + end if if (num_procs > 1) then call s_initialize_mpi_proxy_module() call s_initialize_mpi_common_module() @@ -849,6 +907,7 @@ contains call s_initialize_derived_variables_module() call s_initialize_data_output_module() + ! Associate pointers for serial or parallel I/O if (parallel_io .neqv. .true.) then s_read_data_files => s_read_serial_data_files else @@ -919,7 +978,7 @@ contains end subroutine s_initialize_modules - !> Perform a distributed forward 3-D FFT using pencil decomposition with FFTW and MPI transposes. + !> @brief Perform a distributed forward 3-D FFT using pencil decomposition with FFTW and MPI transposes. subroutine s_mpi_FFT_fwd integer :: j, k, l @@ -986,13 +1045,17 @@ contains end subroutine s_mpi_FFT_fwd - !> Set up the MPI environment, read and broadcast user inputs, and decompose the computational domain. 
+ !> @brief Set up the MPI environment, read and broadcast user inputs, and decompose the computational domain. impure subroutine s_initialize_mpi_domain num_dims = 1 + min(1, n) + min(1, p) + ! Initialization of the MPI environment call s_mpi_initialize() + ! Processor with rank 0 assigns default user input values prior to reading those in from the input file. Next, the user + ! inputs are read in and their consistency is checked. The detection of any inconsistencies automatically leads to the + ! termination of the post-process. if (proc_rank == 0) then call s_assign_default_values_to_user_inputs() call s_read_input_file() @@ -1001,6 +1064,8 @@ contains print '(" Post-processing a ", I0, "x", I0, "x", I0, " case on ", I0, " rank(s)")', m, n, p, num_procs end if + ! Broadcasting the user inputs to all of the processors and performing the parallel computational domain decomposition. + ! Neither procedure has to be carried out if the post-process is in fact not truly executed in parallel. call s_mpi_bcast_user_inputs() call s_initialize_parallel_io() call s_mpi_decompose_computational_domain() @@ -1008,11 +1073,14 @@ contains end subroutine s_initialize_mpi_domain - !> Destroy FFTW plans, free MPI communicators, and finalize all post-process sub-modules. + !> @brief Destroy FFTW plans, free MPI communicators, and finalize all post-process sub-modules. impure subroutine s_finalize_modules + ! Disassociate pointers for serial and parallel I/O s_read_data_files => null() + ! if (sim_data .and. proc_rank == 0) then call s_close_intf_data_file() call s_close_energy_data_file() end if + if (fft_wrt) then if (c_associated(fwd_plan_x)) call fftw_destroy_plan(fwd_plan_x) if (c_associated(fwd_plan_y)) call fftw_destroy_plan(fwd_plan_y) @@ -1035,6 +1103,7 @@ contains end if #endif + ! 
Deallocation procedures for the modules call s_finalize_data_output_module() call s_finalize_derived_variables_module() call s_finalize_data_input_module() @@ -1045,6 +1114,7 @@ contains end if call s_finalize_global_parameters_module() + ! Finalizing the MPI environment call s_mpi_finalize() end subroutine s_finalize_modules diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp index 077262aece..ddfd3e7724 100644 --- a/src/pre_process/m_global_parameters.fpp +++ b/src/pre_process/m_global_parameters.fpp @@ -41,14 +41,19 @@ module m_global_parameters integer :: num_vels !< Number of velocity components (different from num_dims for mhd) logical :: cyl_coord integer :: grid_geometry !< Cylindrical coordinates (either axisymmetric or full 3D) + !> Locations of cell-centers (cc) in x-, y- and z-directions, respectively real(wp), allocatable, dimension(:) :: x_cc, y_cc, z_cc + !> Locations of cell-boundaries (cb) in x-, y- and z-directions, respectively real(wp), allocatable, dimension(:) :: x_cb, y_cb, z_cb real(wp) :: dx, dy, dz !< Minimum cell-widths in the x-, y- and z-coordinate directions type(bounds_info) :: x_domain, y_domain, z_domain !< Locations of the domain bounds in the x-, y- and z-coordinate directions logical :: stretch_x, stretch_y, stretch_z !< Grid stretching flags for the x-, y- and z-coordinate directions - ! Grid stretching: a_x/a_y/a_z = rate, x_a/y_a/z_a = location + + ! Parameters of the grid stretching function for the x-, y- and z-coordinate directions. The "a" parameters are a measure of the + ! rate at which the grid is stretched while the remaining parameters are indicative of the location on the grid at which the + ! stretching begins. 
real(wp) :: a_x, a_y, a_z integer :: loops_x, loops_y, loops_z real(wp) :: x_a, y_a, z_a @@ -81,6 +86,7 @@ module m_global_parameters logical :: igr !< Use information geometric regularization integer :: igr_order !< IGR reconstruction order logical, parameter :: chemistry = .${chemistry}$. !< Chemistry modeling + ! Annotations of the structure, i.e. the organization, of the state vectors type(int_bounds_info) :: cont_idx !< Indexes of first & last continuity eqns. type(int_bounds_info) :: mom_idx !< Indexes of first & last momentum eqns. @@ -99,42 +105,68 @@ module m_global_parameters type(int_bounds_info) :: species_idx !< Indexes of first & last concentration eqns. integer :: damage_idx !< Index of damage state variable (D) for continuum damage model integer :: psi_idx !< Index of hyperbolic cleaning state variable for MHD + ! Cell Indices for the (local) interior points (O-m, O-n, 0-p). Stands for "InDices With BUFFer". type(int_bounds_info) :: idwint(1:3) - ! Cell indices (InDices With BUFFer): includes buffer except in pre_process - type(int_bounds_info) :: idwbuff(1:3) - type(int_bounds_info) :: bc_x, bc_y, bc_z !< Boundary conditions in the x-, y- and z-coordinate directions - integer :: shear_num !< Number of shear stress components - integer, dimension(3) :: shear_indices !< Indices of the stress components that represent shear stress - integer :: shear_BC_flip_num !< Number of shear stress components to reflect for boundary conditions - integer, dimension(3, 2) :: shear_BC_flip_indices !< Shear stress BC reflection indices (1:3, 1:shear_BC_flip_num) - logical :: parallel_io !< Format of the data files - logical :: file_per_process !< type of data output - integer :: precision !< Precision of output files - logical :: down_sample !< Down-sample the output data - logical :: mixlayer_vel_profile !< Set hyperbolic tangent streamwise velocity profile - real(wp) :: mixlayer_vel_coef !< Coefficient for the hyperbolic tangent streamwise velocity profile - 
logical :: mixlayer_perturb !< Superimpose instability waves to surrounding fluid flow - integer :: mixlayer_perturb_nk !< Number of Fourier modes for perturbation with mixlayer_perturb flag - real(wp) :: mixlayer_perturb_k0 !< Peak wavenumber for mixlayer perturbation (default: most unstable mode) + ! Cell Indices for the entire (local) domain. In simulation and post_process, this includes the buffer region. idwbuff and + ! idwint are the same otherwise. Stands for "InDices With BUFFer". + type(int_bounds_info) :: idwbuff(1:3) + + !> The order of the finite-difference (fd) approximations of the first-order derivatives that need to be evaluated when the CoM + !! or flow probe data files are to be written at each time step + integer :: fd_order + + !> The finite-difference number is given by MAX(1, fd_order/2). Essentially, it is a measure of the half-size of the + !! finite-difference stencil for the selected order of accuracy. + integer :: fd_number + + !> @name lagrangian subgrid bubble parameters + !> @{! + type(bubbles_lagrange_parameters) :: lag_params !< Lagrange bubbles' parameters + !> @} + + type(int_bounds_info) :: bc_x, bc_y, bc_z !< Boundary conditions in the x-, y- and z-coordinate directions + integer :: shear_num !! Number of shear stress components + integer, dimension(3) :: shear_indices !< Indices of the stress components that represent shear stress + integer :: shear_BC_flip_num !< Number of shear stress components to reflect for boundary conditions + !> Indices of shear stress components to reflect for boundary conditions. Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, + !! 
[indices]) + integer, dimension(3, 2) :: shear_BC_flip_indices + logical :: parallel_io !< Format of the data files + logical :: file_per_process !< type of data output + integer :: precision !< Precision of output files + logical :: down_sample !< Down-sample the output data + logical :: mixlayer_vel_profile !< Set hyperbolic tangent streamwise velocity profile + real(wp) :: mixlayer_vel_coef !< Coefficient for the hyperbolic tangent streamwise velocity profile + logical :: mixlayer_perturb !< Superimpose instability waves to surrounding fluid flow + integer :: mixlayer_perturb_nk !< Number of Fourier modes for perturbation with mixlayer_perturb flag + !> Peak wavenumber of prescribed energy spectra with mixlayer_perturb flag Default value (k0 = 0.4446) is most unstable mode + !! obtained from linear stability analysis See Michalke (1964, JFM) for details + real(wp) :: mixlayer_perturb_k0 !< Peak wavenumber for mixlayer perturbation (default: most unstable mode) logical :: simplex_perturb type(simplex_noise_params) :: simplex_params - real(wp) :: pi_fac !< Factor for artificial pi_inf + real(wp) :: pi_fac !< Factor for artificial pi_inf logical :: viscous logical :: bubbles_lagrange + logical :: particles_lagrange ! 
Perturb density of surrounding air so as to break symmetry of grid - logical :: perturb_flow - integer :: perturb_flow_fluid !< Fluid to be perturbed with perturb_flow flag - real(wp) :: perturb_flow_mag !< Magnitude of perturbation with perturb_flow flag - logical :: perturb_sph - integer :: perturb_sph_fluid !< Fluid to be perturbed with perturb_sph flag - real(wp), dimension(num_fluids_max) :: fluid_rho - logical :: elliptic_smoothing - integer :: elliptic_smoothing_iters - integer, allocatable, dimension(:) :: proc_coords !< Processor coordinates in MPI_CART_COMM - integer, allocatable, dimension(:) :: start_idx !< Starting cell-center index of local processor in global grid + logical :: perturb_flow + integer :: perturb_flow_fluid !< Fluid to be perturbed with perturb_flow flag + real(wp) :: perturb_flow_mag !< Magnitude of perturbation with perturb_flow flag + logical :: perturb_sph + integer :: perturb_sph_fluid !< Fluid to be perturbed with perturb_sph flag + real(wp), dimension(num_fluids_max) :: fluid_rho + logical :: elliptic_smoothing + integer :: elliptic_smoothing_iters + integer, allocatable, dimension(:) :: proc_coords !< Processor coordinates in MPI_CART_COMM + type(int_bounds_info), dimension(3) :: nidx + integer, allocatable, dimension(:,:,:) :: neighbor_ranks + !! Neighbor ranks + + integer, allocatable, dimension(:) :: start_idx !< Starting cell-center index of local processor in global grid + #ifdef MFC_MPI type(mpi_io_var), public :: MPI_IO_DATA character(LEN=name_len) :: mpiiofs @@ -142,18 +174,28 @@ module m_global_parameters #endif ! 
Initial Condition Parameters - integer :: num_patches !< Number of patches composing initial condition - type(ic_patch_parameters), dimension(num_patches_max) :: patch_icpp !< IC patch parameters (max: num_patches_max) - integer :: num_bc_patches !< Number of boundary condition patches - logical :: bc_io !< whether or not to save BC data - type(bc_patch_parameters), dimension(num_bc_patches_max) :: patch_bc !< Boundary condition patch parameters + integer :: num_patches !< Number of patches composing initial condition + + !> Database of the initial condition patch parameters (icpp) for each of the patches employed in the configuration of the + !! initial condition. Note that the maximum allowable number of patches, num_patches_max, may be changed in the module + !! m_derived_types.f90. + type(ic_patch_parameters), dimension(num_patches_max) :: patch_icpp !< IC patch parameters (max: num_patches_max) + integer :: num_bc_patches !< Number of boundary condition patches + logical :: bc_io !< whether or not to save BC data + !> Boundary condition patch parameters Database of the boundary condition patch parameters for each of the patches employed in + !! the configuration of the boundary conditions + type(bc_patch_parameters), dimension(num_bc_patches_max) :: patch_bc ! Fluids Physical Parameters + !> Database of the physical parameters of each of the fluids that is present in the flow. These include the stiffened gas + !! equation of state parameters, and the Reynolds numbers. type(physical_parameters), dimension(num_fluids_max) :: fluid_pp !< Stiffened gas EOS parameters and Reynolds numbers per fluid + ! 
Subgrid Bubble Parameters - type(subgrid_bubble_physical_parameters) :: bub_pp - real(wp) :: rhoref, pref !< Reference parameters for Tait EOS - type(chemistry_parameters) :: chem_params + type(subgrid_bubble_physical_parameters) :: bub_pp + type(subgrid_particle_physical_parameters) :: particle_pp + real(wp) :: rhoref, pref !< Reference parameters for Tait EOS + type(chemistry_parameters) :: chem_params !> @name Bubble modeling !> @{ integer :: nb @@ -173,6 +215,9 @@ module m_global_parameters integer :: Np type(ib_patch_parameters), dimension(num_patches_max) :: patch_ib !< Immersed boundary patch parameters type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l + !! Database of the immersed boundary patch parameters for each of the patches employed in the configuration of the initial + !! condition. Note that the maximum allowable number of patches, num_patches_max, may be changed in the module + !! m_derived_types.f90. !> @} !> @name Non-polytropic bubble gas compression @@ -187,6 +232,9 @@ module m_global_parameters real(wp), dimension(:), allocatable :: pb0, mass_g0, mass_v0, Pe_T, k_v, k_g real(wp), dimension(:), allocatable :: Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN real(wp) :: R0ref, p0ref, rho0ref, T0ref, ss, pv, vd, mu_l, mu_v, mu_g, gam_v, gam_g, M_v, M_g, cp_v, cp_g, R_v, R_g + + ! 
Solid particle physical parameters + real(wp) :: cp_particle, rho0ref_particle !> @} !> @name Surface Tension Modeling @@ -210,10 +258,20 @@ module m_global_parameters integer, allocatable, dimension(:,:,:) :: logic_grid type(pres_field) :: pb type(pres_field) :: mv - real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) - integer :: buff_size !< Number of ghost cells for boundary condition storage - logical :: fft_wrt - logical :: dummy !< AMDFlang workaround for case-optimization + GPU-kernel bug + real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) + + !> The number of cells that are necessary to be able to store enough boundary conditions data to march the solution in the + !! physical computational domain to the next time-step. + integer :: buff_size + integer, allocatable :: beta_vars(:) !< Indices of variables to communicate for bubble/particle coupling + logical :: fft_wrt + !> AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional + !! is false + logical :: dummy + + ! Variables for hardcoded initial conditions that are read from input files + character(LEN=2*path_len) :: interface_file + real(wp) :: normFac, normMag, g0_ic, p0_ic contains @@ -221,7 +279,8 @@ contains !! parameters once they are read from the input file. impure subroutine s_assign_default_values_to_user_inputs - integer :: i !< Generic loop operator + integer :: i + ! Logistics case_dir = '.' @@ -322,6 +381,8 @@ contains elliptic_smoothing_iters = dflt_int elliptic_smoothing = .false. + particles_lagrange = .false. + fft_wrt = .false. dummy = .false. @@ -338,6 +399,27 @@ contains ! Initial condition parameters num_patches = dflt_int + fd_order = dflt_int + lag_params%cluster_type = dflt_int + lag_params%pressure_corrector = .false. + lag_params%smooth_type = dflt_int + lag_params%heatTransfer_model = .false. + lag_params%massTransfer_model = .false. + lag_params%write_bubbles = .false. 
+ lag_params%write_bubbles_stats = .false. + lag_params%nBubs_glb = dflt_int + lag_params%vel_model = dflt_int + lag_params%drag_model = dflt_int + lag_params%epsilonb = 1._wp + lag_params%charwidth = dflt_real + lag_params%nParticles_glb = dflt_int + lag_params%qs_drag_model = dflt_int + lag_params%stokes_drag = dflt_int + lag_params%added_mass_model = dflt_int + lag_params%interpolation_order = dflt_int + lag_params%charNz = dflt_int + lag_params%valmaxvoid = dflt_real + do i = 1, num_patches_max patch_icpp(i)%geometry = dflt_int patch_icpp(i)%model_scale(:) = 1._wp @@ -540,6 +622,10 @@ contains bub_pp%R_v = dflt_real; R_v = dflt_real bub_pp%R_g = dflt_real; R_g = dflt_real + ! Subgrid particle parameters + particle_pp%rho0ref_particle = dflt_real + particle_pp%cp_particle = dflt_real + end subroutine s_assign_default_values_to_user_inputs !> Computation of parameters, allocation procedures, and/or any other tasks needed to properly setup the module @@ -563,11 +649,11 @@ contains ! Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the gamma/pi_inf model - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = cont_idx%beg - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each direction mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation adv_idx%beg = E_idx + 1 adv_idx%end = adv_idx%beg + 1 gamma_idx = adv_idx%beg @@ -578,11 +664,11 @@ contains else if (model_eqns == 2) then ! Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the volume fraction model - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = num_fluids - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! 
one momentum equation in each direction mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation if (igr) then ! Volume fractions are stored in the indices immediately following the energy equation. IGR tracks a total of (N-1) @@ -678,11 +764,11 @@ contains else if (model_eqns == 3) then ! Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the volume fraction model - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = num_fluids - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each direction mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation adv_idx%beg = E_idx + 1 adv_idx%end = E_idx + num_fluids internalEnergies_idx%beg = adv_idx%end + 1 @@ -797,6 +883,14 @@ contains end if end if + if (bubbles_lagrange) then + allocate (beta_vars(1:3)) + beta_vars(1:3) = [1, 2, 5] + else if (particles_lagrange) then + allocate (beta_vars(1:8)) + beta_vars(1:8) = [1, 2, 3, 4, 5, 6, 7, 8] + end if + if (chemistry) then species_idx%beg = sys_size + 1 species_idx%end = sys_size + num_species @@ -820,8 +914,10 @@ contains chemxb = species_idx%beg chemxe = species_idx%end + if (bubbles_lagrange .or. particles_lagrange) fd_number = max(1, fd_order/2) + call s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, igr_order, buff_size, idwint, idwbuff, viscous, & - & bubbles_lagrange, m, n, p, num_dims, igr, ib) + & bubbles_lagrange, particles_lagrange, m, n, p, num_dims, igr, ib, fd_number) #ifdef MFC_MPI if (qbmm .and. .not. polytropic) then @@ -860,7 +956,7 @@ contains grid_geometry = 1 else if (cyl_coord .and. p == 0) then ! Axisymmetric cylindrical grid grid_geometry = 2 - else ! 
Fully 3D cylindrical grid + else grid_geometry = 3 end if @@ -870,7 +966,7 @@ contains end subroutine s_initialize_global_parameters_module - !> Configure MPI parallel I/O settings and allocate processor coordinate arrays. + !> @brief Configures MPI parallel I/O settings and allocates processor coordinate arrays. impure subroutine s_initialize_parallel_io #ifdef MFC_MPI @@ -904,7 +1000,7 @@ contains end subroutine s_initialize_parallel_io - !> Deallocate all global grid, index, and equation-of-state parameter arrays. + !> @brief Deallocates all global grid, index, and equation-of-state parameter arrays. impure subroutine s_finalize_global_parameters_module integer :: i @@ -934,6 +1030,9 @@ contains end if #endif + if (allocated(neighbor_ranks)) deallocate (neighbor_ranks) + if (allocated(beta_vars)) deallocate (beta_vars) + end subroutine s_finalize_global_parameters_module end module m_global_parameters diff --git a/src/pre_process/m_mpi_proxy.fpp b/src/pre_process/m_mpi_proxy.fpp index e9545ce865..55546aaa9b 100644 --- a/src/pre_process/m_mpi_proxy.fpp +++ b/src/pre_process/m_mpi_proxy.fpp @@ -22,9 +22,12 @@ contains impure subroutine s_mpi_bcast_user_inputs #ifdef MFC_MPI + ! Generic loop iterator integer :: i, j + ! Generic flag used to identify and report MPI errors integer :: ierr + ! 
Logistics call MPI_BCAST(case_dir, len(case_dir), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) #:for VAR in ['t_step_old', 't_step_start', 'm', 'n', 'p', 'm_glb', 'n_glb', 'p_glb', & @@ -33,7 +36,7 @@ contains & 'perturb_sph_fluid', 'num_patches', 'thermal', 'nb', 'dist_type',& & 'relax_model', 'num_ibs', 'n_start', 'elliptic_smoothing_iters', & & 'num_bc_patches', 'mixlayer_perturb_nk', 'recon_type', & - & 'muscl_order', 'igr_order' ] + & 'muscl_order', 'igr_order', 'fd_order'] call MPI_BCAST(${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -45,7 +48,8 @@ contains & 'cfl_const_dt', 'cfl_dt', 'surface_tension', & & 'hyperelasticity', 'pre_stress', 'elliptic_smoothing', 'viscous',& & 'bubbles_lagrange', 'bc_io', 'mhd', 'relativity', 'cont_damage', & - & 'igr', 'down_sample', 'simplex_perturb','fft_wrt', 'hyper_cleaning' ] + & 'igr', 'down_sample', 'simplex_perturb','fft_wrt', 'hyper_cleaning',& + & 'particles_lagrange' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor call MPI_BCAST(fluid_rho(1), num_fluids_max, mpi_p, 0, MPI_COMM_WORLD, ierr) @@ -133,6 +137,22 @@ contains #:endfor end do + ! Variables from input files for hardcoded patches + call MPI_BCAST(interface_file, len(interface_file), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(normFac, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(normMag, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(g0_ic, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(p0_ic, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + + ! Subgrid bubble parameters + if (bubbles_euler .or. bubbles_lagrange) then + #:for VAR in [ 'R0ref','p0ref','rho0ref','T0ref', & + 'ss','pv','vd','mu_l','mu_v','mu_g','gam_v','gam_g', & + 'M_v','M_g','k_v','k_g','cp_v','cp_g','R_v','R_g'] + call MPI_BCAST(bub_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + ! 
Simplex noise and fluid physical parameters do i = 1, num_fluids_max #:for VAR in [ 'gamma','pi_inf', 'G', 'cv', 'qv', 'qvp' ] @@ -148,15 +168,6 @@ contains end do end do - ! Subgrid bubble parameters - if (bubbles_euler .or. bubbles_lagrange) then - #:for VAR in [ 'R0ref','p0ref','rho0ref','T0ref', & - 'ss','pv','vd','mu_l','mu_v','mu_g','gam_v','gam_g', & - 'M_v','M_g','k_v','k_g','cp_v','cp_g','R_v','R_g'] - call MPI_BCAST(bub_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) - #:endfor - end if - do i = 1, 3 call MPI_BCAST(simplex_params%perturb_vel(i), 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(simplex_params%perturb_vel_freq(i), 1, mpi_p, 0, MPI_COMM_WORLD, ierr) diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp index a3a7c51270..73ca46a7bc 100644 --- a/src/pre_process/m_start_up.fpp +++ b/src/pre_process/m_start_up.fpp @@ -43,12 +43,13 @@ module m_start_up abstract interface - !> Abstract interface for reading grid data files in serial or parallel. + !> @brief Abstract interface for reading grid data files in serial or parallel. impure subroutine s_read_abstract_grid_data_files end subroutine s_read_abstract_grid_data_files - !> Abstract interface for reading initial condition data files in serial or parallel. + !> @brief Abstract interface for reading initial condition data files in serial or parallel. + !! @param q_cons_vf Conservative variables impure subroutine s_read_abstract_ic_data_files(q_cons_vf_in) import :: scalar_field, integer_field, sys_size, pres_field @@ -59,7 +60,11 @@ module m_start_up end interface character(LEN=path_len + name_len) :: proc_rank_dir !< Location of the folder associated with the rank of the local processor + + !> Possible location of time-step folder containing preexisting grid and/or conservative variables data to be used as starting + !! 
point for pre-process character(LEN=path_len + 2*name_len), private :: t_step_dir !< Path to preexisting time-step folder for restart + procedure(s_read_abstract_grid_data_files), pointer :: s_read_grid_data_files => null() procedure(s_read_abstract_ic_data_files), pointer :: s_read_ic_data_files => null() @@ -70,9 +75,15 @@ contains impure subroutine s_read_input_file character(LEN=name_len) :: file_loc - logical :: file_check - integer :: iostatus - character(len=1000) :: line + + !> Generic logical used for the purpose of asserting whether a file is or is not present in the designated location + logical :: file_check + integer :: iostatus + !! Integer to check iostat of file read + + character(len=1000) :: line + + ! Namelist for all of the parameters to be inputted by the user namelist /user_inputs/ case_dir, old_grid, old_ic, t_step_old, t_step_start, m, n, p, x_domain, y_domain, z_domain, & & stretch_x, stretch_y, stretch_z, a_x, a_y, a_z, x_a, y_a, z_a, x_b, y_b, z_b, model_eqns, num_fluids, mpp_lim, & @@ -84,11 +95,14 @@ contains & ptgalpha_eps, ib, num_ibs, patch_ib, sigma, adv_n, cfl_adap_dt, cfl_const_dt, n_start, n_start_old, & & surface_tension, hyperelasticity, pre_stress, elliptic_smoothing, elliptic_smoothing_iters, viscous, & & bubbles_lagrange, num_bc_patches, patch_bc, Bx0, relativity, cont_damage, igr, igr_order, down_sample, recon_type, & - & muscl_order, hyper_cleaning, simplex_perturb, simplex_params, fft_wrt + & muscl_order, fft_wrt, fd_order, lag_params, simplex_perturb, simplex_params, interface_file, normFac, normMag, & + & g0_ic, p0_ic, hyper_cleaning, particles_lagrange, particle_pp + ! Inquiring the status of the pre_process.inp file file_loc = 'pre_process.inp' inquire (FILE=trim(file_loc), EXIST=file_check) + ! Checking whether the input file is there. If it is, the input file is read. If not, the program is terminated. 
if (file_check) then open (1, FILE=trim(file_loc), form='formatted', STATUS='old', ACTION='read') read (1, NML=user_inputs, iostat=iostatus) @@ -102,6 +116,7 @@ contains call s_update_cell_bounds(cells_bounds, m, n, p) + ! Store m,n,p into global m,n,p m_glb = m n_glb = n p_glb = p @@ -126,6 +141,8 @@ contains character(LEN=len_trim(case_dir)) :: file_loc logical :: dir_check + ! Checking the existence of the case folder + case_dir = adjustl(case_dir) file_loc = trim(case_dir) // '/.' @@ -141,6 +158,7 @@ contains call s_check_inputs_common() call s_check_inputs() + ! Check all the patch properties call s_check_patches() if (ib) call s_check_ib_patches() @@ -151,9 +169,16 @@ contains !! necessary global computational domain parameters. impure subroutine s_read_serial_grid_data_files + ! Generic string used to store the address of a particular file character(LEN=len_trim(case_dir) + 3*name_len) :: file_loc - logical :: dir_check - logical :: file_check + + ! Logical variable used to test the existence of folders + logical :: dir_check + + ! Generic logical used for the purpose of asserting whether a file is or is not present in the designated location + logical :: file_check + + ! Setting address of the local processor rank and time-step directory write (proc_rank_dir, '(A,I0)') '/p_all/p', proc_rank proc_rank_dir = trim(case_dir) // trim(proc_rank_dir) @@ -161,16 +186,22 @@ contains write (t_step_dir, '(A,I0)') '/', t_step_start t_step_dir = trim(proc_rank_dir) // trim(t_step_dir) + ! Inquiring as to the existence of the time-step directory file_loc = trim(t_step_dir) // '/.' call my_inquire(file_loc, dir_check) + ! If the time-step directory is missing, the pre-process exits if (dir_check .neqv. .true.) then call s_mpi_abort('Time-step folder ' // trim(t_step_dir) // ' is missing. Exiting.') end if + ! Reading the Grid Data File for the x-direction + + ! 
Checking whether x_cb.dat exists file_loc = trim(t_step_dir) // '/x_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_check) + ! If it exists, x_cb.dat is read if (file_check) then open (1, FILE=trim(file_loc), form='unformatted', STATUS='old', ACTION='read') read (1) x_cb(-1:m) @@ -179,18 +210,25 @@ contains call s_mpi_abort('File x_cb.dat is missing in ' // trim(t_step_dir) // '. Exiting.') end if + ! Computing cell-center locations x_cc(0:m) = (x_cb(0:m) + x_cb(-1:(m - 1)))/2._wp + ! Computing minimum cell-width dx = minval(x_cb(0:m) - x_cb(-1:m - 1)) if (num_procs > 1) call s_mpi_reduce_min(dx) + ! Setting locations of domain bounds x_domain%beg = x_cb(-1) x_domain%end = x_cb(m) + ! Reading the Grid Data File for the y-direction + if (n > 0) then + ! Checking whether y_cb.dat exists file_loc = trim(t_step_dir) // '/y_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_check) + ! If it exists, y_cb.dat is read if (file_check) then open (1, FILE=trim(file_loc), form='unformatted', STATUS='old', ACTION='read') read (1) y_cb(-1:n) @@ -199,18 +237,24 @@ contains call s_mpi_abort('File y_cb.dat is missing in ' // trim(t_step_dir) // '. Exiting.') end if + ! Computing cell-center locations y_cc(0:n) = (y_cb(0:n) + y_cb(-1:(n - 1)))/2._wp + ! Computing minimum cell-width dy = minval(y_cb(0:n) - y_cb(-1:n - 1)) if (num_procs > 1) call s_mpi_reduce_min(dy) + ! Setting locations of domain bounds y_domain%beg = y_cb(-1) y_domain%end = y_cb(n) + ! Reading the Grid Data File for the z-direction if (p > 0) then + ! Checking whether z_cb.dat exists file_loc = trim(t_step_dir) // '/z_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_check) + ! If it exists, z_cb.dat is read if (file_check) then open (1, FILE=trim(file_loc), form='unformatted', STATUS='old', ACTION='read') read (1) z_cb(-1:p) @@ -219,17 +263,23 @@ contains call s_mpi_abort('File z_cb.dat is missing in ' // trim(t_step_dir) // '. Exiting.') end if + ! 
Computing cell-center locations z_cc(0:p) = (z_cb(0:p) + z_cb(-1:(p - 1)))/2._wp + ! Computing minimum cell-width dz = minval(z_cb(0:p) - z_cb(-1:p - 1)) if (num_procs > 1) call s_mpi_reduce_min(dz) + ! Setting locations of domain bounds z_domain%beg = z_cb(-1) z_domain%end = z_cb(p) end if end if - ! Clean processor dir and create time-step dir (unless reading preexisting IC) + ! If only the preexisting grid data files are read in and there will not be any preexisting initial condition data files + ! imported, then the directory associated with the rank of the local processor may be cleaned to make room for the new + ! pre-process data. In addition, the time-step directory that will contain the new grid and initial condition data are also + ! generated. if (old_ic .neqv. .true.) then call s_delete_directory(trim(proc_rank_dir)) call s_create_directory(trim(proc_rank_dir) // '/0') @@ -241,15 +291,21 @@ contains !! coordinate directions and making sure that all of the cell-widths are positively valued impure subroutine s_check_grid_data_files + ! Cell-boundary Data Consistency Check in x-direction + if (any(x_cb(0:m) - x_cb(-1:m - 1) <= 0._wp)) then call s_mpi_abort('x_cb.dat in ' // trim(t_step_dir) // ' contains non-positive cell-spacings. Exiting.') end if + ! Cell-boundary Data Consistency Check in y-direction + if (n > 0) then if (any(y_cb(0:n) - y_cb(-1:n - 1) <= 0._wp)) then call s_mpi_abort('y_cb.dat in ' // trim(t_step_dir) // ' contains non-positive cell-spacings. ' // 'Exiting.') end if + ! Cell-boundary Data Consistency Check in z-direction + if (p > 0) then if (any(z_cb(0:p) - z_cb(-1:p - 1) <= 0._wp)) then call s_mpi_abort('z_cb.dat in ' // trim(t_step_dir) // ' contains non-positive cell-spacings' // ' .Exiting.') @@ -261,19 +317,29 @@ contains !> The goal of this subroutine is to read in any preexisting initial condition data files so that they may be used by the !! pre-process as a starting point in the creation of an all new initial condition. + !! 
@param q_cons_vf_in Conservative variables impure subroutine s_read_serial_ic_data_files(q_cons_vf_in) - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf_in - character(LEN=len_trim(case_dir) + 3*name_len) :: file_loc + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf_in + character(LEN=len_trim(case_dir) + 3*name_len) :: file_loc + ! Generic string used to store the address of a particular file + + !> Used to store the variable position, in character form, of the currently manipulated conservative variable file character(LEN=int(floor(log10(real(sys_size, wp)))) + 1) :: file_num - logical :: file_check - integer :: i, r + + !> Generic logical used for the purpose of asserting whether a file is or is not present in the designated location + logical :: file_check + integer :: i, r !< Generic loop iterator + + ! Reading the Conservative Variables Data Files do i = 1, sys_size + ! Checking whether data file associated with variable position of the currently manipulated conservative variable exists write (file_num, '(I0)') i file_loc = trim(t_step_dir) // '/q_cons_vf' // trim(file_num) // '.dat' inquire (FILE=trim(file_loc), EXIST=file_check) + ! If it exists, the data file is read if (file_check) then open (1, FILE=trim(file_loc), form='unformatted', STATUS='old', ACTION='read') read (1) q_cons_vf_in(i)%sf @@ -283,13 +349,17 @@ contains end if end do + ! Read bubble variables pb and mv for non-polytropic qbmm if (qbmm .and. .not. polytropic) then do i = 1, nb do r = 1, nnode + ! Checking whether data file associated with variable position of the currently manipulated bubble variable + ! exists write (file_num, '(I0)') sys_size + r + (i - 1)*nnode file_loc = trim(t_step_dir) // '/pb' // trim(file_num) // '.dat' inquire (FILE=trim(file_loc), EXIST=file_check) + ! 
If it exists, the data file is read if (file_check) then open (1, FILE=trim(file_loc), form='unformatted', STATUS='old', ACTION='read') read (1) pb%sf(:,:,:,r, i) @@ -302,10 +372,13 @@ contains do i = 1, nb do r = 1, nnode + ! Checking whether data file associated with variable position of the currently manipulated bubble variable + ! exists write (file_num, '(I0)') sys_size + r + (i - 1)*nnode file_loc = trim(t_step_dir) // '/mv' // trim(file_num) // '.dat' inquire (FILE=trim(file_loc), EXIST=file_check) + ! If it exists, the data file is read if (file_check) then open (1, FILE=trim(file_loc), form='unformatted', STATUS='old', ACTION='read') read (1) mv%sf(:,:,:,r, i) @@ -340,6 +413,7 @@ contains allocate (y_cb_glb(-1:n_glb)) allocate (z_cb_glb(-1:p_glb)) + ! Read in cell boundary locations in x-direction file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // 'x_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -352,14 +426,19 @@ contains call s_mpi_abort('File ' // trim(file_loc) // ' is missing. Exiting. ') end if + ! Assigning local cell boundary locations x_cb(-1:m) = x_cb_glb((start_idx(1) - 1):(start_idx(1) + m)) + ! Computing cell center locations x_cc(0:m) = (x_cb(0:m) + x_cb(-1:(m - 1)))/2._wp + ! Computing minimum cell width dx = minval(x_cb(0:m) - x_cb(-1:(m - 1))) if (num_procs > 1) call s_mpi_reduce_min(dx) + ! Setting locations of domain bounds x_domain%beg = x_cb(-1) x_domain%end = x_cb(m) if (n > 0) then + ! Read in cell boundary locations in y-direction file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // 'y_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -372,14 +451,19 @@ contains call s_mpi_abort('File ' // trim(file_loc) // ' is missing. Exiting. ') end if + ! Assigning local cell boundary locations y_cb(-1:n) = y_cb_glb((start_idx(2) - 1):(start_idx(2) + n)) + ! Computing cell center locations y_cc(0:n) = (y_cb(0:n) + y_cb(-1:(n - 1)))/2._wp + ! 
Computing minimum cell width dy = minval(y_cb(0:n) - y_cb(-1:(n - 1))) if (num_procs > 1) call s_mpi_reduce_min(dy) + ! Setting locations of domain bounds y_domain%beg = y_cb(-1) y_domain%end = y_cb(n) if (p > 0) then + ! Read in cell boundary locations in z-direction file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // 'z_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -392,10 +476,14 @@ contains call s_mpi_abort('File ' // trim(file_loc) // ' is missing. Exiting. ') end if + ! Assigning local cell boundary locations z_cb(-1:p) = z_cb_glb((start_idx(3) - 1):(start_idx(3) + p)) + ! Computing cell center locations z_cc(0:p) = (z_cb(0:p) + z_cb(-1:(p - 1)))/2._wp + ! Computing minimum cell width dz = minval(z_cb(0:p) - z_cb(-1:(p - 1))) if (num_procs > 1) call s_mpi_reduce_min(dz) + ! Setting locations of domain bounds z_domain%beg = z_cb(-1) z_domain%end = z_cb(p) end if @@ -408,6 +496,7 @@ contains !> The goal of this subroutine is to read in any preexisting initial condition data files so that they may be used by the !! pre-process as a starting point in the creation of an all new initial condition. + !! @param q_cons_vf_in Conservative variables impure subroutine s_read_parallel_ic_data_files(q_cons_vf_in) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf_in @@ -424,6 +513,7 @@ contains logical :: file_exist integer :: i + ! Open the file to read if (cfl_adap_dt) then write (file_loc, '(I0,A)') n_start, '.dat' else @@ -437,6 +527,7 @@ contains call s_initialize_mpi_data(q_cons_vf_in) + ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) ! Resize some integers so MPI can read even the biggest files @@ -448,6 +539,7 @@ contains str_MOK = int(name_len, MPI_OFFSET_KIND) NVARS_MOK = int(sys_size, MPI_OFFSET_KIND) + ! 
Read the data for each variable do i = 1, sys_size var_MOK = int(i, MPI_OFFSET_KIND) @@ -482,13 +574,17 @@ contains end subroutine s_read_parallel_ic_data_files - !> Initialize all pre-process modules, allocate data structures, and set I/O procedure pointers. + !> @brief Initializes all pre-process modules, allocates data structures, and sets I/O procedure pointers. impure subroutine s_initialize_modules + ! Computation of parameters, allocation procedures, and/or any other tasks needed to properly setup the modules call s_initialize_global_parameters_module() if (bubbles_euler .or. bubbles_lagrange) then call s_initialize_bubbles_model() end if + if (particles_lagrange) then + call s_initialize_particles_model() + end if call s_initialize_mpi_common_module() call s_initialize_data_output_module() call s_initialize_variables_conversion_module() @@ -502,6 +598,7 @@ contains ! Create the D directory if it doesn't exit, to store the serial data files call s_create_directory('D') + ! Associate pointers for serial or parallel I/O if (parallel_io .neqv. .true.) then s_generate_grid => s_generate_serial_grid s_read_grid_data_files => s_read_serial_grid_data_files @@ -516,7 +613,7 @@ contains end subroutine s_initialize_modules - !> Read an existing grid from data files or generate a new grid from user inputs. + !> @brief Reads an existing grid from data files or generates a new grid from user inputs. impure subroutine s_read_grid() if (old_grid) then @@ -535,13 +632,21 @@ contains end subroutine s_read_grid - !> Generate or read the initial condition, apply relaxation if needed, and write output data files. + !> @brief Generates or reads the initial condition, applies relaxation if needed, and writes output data files. impure subroutine s_apply_initial_condition(start, finish) real(wp), intent(inout) :: start, finish integer :: j, k, l real(wp) :: r2 + ! Setting up the grid and the initial condition. 
If the grid is read in from preexisting grid data files, it is checked for + ! consistency. If the grid is not read in, it is generated from scratch according to the inputs provided by the user. The + ! initial condition may also be read in. It in turn is not checked for consistency since it WILL further be edited by the + ! pre-process and also because it may be incomplete at the time it is read in. Finally, when the grid and initial condition + ! are completely setup, they are written to their respective data files. + + ! Setting up grid and initial condition + call cpu_time(start) if (old_ic) call s_read_ic_data_files(q_cons_vf) @@ -578,7 +683,7 @@ contains end subroutine s_apply_initial_condition - !> Gather processor timing data and write elapsed wall-clock time to a summary file. + !> @brief Gathers processor timing data and writes elapsed wall-clock time to a summary file. impure subroutine s_save_data(proc_time, time_avg, time_final, file_exists) real(wp), dimension(:), intent(inout) :: proc_time @@ -614,11 +719,17 @@ contains end subroutine s_save_data - !> Initialize MPI, read and validate user inputs on rank 0, and decompose the computational domain. + !> @brief Initializes MPI, reads and validates user inputs on rank 0, and decomposes the computational domain. impure subroutine s_initialize_mpi_domain + ! Initialization of the MPI environment + call s_mpi_initialize() + ! Rank 0 processor assigns default values to user inputs prior to reading those in from the input file. Next, the user + ! inputs are read in and their consistency is checked. The detection of any inconsistencies automatically leads to the + ! termination of the pre-process. + if (proc_rank == 0) then call s_assign_default_values_to_user_inputs() call s_read_input_file() @@ -635,14 +746,16 @@ contains end subroutine s_initialize_mpi_domain - !> Finalize all pre-process modules, deallocate resources, and shut down MPI. 
+ !> @brief Finalizes all pre-process modules, deallocates resources, and shuts down MPI. impure subroutine s_finalize_modules + ! Disassociate pointers for serial and parallel I/O s_generate_grid => null() s_read_grid_data_files => null() s_read_ic_data_files => null() s_write_data_files => null() + ! Deallocation procedures for the modules call s_finalize_mpi_common_module() call s_finalize_grid_module() call s_finalize_variables_conversion_module() @@ -653,6 +766,7 @@ contains call s_finalize_boundary_common_module() if (relax) call s_finalize_relaxation_solver_module() call s_finalize_initial_condition_module() + ! Finalization of the MPI environment call s_mpi_finalize() end subroutine s_finalize_modules diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp index 2ff5952827..60a7ed8f4b 100644 --- a/src/simulation/m_bubbles.fpp +++ b/src/simulation/m_bubbles.fpp @@ -13,6 +13,7 @@ module m_bubbles use m_mpi_proxy use m_variables_conversion use m_helper_basic + use m_bubbles_EL_kernels implicit none @@ -23,8 +24,21 @@ module m_bubbles contains - !> Compute the bubble radial acceleration based on the selected bubble model - elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) + !> Function that computes the bubble radial acceleration based on bubble models + !! @param fRho Current density + !! @param fP Current driving pressure + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fR0 Equilibrium bubble radius + !! @param fpb Internal bubble pressure + !! @param fpbdot Time-derivative of internal bubble pressure + !! @param alf bubble volume fraction + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + !! @param f_bub_adv_src Source for bubble volume fraction + !! @param f_divu Divergence of velocity + !! 
@param fCson Speed of sound from fP (EL) + function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf @@ -63,8 +77,12 @@ contains end function f_rddot - !> Bubble wall pressure: stiffened gas with Laplace pressure and viscous stress - elemental function f_cpbw(fR0, fR, fV, fpb) + !> Function that computes the bubble wall pressure for Gilmore bubbles + !! @param fR0 Equilibrium bubble radius + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fpb Internal bubble pressure + function f_cpbw(fR0, fR, fV, fpb) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb @@ -78,8 +96,12 @@ contains end function f_cpbw - !> Compute the bubble enthalpy - elemental function f_H(fCpbw, fCpinf, fntait, fBtait) + !> Function that computes the bubble enthalpy + !! @param fCpbw Bubble wall pressure + !! @param fCpinf Driving bubble pressure + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + function f_H(fCpbw, fCpinf, fntait, fBtait) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait @@ -94,8 +116,12 @@ contains end function f_H - !> Compute the sound speed for the bubble - elemental function f_cgas(fCpinf, fntait, fBtait, fH) + !> Function that computes the sound speed for the bubble + !! @param fCpinf Driving bubble pressure + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + !! @param fH Bubble enthalpy + function f_cgas(fCpinf, fntait, fBtait, fH) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpinf, fntait, fBtait, fH @@ -110,8 +136,15 @@ contains end function f_cgas - !> Compute the time derivative of the driving pressure - elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) + !> Function that computes the time derivative of the driving pressure + !! 
@param fRho Local liquid density + !! @param fP Local pressure + !! @param falf Local void fraction + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + !! @param advsrc Advection equation source term + !! @param divu Divergence of velocity + function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu @@ -130,8 +163,17 @@ contains end function f_cpinfdot - !> Enthalpy derivative for Gilmore bubble model, Gilmore (1952) - elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) + !> Function that computes the time derivative of the enthalpy + !! @param fCpbw Bubble wall pressure + !! @param fCpinf Driving bubble pressure + !! @param fCpinf_dot Time derivative of the driving pressure + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fR0 Equilibrium bubble radius + !! @param fpbdot Time derivative of the internal bubble pressure + function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait @@ -150,21 +192,43 @@ contains f_Hdot = (fCpbw/(1._wp + fBtait) + 1._wp)**(-1._wp/fntait)*(tmp1 + tmp2) - (fCpinf/(1._wp + fBtait) + 1._wp) & & **(-1._wp/fntait)*fCpinf_dot + ! Hdot = (Cpbw/(1+B) + 1)^(-1/n_tait)*(-3 gam)*(R0/R)^(3gam) V/R f_Hdot = + ! ((fCpbw/(1._wp+fBtait)+1._wp)**(-1._wp/fntait))*(-3._wp)*gam * & ( (fR0/fR)**(3._wp*gam ))*(fV/fR) + + ! Hdot = Hdot - (Cpinf/(1+B) + 1)^(-1/n_tait) Cpinfdot f_Hdot = f_Hdot - + ! 
((fCpinf/(1._wp+fBtait)+1._wp)**(-1._wp/fntait))*fCpinf_dot + end function f_Hdot - !> Rayleigh-Plesset bubble radial acceleration - elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) + !> Function that computes the bubble radial acceleration for Rayleigh-Plesset bubbles + !! @param fCp Driving pressure + !! @param fRho Current density + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fCpbw Boundary wall pressure + function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCp, fRho, fR, fV, fCpbw real(wp) :: f_rddot_RP + !! rddot = (1/r) ( -3/2 rdot^2 + ((r0/r)^3\gamma - Cp)/rho ) rddot = (1/r) ( -3/2 rdot^2 + (tmp1 - Cp)/rho ) rddot = (1/r) ( + !! tmp2 ) + f_rddot_RP = (-1.5_wp*(fV**2._wp) + (fCpbw - fCp)/fRho)/fR end function f_rddot_RP - !> Compute the Gilmore bubble radial acceleration - elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) + !> Function that computes the bubble radial acceleration + !! @param fCpbw Bubble wall pressure + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fH Current enthalpy + !! @param fHdot Current time derivative of the enthalpy + !! @param fcgas Current gas sound speed + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot @@ -180,8 +244,12 @@ contains end function f_rddot_G - !> Keller-Miksis bubble wall pressure - elemental function f_cpbw_KM(fR0, fR, fV, fpb) + !> Function that computes the bubble wall pressure for Keller--Miksis bubbles + !! @param fR0 Equilibrium bubble radius + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! 
@param fpb Internal bubble pressure + function f_cpbw_KM(fR0, fR, fV, fpb) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb @@ -199,8 +267,16 @@ contains end function f_cpbw_KM - !> Keller-Miksis bubble radial acceleration - elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) + !> Function that computes the bubble radial acceleration for Keller--Miksis bubbles + !! @param fpbdot Time-derivative of internal bubble pressure + !! @param fCp Driving pressure + !! @param fCpbw Bubble wall pressure + !! @param fRho Current density + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fR0 Equilibrium bubble radius + !! @param fC Current sound speed + function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fpbdot, fCp, fCpbw @@ -228,8 +304,10 @@ contains end function f_rddot_KM - !> Compute bubble wall properties for vapor bubbles - elemental subroutine s_bwproperty(pb_in, iR0, chi_vw_out, k_mw_out, rho_mw_out) + !> Subroutine that computes bubble wall properties for vapor bubbles + !! @param pb_in Internal bubble pressure + !! @param iR0 Current bubble size index + subroutine s_bwproperty(pb_in, iR0, chi_vw_out, k_mw_out, rho_mw_out) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: pb_in @@ -249,8 +327,18 @@ contains end subroutine s_bwproperty - !> Compute the vapour flux - elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_g, fbeta_c, fR_m, fgamma_m) + !> Function that computes the vapour flux + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fpb + !! @param fmass_v Current mass of vapour + !! @param iR0 Bubble size index (EE) or bubble identifier (EL) + !! @param vflux Computed vapour flux + !! @param fmass_g Current gas mass (EL) + !! @param fbeta_c Mass transfer coefficient (EL) + !! @param fR_m Mixture gas constant (EL) + !! 
@param fgamma_m Mixture gamma (EL) + subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_g, fbeta_c, fR_m, fgamma_m) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR @@ -266,7 +354,7 @@ contains real(wp) :: grad_chi real(wp) :: conc_v - if (thermal == 3) then ! transfer + if (thermal == 3) then ! constant transfer model if (bubbles_lagrange) then ! Mixture properties (gas+vapor) in the bubble @@ -297,9 +385,19 @@ contains end subroutine s_vflux - !> Compute the time derivative of the internal bubble pressure - elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) - + !> Function that computes the time derivative of the internal bubble pressure + !! @param fvflux Vapour flux + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fpb Current internal bubble pressure + !! @param fmass_v Current mass of vapour + !! @param iR0 Bubble size index (EE) or bubble identifier (EL) + !! @param fbeta_t Mass transfer coefficient (EL) + !! @param fR_m Mixture gas constant (EL) + !! @param fgamma_m Mixture gamma (EL) + function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) + + !$DIR INLINENEVER f_bpres_dot $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fvflux real(wp), intent(in) :: fR @@ -332,25 +430,48 @@ contains !> Adaptive time stepping routine for subgrid bubbles (See Heirer, E. Hairer S.P.Norsett G. Wanner, Solving Ordinary !! Differential Equations I, Chapter II.4) - subroutine s_advance_step(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, bub_id, fmass_v, & - - & fmass_g, fbeta_c, fbeta_t, fCson, adap_dt_stop) - $:GPU_ROUTINE(function_name='s_advance_step',parallelism='[seq]', cray_inline=True) + !! @param fRho Current density + !! @param fP Current driving pressure + !! @param fR Current bubble radius + !! @param fV Current bubble radial velocity + !! @param fR0 Equilibrium bubble radius + !! 
@param fpb Internal bubble pressure
+    !! @param fpbdot Time-derivative of internal bubble pressure
+    !! @param alf bubble volume fraction
+    !! @param fntait Tait EOS parameter
+    !! @param fBtait Tait EOS parameter
+    !! @param f_bub_adv_src Source for bubble volume fraction
+    !! @param f_divu Divergence of velocity
+    !! @param bub_id Bubble identifier (EL)
+    !! @param fmass_v Current mass of vapour (EL)
+    !! @param fmass_g Current mass of gas (EL)
+    !! @param fbeta_c Mass transfer coefficient (EL)
+    !! @param fbeta_t Heat transfer coefficient (EL)
+    !! @param fCson Speed of sound (EL)
+    !! @return adap_dt_stop Fail-safe exit if max iteration count reached
+    function f_advance_step(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, bub_id, fmass_v, &
+        & fmass_g, fbeta_c, fbeta_t, fCson, fRe, fPos, fVel, cell, q_prim_vf) result(adap_dt_stop)
+        $:GPU_ROUTINE(parallelism='[seq]')
-        real(wp), intent(inout) :: fR, fV, fpb, fmass_v
-        real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
-        real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
-        integer, intent(in) :: bub_id
-        real(wp), intent(in) :: fmass_g, fbeta_c, fbeta_t, fCson
-        integer, intent(inout) :: adap_dt_stop
-        real(wp), dimension(5) :: err !< Error estimates for adaptive time stepping
-        real(wp) :: t_new !< Updated time step size
-        real(wp) :: h0, h !< Time step size
+        real(wp), intent(inout) :: fR, fV, fpb, fmass_v
+        real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf
+        real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu
+        integer, intent(in) :: bub_id
+        real(wp), intent(in) :: fmass_g, fbeta_c, fbeta_t, fCson
+        real(wp), intent(inout), dimension(3), optional :: fPos, fVel
+        real(wp), intent(in), optional :: fRe
+        integer, intent(in), dimension(3), optional :: cell
+        type(scalar_field), intent(in), dimension(sys_size), optional :: q_prim_vf
+        real(wp), dimension(5) :: err !< Error estimates for adaptive time stepping
+        real(wp) :: t_new !< Updated time step size
+        
real(wp) :: h0, h !< Time step size !> Bubble radius, radial velocity, and radial acceleration for the inner loop real(wp), dimension(4) :: myR_tmp1, myV_tmp1, myR_tmp2, myV_tmp2 real(wp), dimension(4) :: myPb_tmp1, myMv_tmp1, myPb_tmp2, myMv_tmp2 !< Gas pressure and vapor mass for the inner loop (EL) - real(wp) :: fR2, fV2, fpb2, fmass_v2 - integer :: iter_count + real(wp) :: fR2, fV2, fpb2, fmass_v2, f_bTemp + real(wp), dimension(3) :: vTemp, aTemp + integer :: adap_dt_stop + integer :: l, iter_count call s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson, h0) h = h0 @@ -416,6 +537,42 @@ contains ! Update pb and mass_v fpb = myPb_tmp1(4) fmass_v = myMv_tmp1(4) + + select case (lag_vel_model) + case (1) + do l = 1, num_dims + vTemp(l) = f_interpolate_velocity(fR, cell, l, q_prim_vf) + end do + do l = 1, num_dims + fVel(l) = vTemp(l) + fPos(l) = fPos(l) + h*vTemp(l) + end do + case (2) + do l = 1, num_dims + f_bTemp = f_get_bubble_force(fPos(l), fR, fV, fVel(l), fmass_g, fmass_v, fRe, fRho, cell, l, & + & q_prim_vf) + aTemp(l) = f_bTemp/(fmass_g + fmass_v) + end do + do l = 1, num_dims + fVel(l) = fVel(l) + h*aTemp(l) + fPos(l) = fPos(l) + h*fVel(l) + end do + case (3) + do l = 1, num_dims + f_bTemp = f_get_bubble_force(fPos(l), fR, fV, fVel(l), fmass_g, fmass_v, fRe, fRho, cell, l, & + & q_prim_vf) + aTemp(l) = 2._wp*f_bTemp/(fmass_g + fmass_v) - 3._wp*fV*fVel(l)/fR + end do + do l = 1, num_dims + fVel(l) = fVel(l) + h*aTemp(l) + fPos(l) = fPos(l) + h*fVel(l) + end do + case default + do l = 1, num_dims + fVel(l) = fVel(l) + fPos(l) = fPos(l) + end do + end select end if ! Update step size for the next sub-step @@ -438,10 +595,24 @@ contains if (iter_count >= adap_dt_max_iters) adap_dt_stop = 1 - end subroutine s_advance_step + end function f_advance_step !> Choose the initial time step size for the adaptive time stepping routine (See Heirer, E. Hairer S.P.Norsett G. Wanner, !! 
Solving Ordinary Differential Equations I, Chapter II.4) + !! @param fRho Current density + !! @param fP Current driving pressure + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fR0 Equilibrium bubble radius + !! @param fpb Internal bubble pressure + !! @param fpbdot Time-derivative of internal bubble pressure + !! @param alf bubble volume fraction + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + !! @param f_bub_adv_src Source for bubble volume fraction + !! @param f_divu Divergence of velocity + !! @param fCson Speed of sound (EL) + !! @param h Time step size subroutine s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson, h) $:GPU_ROUTINE(function_name='s_initial_substep_h',parallelism='[seq]', cray_inline=True) @@ -453,6 +624,7 @@ contains real(wp), dimension(2) :: h_size !< Time step size (h0, h1) real(wp), dimension(3) :: d_norms !< norms (d_0, d_1, d_2) real(wp), dimension(2) :: myR_tmp, myV_tmp, myA_tmp !< Bubble radius, radial velocity, and radial acceleration + ! Determine the starting time step Evaluate f(x0,y0) myR_tmp(1) = fR myV_tmp(1) = fV @@ -487,9 +659,32 @@ contains end subroutine s_initial_substep_h !> Integrate bubble variables over the given time step size, h, using a third-order accurate embedded Runge-Kutta scheme. + !! @param err Estimated error + !! @param fRho Current density + !! @param fP Current driving pressure + !! @param fR Current bubble radius + !! @param fV Current bubble velocity + !! @param fR0 Equilibrium bubble radius + !! @param fpb Internal bubble pressure + !! @param fpbdot Time-derivative of internal bubble pressure + !! @param alf bubble volume fraction + !! @param fntait Tait EOS parameter + !! @param fBtait Tait EOS parameter + !! @param f_bub_adv_src Source for bubble volume fraction + !! @param f_divu Divergence of velocity + !! @param bub_id Bubble identifier (EL) + !! 
@param fmass_v Current mass of vapour (EL) + !! @param fmass_g Current mass of gas (EL) + !! @param fbeta_c Mass transfer coefficient (EL) + !! @param fbeta_t Heat transfer coefficient (EL) + !! @param fCson Speed of sound (EL) + !! @param h Time step size + !! @param myR_tmp Bubble radius at each stage + !! @param myV_tmp Bubble radial velocity at each stage + !! @param myPb_tmp Internal bubble pressure at each stage (EL) + !! @param myMv_tmp Mass of vapor in the bubble at each stage (EL) subroutine s_advance_substep(err, fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, bub_id, & - - & fmass_v, fmass_g, fbeta_c, fbeta_t, fCson, h, myR_tmp, myV_tmp, myPb_tmp, myMv_tmp) + & fmass_v, fmass_g, fbeta_c, fbeta_t, fCson, h, myR_tmp, myV_tmp, myPb_tmp, myMv_tmp) $:GPU_ROUTINE(function_name='s_advance_substep',parallelism='[seq]', cray_inline=True) real(wp), intent(out) :: err @@ -577,7 +772,18 @@ contains end subroutine s_advance_substep !> Changes of pressure and vapor mass in the lagrange bubbles. - elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, fmass_g, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) + !! @param fR_tmp Bubble radius + !! @param fV_tmp Bubble radial velocity + !! @param fPb_tmp Internal bubble pressure + !! @param fMv_tmp Mass of vapor in the bubble + !! @param bub_id Bubble identifier + !! @param fmass_g Current mass of gas + !! @param fbeta_c Mass transfer coefficient + !! @param fbeta_t Heat transfer coefficient + !! @param fdPbdt_tmp Rate of change of the internal bubble pressure + !! @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble + !! 
@param advance_EL Rate of change of the mass of vapor in the bubble + subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, fmass_g, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp index b2c5f1fd97..94f6605dd5 100644 --- a/src/simulation/m_bubbles_EE.fpp +++ b/src/simulation/m_bubbles_EE.fpp @@ -27,7 +27,7 @@ module m_bubbles_EE contains - !> Initialize the Euler-Euler bubble module + !> @brief Allocates and initializes arrays for the Euler-Euler bubble model. impure subroutine s_initialize_bubbles_EE_module integer :: l @@ -65,7 +65,8 @@ contains end subroutine s_initialize_bubbles_EE_module - !> Compute the bubble volume fraction alpha from the bubble number density + !> @brief Computes the bubble volume fraction alpha from the bubble number density. + !! @param q_cons_vf is the conservative variable subroutine s_comp_alpha_from_n(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -90,6 +91,8 @@ contains end subroutine s_comp_alpha_from_n !> Compute the right-hand side for Euler-Euler bubble transport + !! @param idir Direction index + !! @param q_prim_vf Primitive variables subroutine s_compute_bubbles_EE_rhs(idir, q_prim_vf, divu_in) integer, intent(in) :: idir @@ -137,7 +140,10 @@ contains end subroutine s_compute_bubbles_EE_rhs - !> Compute the Euler-Euler bubble source terms + !> The purpose of this procedure is to compute the source terms that are needed for the bubble modeling + !! @param q_prim_vf Primitive variables + !! @param q_cons_vf Conservative variables + !! 
@param rhs_vf Right-hand side variables impure subroutine s_compute_bubble_EE_source(q_cons_vf, q_prim_vf, rhs_vf, divu_in) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -159,7 +165,7 @@ contains real(wp) :: nbub !< Bubble number density real(wp) :: my_divu integer :: i, j, k, l, q, ii !< Loop variables - integer :: adap_dt_stop_max, adap_dt_stop !< Fail-safe exit if max iteration count reached + integer :: adap_dt_stop_sum, adap_dt_stop !< Fail-safe exit if max iteration count reached integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson @@ -181,10 +187,10 @@ contains end do $:END_GPU_PARALLEL_LOOP() - adap_dt_stop_max = 0 + adap_dt_stop_sum = 0 $:GPU_PARALLEL_LOOP(private='[j, k, l, Rtmp, Vtmp, myalpha_rho, myalpha, myR, myV, alf, myP, myRho, R2Vav, R3, nbub, & - & pb_local, mv_local, vflux, pbdot, rddot, n_tait, B_tait, my_divu]', collapse=3, & - & reduction = '[[adap_dt_stop_max]]', reductionOp = '[MAX]', copy = '[adap_dt_stop_max]') + & pb_local, mv_local, vflux, pbdot, rddot, n_tait, B_tait, & + & my_divu]', collapse=3, copy='[adap_dt_stop_sum]') do l = 0, p do k = 0, n do j = 0, m @@ -272,24 +278,25 @@ contains pb_local = 0._wp; mv_local = 0._wp; vflux = 0._wp; pbdot = 0._wp end if + adap_dt_stop = 0 + ! 
Adaptive time stepping if (adap_dt) then - adap_dt_stop = 0 - - call s_advance_step(myRho, myP, myR, myV, R0(q), pb_local, pbdot, alf, n_tait, B_tait, & - & bub_adv_src(j, k, l), divu_in%sf(j, k, l), dmBub_id, dmMass_v, dmMass_n, & - & dmBeta_c, dmBeta_t, dmCson, adap_dt_stop) + adap_dt_stop = f_advance_step(myRho, myP, myR, myV, R0(q), pb_local, pbdot, alf, n_tait, B_tait, & + & bub_adv_src(j, k, l), divu_in%sf(j, k, l), dmBub_id, dmMass_v, & + & dmMass_n, dmBeta_c, dmBeta_t, dmCson) q_cons_vf(rs(q))%sf(j, k, l) = nbub*myR q_cons_vf(vs(q))%sf(j, k, l) = nbub*myV - - adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) else rddot = f_rddot(myRho, myP, myR, myV, R0(q), pb_local, pbdot, alf, n_tait, B_tait, bub_adv_src(j, & & k, l), divu_in%sf(j, k, l), dmCson) bub_v_src(j, k, l, q) = nbub*rddot bub_r_src(j, k, l, q) = q_cons_vf(vs(q))%sf(j, k, l) end if + + $:GPU_ATOMIC(atomic='update') + adap_dt_stop_sum = adap_dt_stop_sum + adap_dt_stop end if end do end do @@ -297,7 +304,7 @@ contains end do $:END_GPU_PARALLEL_LOOP() - if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") + if (adap_dt .and. adap_dt_stop_sum > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") if (.not. 
adap_dt) then $:GPU_PARALLEL_LOOP(private='[i, k, l, q]', collapse=3) diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp index 437dbe0db2..05a0600f0f 100644 --- a/src/simulation/m_bubbles_EL.fpp +++ b/src/simulation/m_bubbles_EL.fpp @@ -17,6 +17,9 @@ module m_bubbles_EL use m_helper_basic use m_sim_helpers use m_helper + use m_mpi_common + use m_ibm + use m_finite_differences implicit none @@ -55,22 +58,34 @@ module m_bubbles_EL $:GPU_DECLARE(create='[intfc_draddt, intfc_dveldt, gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt]') integer, private :: lag_num_ts !< Number of time stages in the time-stepping scheme + $:GPU_DECLARE(create='[lag_num_ts]') - integer :: nBubs !< Number of bubbles in the local domain real(wp) :: Rmax_glb, Rmin_glb !< Maximum and minimum bubbe size in the local domain !> Projection of the lagrangian particles in the Eulerian framework type(scalar_field), dimension(:), allocatable :: q_beta + type(scalar_field), dimension(:), allocatable :: kahan_comp !< Kahan compensation for q_beta accumulation integer :: q_beta_idx !< Size of the q_beta vector field - $:GPU_DECLARE(create='[nBubs, Rmax_glb, Rmin_glb, q_beta, q_beta_idx]') + + $:GPU_DECLARE(create='[Rmax_glb, Rmin_glb, q_beta, kahan_comp, q_beta_idx]') + + integer, parameter :: LAG_EVOL_ID = 11 ! File id for lag_bubbles_evol_*.dat + integer, parameter :: LAG_STATS_ID = 12 ! File id for stats_lag_bubbles_*.dat + integer, parameter :: LAG_VOID_ID = 13 ! File id for voidfraction.dat + integer, allocatable, dimension(:) :: keep_bubble + integer, allocatable, dimension(:,:) :: wrap_bubble_loc, wrap_bubble_dir + $:GPU_DECLARE(create='[keep_bubble]') + $:GPU_DECLARE(create='[wrap_bubble_loc, wrap_bubble_dir]') contains !> Initializes the lagrangian subgrid bubble solver - impure subroutine s_initialize_bubbles_EL_module(q_cons_vf) + !! 
@param q_cons_vf Initial conservative variables + impure subroutine s_initialize_bubbles_EL_module(q_cons_vf, bc_type) - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - integer :: nBubs_glb, i + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer :: nBubs_glb, i ! Setting number of time-stages for selected time-stepping scheme @@ -83,7 +98,7 @@ contains else if (lag_params%solver_approach == 2) then ! Two-way coupling q_beta_idx = 4 - if (p == 0) then + if (p == 0) then ! 2D ! Subgrid noise model for 2D approximation q_beta_idx = 6 end if @@ -91,22 +106,36 @@ contains call s_mpi_abort('Please check the lag_params%solver_approach input') end if + pcomm_coords(1)%beg = x_cb(-1) + pcomm_coords(1)%end = x_cb(m) + $:GPU_UPDATE(device='[pcomm_coords(1)]') + if (n > 0) then + pcomm_coords(2)%beg = y_cb(-1) + pcomm_coords(2)%end = y_cb(n) + $:GPU_UPDATE(device='[pcomm_coords(2)]') + if (p > 0) then + pcomm_coords(3)%beg = z_cb(-1) + pcomm_coords(3)%end = z_cb(p) + $:GPU_UPDATE(device='[pcomm_coords(3)]') + end if + end if + $:GPU_UPDATE(device='[lag_num_ts, q_beta_idx]') @:ALLOCATE(q_beta(1:q_beta_idx)) + @:ALLOCATE(kahan_comp(1:q_beta_idx)) do i = 1, q_beta_idx @:ALLOCATE(q_beta(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end)) - end do - - do i = 1, q_beta_idx @:ACC_SETUP_SFs(q_beta(i)) + @:ALLOCATE(kahan_comp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + & idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(kahan_comp(i)) end do ! 
Allocating space for lagrangian variables nBubs_glb = lag_params%nBubs_glb - @:ALLOCATE(lag_id(1:nBubs_glb, 1:2)) @:ALLOCATE(bub_R0(1:nBubs_glb)) @:ALLOCATE(Rmax_stats(1:nBubs_glb)) @:ALLOCATE(Rmin_stats(1:nBubs_glb)) @@ -114,6 +143,7 @@ contains @:ALLOCATE(gas_betaT(1:nBubs_glb)) @:ALLOCATE(gas_betaC(1:nBubs_glb)) @:ALLOCATE(bub_dphidt(1:nBubs_glb)) + @:ALLOCATE(lag_id(1:nBubs_glb, 1:2)) @:ALLOCATE(gas_p(1:nBubs_glb, 1:2)) @:ALLOCATE(gas_mv(1:nBubs_glb, 1:2)) @:ALLOCATE(intfc_rad(1:nBubs_glb, 1:2)) @@ -129,23 +159,70 @@ contains @:ALLOCATE(mtn_dposdt(1:nBubs_glb, 1:3, 1:lag_num_ts)) @:ALLOCATE(mtn_dveldt(1:nBubs_glb, 1:3, 1:lag_num_ts)) + @:ALLOCATE(keep_bubble(1:nBubs_glb)) + @:ALLOCATE(wrap_bubble_loc(1:nBubs_glb, 1:num_dims), wrap_bubble_dir(1:nBubs_glb, 1:num_dims)) + if (adap_dt .and. f_is_default(adap_dt_tol)) adap_dt_tol = dflt_adap_dt_tol + if (num_procs > 1) call s_initialize_particles_mpi(lag_num_ts) + ! Starting bubbles - call s_read_input_bubbles(q_cons_vf) + if (lag_params%write_void_evol) call s_open_void_evol + if (lag_params%write_bubbles) call s_open_lag_bubble_evol() + if (lag_params%write_bubbles_stats) call s_open_lag_bubble_stats() + + if (lag_params%vel_model > 0) then + moving_lag_bubbles = .true. + lag_pressure_force = lag_params%pressure_force + lag_gravity_force = lag_params%gravity_force + lag_vel_model = lag_params%vel_model + lag_drag_model = lag_params%drag_model + end if + $:GPU_UPDATE(device='[moving_lag_bubbles, lag_pressure_force, lag_gravity_force, lag_vel_model, lag_drag_model]') + + ! Allocate cell-centered pressure gradient arrays and FD coefficients + if (lag_params%vel_model > 0 .and. 
lag_params%pressure_force) then + @:ALLOCATE(grad_p_x(0:m, 0:n, 0:p)) + @:ALLOCATE(fd_coeff_x_pgrad(-fd_number:fd_number, 0:m)) + call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_pgrad, buff_size, fd_number, fd_order) + $:GPU_UPDATE(device='[fd_coeff_x_pgrad]') + if (n > 0) then + @:ALLOCATE(grad_p_y(0:m, 0:n, 0:p)) + @:ALLOCATE(fd_coeff_y_pgrad(-fd_number:fd_number, 0:n)) + call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_pgrad, buff_size, fd_number, fd_order) + $:GPU_UPDATE(device='[fd_coeff_y_pgrad]') + end if + if (p > 0) then + @:ALLOCATE(grad_p_z(0:m, 0:n, 0:p)) + @:ALLOCATE(fd_coeff_z_pgrad(-fd_number:fd_number, 0:p)) + call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_pgrad, buff_size, fd_number, fd_order) + $:GPU_UPDATE(device='[fd_coeff_z_pgrad]') + end if + end if + + ! Allocate cell list arrays for atomic-free Gaussian smearing + @:ALLOCATE(cell_list_start(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_count(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_idx(1:lag_params%nBubs_glb)) + + call s_read_input_bubbles(q_cons_vf, bc_type) end subroutine s_initialize_bubbles_EL_module - !> Read initial bubble data from input files - impure subroutine s_read_input_bubbles(q_cons_vf) + !> The purpose of this procedure is to obtain the initial bubbles' information + !! 
@param q_cons_vf Conservative variables + impure subroutine s_read_input_bubbles(q_cons_vf, bc_type) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + real(wp), dimension(8) :: inputBubble + real(wp) :: qtime + integer :: id, bub_id, save_count + integer :: i, ios + logical :: file_exist, indomain + integer, dimension(3) :: cell + character(LEN=path_len + 2*name_len) :: path_D_dir - type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf - real(wp), dimension(8) :: inputBubble - real(wp) :: qtime - integer :: id, bub_id, save_count - integer :: i, ios - logical :: file_exist, indomain - character(LEN=path_len + 2*name_len) :: path_D_dir ! Initialize number of particles bub_id = 0 @@ -162,9 +239,9 @@ contains if (save_count == 0) then if (proc_rank == 0) print *, 'Reading lagrange bubbles input file.' - inquire (file='input/lag_bubbles.dat', exist=file_exist) + call my_inquire(trim(lag_params%input_path), file_exist) if (file_exist) then - open (94, file='input/lag_bubbles.dat', form='formatted', iostat=ios) + open (94, file=trim(lag_params%input_path), form='formatted', iostat=ios) do while (ios == 0) read (94, *, iostat=ios) (inputBubble(i), i=1, 8) if (ios /= 0) cycle @@ -178,12 +255,12 @@ contains call s_add_bubbles(inputBubble, q_cons_vf, bub_id) lag_id(bub_id, 1) = id ! global ID lag_id(bub_id, 2) = bub_id ! local ID - nBubs = bub_id ! local number of bubbles + n_el_bubs_loc = bub_id ! 
local number of bubbles end if end do close (94) else - call s_mpi_abort("Initialize the lagrange bubbles in input/lag_bubbles.dat") + call s_mpi_abort("Initialize the lagrange bubbles in " // trim(lag_params%input_path)) end if else if (proc_rank == 0) print *, 'Restarting lagrange bubbles at save_count: ', save_count @@ -192,11 +269,21 @@ contains print *, " Lagrange bubbles running, in proc", proc_rank, "number:", bub_id, "/", id + if (num_procs > 1) then + call s_mpi_reduce_int_sum(n_el_bubs_loc, n_el_bubs_glb) + else + n_el_bubs_glb = n_el_bubs_loc + end if + + if (proc_rank == 0) then + if (n_el_bubs_glb == 0) call s_mpi_abort('No bubbles in the domain. Check ' // trim(lag_params%input_path)) + end if + $:GPU_UPDATE(device='[bubbles_lagrange, lag_params]') $:GPU_UPDATE(device='[lag_id, bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, gas_p, gas_mv, & & intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, & - & gas_dmvdt, mtn_dposdt, mtn_dveldt, nBubs]') + & gas_dmvdt, mtn_dposdt, mtn_dveldt, n_el_bubs_loc]') Rmax_glb = min(dflt_real, -dflt_real) Rmin_glb = max(dflt_real, -dflt_real) @@ -206,22 +293,28 @@ contains ! Populate temporal variables call s_transfer_data_to_tmp() - call s_smear_voidfraction() - - if (lag_params%write_bubbles) call s_write_lag_particles(qtime) + call s_smear_voidfraction(bc_type) if (save_count == 0) then ! Create ./D directory - write (path_D_dir, '(A,I0,A,I0)') trim(case_dir) // '/D' - call my_inquire(path_D_dir, file_exist) - if (.not. file_exist) call s_create_directory(trim(path_D_dir)) + if (proc_rank == 0) then + write (path_D_dir, '(A,I0,A,I0)') trim(case_dir) // '/D' + call my_inquire(trim(path_D_dir), file_exist) + if (.not. file_exist) call s_create_directory(trim(path_D_dir)) + end if + call s_mpi_barrier() call s_write_restart_lag_bubbles(save_count) ! 
Needed for post_processing - call s_write_void_evol(qtime) + if (lag_params%write_void_evol) call s_write_void_evol(qtime) end if + if (lag_params%write_bubbles) call s_write_lag_bubble_evol(qtime) + end subroutine s_read_input_bubbles - !> Add a new bubble from input data for a fresh start + !> The purpose of this procedure is to obtain the information of the bubbles when starting fresh + !! @param inputBubble Bubble information + !! @param q_cons_vf Conservative variables + !! @param bub_id Local id of the bubble impure subroutine s_add_bubbles(inputBubble, q_cons_vf, bub_id) type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf @@ -256,7 +349,7 @@ contains mtn_posPrev(bub_id,1:3,1) = mtn_pos(bub_id,1:3,1) end if - cell = -buff_size + cell = fd_number - buff_size call s_locate_cell(mtn_pos(bub_id,1:3,1), cell, mtn_s(bub_id,1:3,1)) ! Check if the bubble is located in the ghost cell of a symmetric, or wall boundary @@ -329,7 +422,9 @@ contains end subroutine s_add_bubbles - !> Restore bubble data from a restart file + !> The purpose of this procedure is to obtain the information of the bubbles from a restart point. + !! @param bub_id Local ID of the particle + !! @param save_count File identifier impure subroutine s_restart_bubbles(bub_id, save_count) integer, intent(inout) :: bub_id, save_count @@ -349,7 +444,6 @@ contains integer :: i integer, dimension(:), allocatable :: proc_bubble_counts real(wp), dimension(1:1,1:lag_io_vars) :: dummy - dummy = 0._wp ! 
Construct file path @@ -431,7 +525,7 @@ contains call MPI_FILE_CLOSE(ifile, ierr) call MPI_TYPE_FREE(view, ierr) - nBubs = bub_id + n_el_bubs_loc = bub_id do i = 1, bub_id lag_id(i, 1) = int(MPI_IO_DATA_lag_bubbles(i, 1)) @@ -455,7 +549,7 @@ contains deallocate (MPI_IO_DATA_lag_bubbles) else - nBubs = 0 + n_el_bubs_loc = 0 call MPI_TYPE_CONTIGUOUS(0, mpi_p, view, ierr) call MPI_TYPE_COMMIT(view, ierr) @@ -484,16 +578,20 @@ contains end subroutine s_restart_bubbles !> Contains the bubble dynamics subroutines. - subroutine s_compute_bubble_EL_dynamics(q_prim_vf, stage) - - type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf - integer, intent(in) :: stage - real(wp) :: myVapFlux - real(wp) :: preterm1, term2, paux, pint, Romega, term1_fac - real(wp) :: myR_m, mygamma_m, myPb, myMass_n, myMass_v - real(wp) :: myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot - real(wp) :: myPinf, aux1, aux2, myCson, myRho - real(wp) :: gamma, pi_inf, qv + !! @param q_prim_vf Primitive variables + !! 
@param stage Current stage in the time-stepper algorithm + subroutine s_compute_bubble_EL_dynamics(q_prim_vf, bc_type, stage) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer, intent(in) :: stage + real(wp) :: myVapFlux + real(wp) :: preterm1, term2, paux, pint, Romega, term1_fac + real(wp) :: myR_m, mygamma_m, myPb, myMass_n, myMass_v + real(wp) :: myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot + real(wp) :: myPinf, aux1, aux2, myCson, myRho + real(wp), dimension(3) :: myPos, myVel + real(wp) :: gamma, pi_inf, qv, f_b, myRe #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: myalpha_rho, myalpha @@ -502,17 +600,16 @@ contains #:endif real(wp), dimension(2) :: Re integer, dimension(3) :: cell - integer :: adap_dt_stop_max, adap_dt_stop !< Fail-safe exit if max iteration count reached + integer :: adap_dt_stop_sum, adap_dt_stop !< Fail-safe exit if max iteration count reached real(wp) :: dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu !< Dummy variables for unified subgrid bubble subroutines - integer :: i, k, l - - call nvtxStartRange("LAGRANGE-BUBBLE-DYNAMICS") + integer :: k, l ! Subgrid p_inf model based on Maeda and Colonius (2018). if (lag_params%pressure_corrector) then + call nvtxStartRange("LAGRANGE-BUBBLE-PINF-CORRECTION") ! Calculate velocity potentials (valid for one bubble per cell) $:GPU_PARALLEL_LOOP(private='[k, cell, paux, preterm1, term2, Romega, myR0, myR, myV, myPb, pint, term1_fac]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega) myR0 = bub_R0(k) myR = intfc_rad(k, 2) @@ -529,16 +626,24 @@ contains end if end do $:END_GPU_PARALLEL_LOOP() + call nvtxEndRange() end if + ! Precompute cell-centered pressure gradients for translational motion + if (moving_lag_bubbles .and. 
lag_pressure_force) then + call nvtxStartRange("LAGRANGE-BUBBLE-PRESSURE-GRADIENT") + call s_compute_pressure_gradients(q_prim_vf) + call nvtxEndRange() + end if + + call nvtxStartRange("LAGRANGE-BUBBLE-DYNAMICS") ! Radial motion model - adap_dt_stop_max = 0 - $:GPU_PARALLEL_LOOP(private='[k, i, myalpha_rho, myalpha, Re, cell, myVapFlux, preterm1, term2, paux, pint, Romega, & + adap_dt_stop_sum = 0 + $:GPU_PARALLEL_LOOP(private='[k, myalpha_rho, myalpha, Re, cell, myVapFlux, preterm1, term2, paux, pint, Romega, & & term1_fac, myR_m, mygamma_m, myPb, myMass_n, myMass_v, myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, & & myMvdot, myPinf, aux1, aux2, myCson, myRho, gamma, pi_inf, qv, dmalf, dmntait, dmBtait, & - & dm_bub_adv_src, dm_divu, adap_dt_stop]', reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', & - & copy = '[adap_dt_stop_max]', copyin = '[stage]') - do k = 1, nBubs + & dm_bub_adv_src, dm_divu, adap_dt_stop, myPos, myVel]', copy='[adap_dt_stop_sum]',copyin='[stage]') + do k = 1, n_el_bubs_loc ! Keller-Miksis model ! Current bubble state @@ -550,6 +655,8 @@ contains myBeta_c = gas_betaC(k) myBeta_t = gas_betaT(k) myR0 = bub_R0(k) + myPos = mtn_pos(k,:,2) + myVel = mtn_vel(k,:,2) ! Vapor and heat fluxes call s_vflux(myR, myV, myPb, myMass_v, k, myVapFlux, myMass_n, myBeta_c, myR_m, mygamma_m) @@ -568,44 +675,74 @@ contains adap_dt_stop = 0 if (adap_dt) then - call s_advance_step(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, dmntait, dmBtait, dm_bub_adv_src, & - & dm_divu, k, myMass_v, myMass_n, myBeta_c, myBeta_t, myCson, adap_dt_stop) + mtn_posPrev(k,:,1) = myPos + + myRe = Re(1) + adap_dt_stop = f_advance_step(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, dmntait, dmBtait, & + & dm_bub_adv_src, dm_divu, k, myMass_v, myMass_n, myBeta_c, myBeta_t, myCson, myRe, & + & myPos, myVel, cell, q_prim_vf) ! 
Update bubble state intfc_rad(k, 1) = myR intfc_vel(k, 1) = myV gas_p(k, 1) = myPb gas_mv(k, 1) = myMass_v + mtn_pos(k,:,1) = myPos + mtn_vel(k,:,1) = myVel else ! Radial acceleration from bubble models intfc_dveldt(k, stage) = f_rddot(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, dmntait, dmBtait, & & dm_bub_adv_src, dm_divu, myCson) + intfc_draddt(k, stage) = myV gas_dmvdt(k, stage) = myMvdot gas_dpdt(k, stage) = myPbdot + + if (moving_lag_bubbles) then + do l = 1, num_dims + select case (lag_vel_model) + case (1) + mtn_dposdt(k, l, stage) = f_interpolate_velocity(myPos(l), cell, l, q_prim_vf) + mtn_dveldt(k, l, stage) = 0._wp + case (2) + mtn_dposdt(k, l, stage) = myVel(l) + f_b = f_get_bubble_force(myPos(l), myR, myV, myVel(l), myMass_n, myMass_v, Re(1), myRho, cell, l, & + & q_prim_vf) + mtn_dveldt(k, l, stage) = f_b/(myMass_n + myMass_v) + case (3) + mtn_dposdt(k, l, stage) = myVel(l) + f_b = f_get_bubble_force(myPos(l), myR, myV, myVel(l), myMass_n, myMass_v, Re(1), myRho, cell, l, & + & q_prim_vf) + mtn_dveldt(k, l, stage) = 2._wp*f_b/(myMass_n + myMass_v) - 3._wp*myV*myVel(l)/myR + case default + mtn_dposdt(k, l, stage) = 0._wp + mtn_dveldt(k, l, stage) = 0._wp + end select + end do + end if end if - adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) + $:GPU_ATOMIC(atomic='update') + adap_dt_stop_sum = adap_dt_stop_sum + adap_dt_stop end do $:END_GPU_PARALLEL_LOOP() + call nvtxEndRange - if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") - - ! Bubbles remain in a fixed position - $:GPU_PARALLEL_LOOP(collapse=2, private='[k, l]', copyin='[stage]') - do k = 1, nBubs - do l = 1, 3 - mtn_dposdt(k, l, stage) = 0._wp - mtn_dveldt(k, l, stage) = 0._wp - end do - end do - $:END_GPU_PARALLEL_LOOP() + if (adap_dt .and. 
adap_dt_stop_sum > 0) call s_mpi_abort("Adaptive time stepping failed to converge.")
+        if (adap_dt) then
+            call s_transfer_data_to_tmp()
+            if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf)
+            call s_smear_voidfraction(bc_type)
+        end if
 
     end subroutine s_compute_bubble_EL_dynamics
 
-    !> Compute the Lagrangian bubble source terms and add them to the RHS
+    !> The purpose of this subroutine is to obtain the bubble source terms based on Maeda and Colonius (2018) and add them to the
+    !! RHS scalar field.
+    !! @param q_cons_vf Conservative variables
+    !! @param q_prim_vf Primitive variables
+    !! @param rhs_vf Time derivative of the conservative variables
     subroutine s_compute_bubbles_EL_source(q_cons_vf, q_prim_vf, rhs_vf)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
@@ -613,91 +750,96 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         integer :: i, j, k, l
 
-        if (.not. adap_dt) call s_smear_voidfraction()
-
-        if (lag_params%solver_approach == 2) then
-            ! (q / (1 - beta)) * d(beta)/dt source
-            if (p == 0) then
-                $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4)
-                do k = 0, p
-                    do j = 0, n
-                        do i = 0, m
-                            do l = 1, E_idx
-                                if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then
-                                    rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + q_cons_vf(l)%sf(i, j, k)*(q_beta(2)%sf(i, j, &
-                                        & k) + q_beta(5)%sf(i, j, k))
-                                end if
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            else
-                $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4)
-                do k = 0, p
-                    do j = 0, n
-                        do i = 0, m
-                            do l = 1, E_idx
-                                if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then
-                                    rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + q_cons_vf(l)%sf(i, j, &
-                                        & k)/q_beta(1)%sf(i, j, k)*q_beta(2)%sf(i, j, k)
-                                end if
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
+        call nvtxStartRange("LAGRANGE-BUBBLE-EL-SOURCE")
+        ! 
(q / (1 - beta)) * d(beta)/dt source + if (lag_params%cluster_type >= 4) then + $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + do l = 1, E_idx + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + q_cons_vf(l)%sf(i, j, k)*(q_beta(2)%sf(i, j, & + & k) + q_beta(5)%sf(i, j, k)) + end if end do end do end do - $:END_GPU_PARALLEL_LOOP() - else - $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4) - do k = 0, p - do j = 0, n - do i = 0, m - do l = 1, E_idx - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + q_cons_vf(l)%sf(i, j, k)/q_beta(1)%sf(i, j, & - & k)*q_beta(2)%sf(i, j, k) - end if - end do + end do + $:END_GPU_PARALLEL_LOOP() + else + $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + do l = 1, E_idx + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + (q_cons_vf(l)%sf(i, j, k)/q_beta(1)%sf(i, j, & + & k))*q_beta(2)%sf(i, j, k) + end if end do end do end do - $:END_GPU_PARALLEL_LOOP() - end if + end do + $:END_GPU_PARALLEL_LOOP() + end if - do l = 1, num_dims - call s_gradient_dir(q_prim_vf(E_idx)%sf, q_beta(3)%sf, l) + do l = 1, num_dims + call s_gradient_dir(q_prim_vf(E_idx)%sf, q_beta(3)%sf, l) - ! (q / (1 - beta)) * d(beta)/dt source - $:GPU_PARALLEL_LOOP(private='[i, j, k]', collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(contxe + l)%sf(i, j, k) = rhs_vf(contxe + l)%sf(i, j, k) - (1._wp - q_beta(1)%sf(i, j, & - & k))/q_beta(1)%sf(i, j, k)*q_beta(3)%sf(i, j, k) - end if - end do + ! 
(q / (1 - beta)) * d(beta)/dt source + $:GPU_PARALLEL_LOOP(private='[i, j, k]', collapse=3) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(contxe + l)%sf(i, j, k) = rhs_vf(contxe + l)%sf(i, j, k) - (1._wp - q_beta(1)%sf(i, j, & + & k))/q_beta(1)%sf(i, j, k)*q_beta(3)%sf(i, j, k) + end if end do end do - $:END_GPU_PARALLEL_LOOP() + end do + $:END_GPU_PARALLEL_LOOP() - ! source in energy - $:GPU_PARALLEL_LOOP(private='[i, j, k]', collapse=3) - do k = idwbuff(3)%beg, idwbuff(3)%end - do j = idwbuff(2)%beg, idwbuff(2)%end - do i = idwbuff(1)%beg, idwbuff(1)%end - q_beta(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k)*q_prim_vf(contxe + l)%sf(i, j, k) - end do + ! source in energy + $:GPU_PARALLEL_LOOP(private='[i, j, k]', collapse=3) + do k = idwbuff(3)%beg, idwbuff(3)%end + do j = idwbuff(2)%beg, idwbuff(2)%end + do i = idwbuff(1)%beg, idwbuff(1)%end + q_beta(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k)*q_prim_vf(contxe + l)%sf(i, j, k) end do end do - $:END_GPU_PARALLEL_LOOP() + end do + $:END_GPU_PARALLEL_LOOP() - call s_gradient_dir(q_beta(3)%sf, q_beta(4)%sf, l) + call s_gradient_dir(q_beta(3)%sf, q_beta(4)%sf, l) - ! (beta / (1 - beta)) * d(Pu)/dl source - $:GPU_PARALLEL_LOOP(private='[i, j, k]', collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - q_beta(4)%sf(i, j, & - & k)*(1._wp - q_beta(1)%sf(i, j, k))/q_beta(1)%sf(i, j, k) - end if - end do + ! 
(beta / (1 - beta)) * d(Pu)/dl source + $:GPU_PARALLEL_LOOP(private='[i, j, k]', collapse=3) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - q_beta(4)%sf(i, j, & + & k)*(1._wp - q_beta(1)%sf(i, j, k))/q_beta(1)%sf(i, j, k) + end if end do end do - $:END_GPU_PARALLEL_LOOP() end do - end if + $:END_GPU_PARALLEL_LOOP() + end do + call nvtxEndRange end subroutine s_compute_bubbles_EL_source - !> Compute the speed of sound from a given driving pressure + !> This procedure computes the speed of sound from a given driving pressure + !! @param q_prim_vf Primitive variables + !! @param pinf Driving pressure + !! @param cell Bubble cell + !! @param rhol Liquid density + !! @param gamma Liquid specific heat ratio + !! @param pi_inf Liquid stiffness + !! @param cson Calculated speed of sound subroutine s_compute_cson_from_pinf(q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson) $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', parallelism='[seq]', cray_inline=True) @@ -725,26 +867,38 @@ contains end subroutine s_compute_cson_from_pinf - !> Smear the bubble effects onto the Eulerian grid - subroutine s_smear_voidfraction() - - integer :: i, j, k, l + !> The purpose of this subroutine is to smear the effect of the bubbles in the Eulerian framework + subroutine s_smear_voidfraction(bc_type) - call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS") + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer :: i, j, k, l + call nvtxStartRange("BUBBLES-LAGRANGE-SMEARING") $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4) do i = 1, q_beta_idx do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end q_beta(i)%sf(j, k, l) = 0._wp + kahan_comp(i)%sf(j, k, l) = 0._wp end do end do end do end do 
$:END_GPU_PARALLEL_LOOP() - call s_smoothfunction(nBubs, intfc_rad, intfc_vel, mtn_s, mtn_pos, q_beta) + ! Build cell list before smearing (CPU-side counting sort) + call s_build_cell_list(n_el_bubs_loc, mtn_s) + + call s_smoothfunction(n_el_bubs_loc, intfc_rad, intfc_vel, mtn_s, mtn_pos, q_beta, kahan_comp) + + call nvtxStartRange("BUBBLES-LAGRANGE-BETA-COMM") + if (lag_params%cluster_type >= 4) then + call s_populate_beta_buffers(q_beta, bc_type, 3) + else + call s_populate_beta_buffers(q_beta, bc_type, 2) + end if + call nvtxEndRange ! Store 1-beta $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3) @@ -758,12 +912,19 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - call nvtxEndRange end subroutine s_smear_voidfraction - !> Compute the bubble driving pressure p_inf + !> The purpose of this procedure is obtain the bubble driving pressure p_inf + !! @param bub_id Particle identifier + !! @param q_prim_vf Primitive variables + !! @param ptype 1: p at infinity, 2: averaged P at the bubble location + !! @param f_pinfl Driving pressure + !! @param cell Bubble cell + !! @param preterm1 Pre-computed term 1 + !! @param term2 Computed term 2 + !! 
@param Romega Control volume radius subroutine s_get_pinf(bub_id, q_prim_vf, ptype, f_pinfl, cell, preterm1, term2, Romega) $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', cray_inline=True) @@ -773,91 +934,170 @@ contains real(wp), intent(out) :: f_pinfl integer, dimension(3), intent(out) :: cell real(wp), intent(out), optional :: preterm1, term2, Romega - real(wp), dimension(3) :: scoord, psi + real(wp), dimension(3) :: scoord, psi_pos, psi_x, psi_y, psi_z + real(wp) :: xi, eta, zeta real(wp) :: dc, vol, aux real(wp) :: volgas, term1, Rbeq, denom real(wp) :: charvol, charpres, charvol2, charpres2 integer, dimension(3) :: cellaux integer :: i, j, k integer :: smearGrid, smearGridz - logical :: celloutside - scoord = mtn_s(bub_id,1:3,2) f_pinfl = 0._wp - !> Find current bubble cell - cell(:) = int(scoord(:)) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - if (scoord(i) < 0._wp) cell(i) = cell(i) - 1 - end do + if (moving_lag_bubbles) then + cell = fd_number - buff_size + call s_locate_cell(mtn_pos(bub_id,1:3,2), cell, mtn_s(bub_id,1:3,2)) + scoord = mtn_s(bub_id,1:3,2) + else + scoord = mtn_s(bub_id,1:3,2) + cell(:) = int(scoord(:)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + if (scoord(i) < 0._wp) cell(i) = cell(i) - 1 + end do + end if if ((lag_params%cluster_type == 1)) then !> Getting p_cell in terms of only the current cell by interpolation - !> Getting the cell volulme as Omega - if (p > 0) then - vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) - else - if (cyl_coord) then - vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + + if (fd_order == 2) then ! 
Bilinear interpolation + + if (p > 0) then + vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) else - vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + if (cyl_coord) then + vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + else + vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + end if end if - end if - !> Obtain bilinear interpolation coefficients, based on the current location of the bubble. - psi(1) = (scoord(1) - real(cell(1)))*dx(cell(1)) + x_cb(cell(1) - 1) - if (cell(1) == (m + buff_size)) then - cell(1) = cell(1) - 1 - psi(1) = 1._wp - else if (cell(1) == (-buff_size)) then - psi(1) = 0._wp - else - if (psi(1) < x_cc(cell(1))) cell(1) = cell(1) - 1 - psi(1) = abs((psi(1) - x_cc(cell(1)))/(x_cc(cell(1) + 1) - x_cc(cell(1)))) - end if + !> Obtain bilinear interpolation coefficients, based on the current location of the bubble. + psi_pos(1) = (scoord(1) - real(cell(1)))*dx(cell(1)) + x_cb(cell(1) - 1) + psi_pos(1) = abs((psi_pos(1) - x_cc(cell(1)))/(x_cc(cell(1) + 1) - x_cc(cell(1)))) - psi(2) = (scoord(2) - real(cell(2)))*dy(cell(2)) + y_cb(cell(2) - 1) - if (cell(2) == (n + buff_size)) then - cell(2) = cell(2) - 1 - psi(2) = 1._wp - else if (cell(2) == (-buff_size)) then - psi(2) = 0._wp - else - if (psi(2) < y_cc(cell(2))) cell(2) = cell(2) - 1 - psi(2) = abs((psi(2) - y_cc(cell(2)))/(y_cc(cell(2) + 1) - y_cc(cell(2)))) - end if + psi_pos(2) = (scoord(2) - real(cell(2)))*dy(cell(2)) + y_cb(cell(2) - 1) + psi_pos(2) = abs((psi_pos(2) - y_cc(cell(2)))/(y_cc(cell(2) + 1) - y_cc(cell(2)))) - if (p > 0) then - psi(3) = (scoord(3) - real(cell(3)))*dz(cell(3)) + z_cb(cell(3) - 1) - if (cell(3) == (p + buff_size)) then - cell(3) = cell(3) - 1 - psi(3) = 1._wp - else if (cell(3) == (-buff_size)) then - psi(3) = 0._wp + if (p > 0) then + psi_pos(3) = (scoord(3) - real(cell(3)))*dz(cell(3)) + z_cb(cell(3) - 1) + psi_pos(3) = abs((psi_pos(3) - z_cc(cell(3)))/(z_cc(cell(3) + 1) - z_cc(cell(3)))) else - if (psi(3) < z_cc(cell(3))) cell(3) = cell(3) - 1 - psi(3) = 
abs((psi(3) - z_cc(cell(3)))/(z_cc(cell(3) + 1) - z_cc(cell(3)))) + psi_pos(3) = 0._wp end if - else - psi(3) = 0._wp - end if - !> Perform bilinear interpolation - if (p == 0) then ! 2D - f_pinfl = q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3))*(1._wp - psi(1))*(1._wp - psi(2)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2), cell(3))*psi(1)*(1._wp - psi(2)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2) + 1, cell(3))*psi(1)*psi(2) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2) + 1, cell(3))*(1._wp - psi(1))*psi(2) - else ! 3D - f_pinfl = q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3))*(1._wp - psi(1))*(1._wp - psi(2))*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2), cell(3))*psi(1)*(1._wp - psi(2))*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2) + 1, cell(3))*psi(1)*psi(2)*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2) + 1, cell(3))*(1._wp - psi(1))*psi(2)*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3) + 1)*(1._wp - psi(1))*(1._wp - psi(2))*psi(3) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2), cell(3) + 1)*psi(1)*(1._wp - psi(2))*psi(3) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2) + 1, cell(3) + 1)*psi(1)*psi(2)*psi(3) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2) + 1, cell(3) + 1)*(1._wp - psi(1))*psi(2)*psi(3) + ! Calculate bilinear basis functions for each direction For normalized coordinate xi in [0, 1], the two basis + ! functions are: phi_0(xi) = 1 - xi, phi_1(xi) = xi + + ! X-direction basis functions + psi_x(1) = 1._wp - psi_pos(1) ! Left basis function + psi_x(2) = psi_pos(1) ! Right basis function + + ! Y-direction basis functions + psi_y(1) = 1._wp - psi_pos(2) ! Left basis function + psi_y(2) = psi_pos(2) ! Right basis function + + if (p > 0) then + ! Z-direction basis functions + psi_z(1) = 1._wp - psi_pos(3) ! 
Left basis function + psi_z(2) = psi_pos(3) ! Right basis function + else + psi_z(1) = 1._wp + psi_z(2) = 0._wp + end if + + !> Perform bilinear interpolation + f_pinfl = 0._wp + + if (p == 0) then ! 2D - 4 point interpolation (2x2) + do j = 1, 2 + do i = 1, 2 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 1, cell(2) + j - 1, cell(3))*psi_x(i)*psi_y(j) + end do + end do + else + do k = 1, 2 + do j = 1, 2 + do i = 1, 2 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 1, cell(2) + j - 1, & + & cell(3) + k - 1)*psi_x(i)*psi_y(j)*psi_z(k) + end do + end do + end do + end if + else if (fd_order == 4) then ! Biquadratic interpolation + if (p > 0) then + vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) + else + if (cyl_coord) then + vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + else + vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + end if + end if + + !> Obtain biquadratic interpolation coefficients, based on the current location of the bubble. + ! For biquadratic interpolation, we need coefficients for 3 points in each direction + psi_pos(1) = (scoord(1) - real(cell(1)))*dx(cell(1)) + x_cb(cell(1) - 1) + psi_pos(1) = (psi_pos(1) - x_cc(cell(1)))/(x_cc(cell(1) + 1) - x_cc(cell(1))) + + psi_pos(2) = (scoord(2) - real(cell(2)))*dy(cell(2)) + y_cb(cell(2) - 1) + psi_pos(2) = (psi_pos(2) - y_cc(cell(2)))/(y_cc(cell(2) + 1) - y_cc(cell(2))) + + if (p > 0) then + psi_pos(3) = (scoord(3) - real(cell(3)))*dz(cell(3)) + z_cb(cell(3) - 1) + psi_pos(3) = (psi_pos(3) - z_cc(cell(3)))/(z_cc(cell(3) + 1) - z_cc(cell(3))) + else + psi_pos(3) = 0._wp + end if + + ! Calculate biquadratic basis functions for each direction For normalized coordinate xi in [-1, 1], the three basis + ! functions are: phi_0(xi) = xi*(xi-1)/2, phi_1(xi) = (1-xi)*(1+xi), phi_2(xi) = xi*(xi+1)/2 + + ! X-direction basis functions + xi = 2._wp*psi_pos(1) - 1._wp ! Convert to [-1, 1] range + psi_x(1) = xi*(xi - 1._wp)/2._wp ! Left basis function + psi_x(2) = (1._wp - xi)*(1._wp + xi) ! 
Center basis function + psi_x(3) = xi*(xi + 1._wp)/2._wp ! Right basis function + + ! Y-direction basis functions + eta = 2._wp*psi_pos(2) - 1._wp ! Convert to [-1, 1] range + psi_y(1) = eta*(eta - 1._wp)/2._wp ! Left basis function + psi_y(2) = (1._wp - eta)*(1._wp + eta) ! Center basis function + psi_y(3) = eta*(eta + 1._wp)/2._wp ! Right basis function + + if (p > 0) then + ! Z-direction basis functions + zeta = 2._wp*psi_pos(3) - 1._wp ! Convert to [-1, 1] range + psi_z(1) = zeta*(zeta - 1._wp)/2._wp ! Left basis function + psi_z(2) = (1._wp - zeta)*(1._wp + zeta) ! Center basis function + psi_z(3) = zeta*(zeta + 1._wp)/2._wp ! Right basis function + else + psi_z(1) = 0._wp + psi_z(2) = 1._wp + psi_z(3) = 0._wp + end if + + !> Perform biquadratic interpolation + f_pinfl = 0._wp + + if (p == 0) then ! 2D - 9 point interpolation (3x3) + do j = 1, 3 + do i = 1, 3 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 2, cell(2) + j - 2, cell(3))*psi_x(i)*psi_y(j) + end do + end do + else + do k = 1, 3 + do j = 1, 3 + do i = 1, 3 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 2, cell(2) + j - 2, & + & cell(3) + k - 2)*psi_x(i)*psi_y(j)*psi_z(k) + end do + end do + end do + end if end if ! R_Omega @@ -887,56 +1127,25 @@ contains cellaux(3) = cell(3) + k - (mapCells + 1) if (p == 0) cellaux(3) = 0 - !> check if the current cell is outside the computational domain or not (including ghost cells) - celloutside = .false. - if (num_dims == 2) then - if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size)) then - celloutside = .true. - end if - if (cyl_coord .and. y_cc(cellaux(2)) < 0._wp) then - celloutside = .true. - end if - if ((cellaux(2) > n + buff_size) .or. (cellaux(1) > m + buff_size)) then - celloutside = .true. - end if + !> Obtaining the cell volume + if (p > 0) then + vol = dx(cellaux(1))*dy(cellaux(2))*dz(cellaux(3)) else - if ((cellaux(3) < -buff_size) .or. (cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size)) then - celloutside = .true. 
- end if - - if ((cellaux(3) > p + buff_size) .or. (cellaux(2) > n + buff_size) .or. (cellaux(1) > m + buff_size)) & - & then - celloutside = .true. - end if - end if - if (.not. celloutside) then - if (cyl_coord .and. (p == 0) .and. (y_cc(cellaux(2)) < 0._wp)) then - celloutside = .true. - end if - end if - - if (.not. celloutside) then - !> Obtaining the cell volulme - if (p > 0) then - vol = dx(cellaux(1))*dy(cellaux(2))*dz(cellaux(3)) + if (cyl_coord) then + vol = dx(cellaux(1))*dy(cellaux(2))*y_cc(cellaux(2))*2._wp*pi else - if (cyl_coord) then - vol = dx(cellaux(1))*dy(cellaux(2))*y_cc(cellaux(2))*2._wp*pi - else - vol = dx(cellaux(1))*dy(cellaux(2))*lag_params%charwidth - end if + vol = dx(cellaux(1))*dy(cellaux(2))*lag_params%charwidth end if - !> Update values - charvol = charvol + vol - charpres = charpres + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), cellaux(3))*vol - charvol2 = charvol2 + vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) - charpres2 = charpres2 + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), & - & cellaux(3))*vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) end if + !> Update values + charvol = charvol + vol + charpres = charpres + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), cellaux(3))*vol + charvol2 = charvol2 + vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) + charpres2 = charpres2 + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), & + & cellaux(3))*vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) end do end do end do - f_pinfl = charpres2/charvol2 vol = charvol dc = (3._wp*abs(vol)/(4._wp*pi))**(1._wp/3._wp) @@ -966,126 +1175,342 @@ contains end subroutine s_get_pinf - !> Update Lagrangian bubble variables using TVD Runge-Kutta time stepping - impure subroutine s_update_lagrange_tdv_rk(stage) + !> This subroutine updates the Lagrange variables using the tvd RK time steppers. The time derivative of the bubble variables + !! must be stored at every stage to avoid precision errors. + !! 
@param stage Current tvd RK stage + impure subroutine s_update_lagrange_tdv_rk(q_prim_vf, bc_type, stage) - integer, intent(in) :: stage - integer :: k + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer, intent(in) :: stage + integer :: k if (time_stepper == 1) then ! 1st order TVD RK + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc ! u{1} = u{n} + dt * RHS{n} intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) intfc_vel(k, 1) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k,1:3,1) = mtn_pos(k,1:3,1) + dt*mtn_dposdt(k,1:3,1) - mtn_vel(k,1:3,1) = mtn_vel(k,1:3,1) + dt*mtn_dveldt(k,1:3,1) gas_p(k, 1) = gas_p(k, 1) + dt*gas_dpdt(k, 1) gas_mv(k, 1) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + if (moving_lag_bubbles) then + mtn_posPrev(k,1:3,1) = mtn_pos(k,1:3,1) + mtn_pos(k,1:3,1) = mtn_pos(k,1:3,1) + dt*mtn_dposdt(k,1:3,1) + mtn_vel(k,1:3,1) = mtn_vel(k,1:3,1) + dt*mtn_dveldt(k,1:3,1) + end if end do $:END_GPU_PARALLEL_LOOP() call s_transfer_data_to_tmp() - call s_write_void_evol(mytime) + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() - if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p, gas_mv, intfc_rad, intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if else if (time_stepper == 2) then ! 2nd order TVD RK if (stage == 1) then $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc ! 
u{1} = u{n} + dt * RHS{n} intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k,1:3,2) = mtn_pos(k,1:3,1) + dt*mtn_dposdt(k,1:3,1) - mtn_vel(k,1:3,2) = mtn_vel(k,1:3,1) + dt*mtn_dveldt(k,1:3,1) gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + if (moving_lag_bubbles) then + mtn_posPrev(k,1:3,2) = mtn_pos(k,1:3,1) + mtn_pos(k,1:3,2) = mtn_pos(k,1:3,1) + dt*mtn_dposdt(k,1:3,1) + mtn_vel(k,1:3,2) = mtn_vel(k,1:3,1) + dt*mtn_dveldt(k,1:3,1) + end if end do $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) else if (stage == 2) then $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc ! u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1}) intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp intfc_vel(k, 1) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/2._wp - mtn_pos(k,1:3,1) = mtn_pos(k,1:3,1) + dt*(mtn_dposdt(k,1:3,1) + mtn_dposdt(k,1:3,2))/2._wp - mtn_vel(k,1:3,1) = mtn_vel(k,1:3,1) + dt*(mtn_dveldt(k,1:3,1) + mtn_dveldt(k,1:3,2))/2._wp gas_p(k, 1) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/2._wp gas_mv(k, 1) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/2._wp + if (moving_lag_bubbles) then + mtn_posPrev(k,1:3,1) = mtn_pos(k,1:3,2) + mtn_pos(k,1:3,1) = mtn_pos(k,1:3,1) + dt*(mtn_dposdt(k,1:3,1) + mtn_dposdt(k,1:3,2))/2._wp + mtn_vel(k,1:3,1) = mtn_vel(k,1:3,1) + dt*(mtn_dveldt(k,1:3,1) + mtn_dveldt(k,1:3,2))/2._wp + end if end do $:END_GPU_PARALLEL_LOOP() call s_transfer_data_to_tmp() - call s_write_void_evol(mytime) + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) if (lag_params%write_bubbles_stats) call 
s_calculate_lag_bubble_stats() - if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p, gas_mv, intfc_rad, intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if end if else if (time_stepper == 3) then ! 3rd order TVD RK if (stage == 1) then $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc ! u{1} = u{n} + dt * RHS{n} intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k,1:3,2) = mtn_pos(k,1:3,1) + dt*mtn_dposdt(k,1:3,1) - mtn_vel(k,1:3,2) = mtn_vel(k,1:3,1) + dt*mtn_dveldt(k,1:3,1) gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + if (moving_lag_bubbles) then + mtn_posPrev(k,1:3,2) = mtn_pos(k,1:3,1) + mtn_pos(k,1:3,2) = mtn_pos(k,1:3,1) + dt*mtn_dposdt(k,1:3,1) + mtn_vel(k,1:3,2) = mtn_vel(k,1:3,1) + dt*mtn_dveldt(k,1:3,1) + end if end do $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) else if (stage == 2) then $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc ! 
u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}] intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp intfc_vel(k, 2) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/4._wp - mtn_pos(k,1:3,2) = mtn_pos(k,1:3,1) + dt*(mtn_dposdt(k,1:3,1) + mtn_dposdt(k,1:3,2))/4._wp - mtn_vel(k,1:3,2) = mtn_vel(k,1:3,1) + dt*(mtn_dveldt(k,1:3,1) + mtn_dveldt(k,1:3,2))/4._wp gas_p(k, 2) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/4._wp gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp + if (moving_lag_bubbles) then + mtn_posPrev(k,1:3,2) = mtn_pos(k,1:3,2) + mtn_pos(k,1:3,2) = mtn_pos(k,1:3,1) + dt*(mtn_dposdt(k,1:3,1) + mtn_dposdt(k,1:3,2))/4._wp + mtn_vel(k,1:3,2) = mtn_vel(k,1:3,1) + dt*(mtn_dveldt(k,1:3,1) + mtn_dveldt(k,1:3,2))/4._wp + end if end do $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) else if (stage == 3) then $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc ! 
u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}] intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, & & 2)/4._wp + intfc_draddt(k, 3)) intfc_vel(k, 1) = intfc_vel(k, 1) + (2._wp/3._wp)*dt*(intfc_dveldt(k, 1)/4._wp + intfc_dveldt(k, & & 2)/4._wp + intfc_dveldt(k, 3)) - mtn_pos(k,1:3,1) = mtn_pos(k,1:3,1) + (2._wp/3._wp)*dt*(mtn_dposdt(k,1:3,1)/4._wp + mtn_dposdt(k,1:3, & - & 2)/4._wp + mtn_dposdt(k,1:3,3)) - mtn_vel(k,1:3,1) = mtn_vel(k,1:3,1) + (2._wp/3._wp)*dt*(mtn_dveldt(k,1:3,1)/4._wp + mtn_dveldt(k,1:3, & - & 2)/4._wp + mtn_dveldt(k,1:3,3)) gas_p(k, 1) = gas_p(k, 1) + (2._wp/3._wp)*dt*(gas_dpdt(k, 1)/4._wp + gas_dpdt(k, 2)/4._wp + gas_dpdt(k, 3)) gas_mv(k, 1) = gas_mv(k, 1) + (2._wp/3._wp)*dt*(gas_dmvdt(k, 1)/4._wp + gas_dmvdt(k, 2)/4._wp + gas_dmvdt(k, 3)) + if (moving_lag_bubbles) then + mtn_posPrev(k,1:3,1) = mtn_pos(k,1:3,2) + mtn_pos(k,1:3,1) = mtn_pos(k,1:3,1) + (2._wp/3._wp)*dt*(mtn_dposdt(k,1:3,1)/4._wp + mtn_dposdt(k,1:3, & + & 2)/4._wp + mtn_dposdt(k,1:3,3)) + mtn_vel(k,1:3,1) = mtn_vel(k,1:3,1) + (2._wp/3._wp)*dt*(mtn_dveldt(k,1:3,1)/4._wp + mtn_dveldt(k,1:3, & + & 2)/4._wp + mtn_dveldt(k,1:3,3)) + end if end do $:END_GPU_PARALLEL_LOOP() call s_transfer_data_to_tmp() - call s_write_void_evol(mytime) + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() - if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p, gas_mv, gas_mg, intfc_rad, intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if end if end if end subroutine s_update_lagrange_tdv_rk - !> Locate the cell index for a given physical position + !> This subroutine enforces reflective and wall boundary conditions for EL bubbles + !! 
@param dest Destination for the bubble position update + impure subroutine s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + integer :: k, i, q + integer :: patch_id, newBubs, new_idx + real(wp) :: offset + integer, dimension(3) :: cell + + call nvtxStartRange("LAG-BC") + call nvtxStartRange("LAG-BC-DEV2HOST") + $:GPU_UPDATE(host='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, & + & intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, & + & gas_dmvdt, mtn_dposdt, mtn_dveldt, keep_bubble, n_el_bubs_loc, wrap_bubble_dir, wrap_bubble_loc]') + call nvtxEndRange + + ! Handle MPI transfer of bubbles going to another processor's local domain + if (num_procs > 1) then + call nvtxStartRange("LAG-BC-TRANSFER-LIST") + call s_add_particles_to_transfer_list(n_el_bubs_loc, mtn_pos(:,:,2), mtn_posPrev(:,:,2)) + call nvtxEndRange + + call nvtxStartRange("LAG-BC-SENDRECV") + call s_mpi_sendrecv_particles(bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, lag_id, & + & gas_p, gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, & + & intfc_draddt, intfc_dveldt, gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, lag_num_ts, & + & n_el_bubs_loc, 2) + call nvtxEndRange + end if + + call nvtxStartRange("LAG-BC-HOST2DEV") + $:GPU_UPDATE(device='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, & + & intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, & + & gas_dmvdt, mtn_dposdt, mtn_dveldt, n_el_bubs_loc]') + call nvtxEndRange + + $:GPU_PARALLEL_LOOP(private='[k, cell]') + do k = 1, n_el_bubs_loc + keep_bubble(k) = 1 + wrap_bubble_loc(k,:) = 0 + wrap_bubble_dir(k,:) = 0 + + ! 
Relocate bubbles at solid boundaries and delete bubbles that leave buffer regions + if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. mtn_pos(k, 1, & + & 2) < x_cb(-1) + intfc_rad(k, 2)) then + mtn_pos(k, 1, 2) = x_cb(-1) + intfc_rad(k, 2) + else if (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. mtn_pos(k, 1, & + & 2) > x_cb(m) - intfc_rad(k, 2)) then + mtn_pos(k, 1, 2) = x_cb(m) - intfc_rad(k, 2) + else if (bc_x%beg == BC_PERIODIC .and. mtn_pos(k, 1, 2) < pcomm_coords(1)%beg .and. mtn_posPrev(k, 1, & + & 2) >= pcomm_coords(1)%beg) then + wrap_bubble_dir(k, 1) = 1 + wrap_bubble_loc(k, 1) = -1 + else if (bc_x%end == BC_PERIODIC .and. mtn_pos(k, 1, 2) > pcomm_coords(1)%end .and. mtn_posPrev(k, 1, & + & 2) <= pcomm_coords(1)%end) then + wrap_bubble_dir(k, 1) = 1 + wrap_bubble_loc(k, 1) = 1 + else if (mtn_pos(k, 1, 2) >= x_cb(m)) then + keep_bubble(k) = 0 + else if (mtn_pos(k, 1, 2) < x_cb(-1)) then + keep_bubble(k) = 0 + end if + + if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. mtn_pos(k, 2, & + & 2) < y_cb(-1) + intfc_rad(k, 2)) then + mtn_pos(k, 2, 2) = y_cb(-1) + intfc_rad(k, 2) + else if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. mtn_pos(k, 2, & + & 2) > y_cb(n) - intfc_rad(k, 2)) then + mtn_pos(k, 2, 2) = y_cb(n) - intfc_rad(k, 2) + else if (bc_y%beg == BC_PERIODIC .and. mtn_pos(k, 2, 2) < pcomm_coords(2)%beg .and. mtn_posPrev(k, 2, & + & 2) >= pcomm_coords(2)%beg) then + wrap_bubble_dir(k, 2) = 1 + wrap_bubble_loc(k, 2) = -1 + else if (bc_y%end == BC_PERIODIC .and. mtn_pos(k, 2, 2) > pcomm_coords(2)%end .and. 
mtn_posPrev(k, 2, & + & 2) <= pcomm_coords(2)%end) then + wrap_bubble_dir(k, 2) = 1 + wrap_bubble_loc(k, 2) = 1 + else if (mtn_pos(k, 2, 2) >= y_cb(n)) then + keep_bubble(k) = 0 + else if (mtn_pos(k, 2, 2) < y_cb(-1)) then + keep_bubble(k) = 0 + end if + + if (p > 0) then + if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. mtn_pos(k, 3, & + & 2) < z_cb(-1) + intfc_rad(k, 2)) then + mtn_pos(k, 3, 2) = z_cb(-1) + intfc_rad(k, 2) + else if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. mtn_pos(k, 3, & + & 2) > z_cb(p) - intfc_rad(k, 2)) then + mtn_pos(k, 3, 2) = z_cb(p) - intfc_rad(k, 2) + else if (bc_z%beg == BC_PERIODIC .and. mtn_pos(k, 3, 2) < pcomm_coords(3)%beg .and. mtn_posPrev(k, 3, & + & 2) >= pcomm_coords(3)%beg) then + wrap_bubble_dir(k, 3) = 1 + wrap_bubble_loc(k, 3) = -1 + else if (bc_z%end == BC_PERIODIC .and. mtn_pos(k, 3, 2) > pcomm_coords(3)%end .and. mtn_posPrev(k, 3, & + & 2) <= pcomm_coords(3)%end) then + wrap_bubble_dir(k, 3) = 1 + wrap_bubble_loc(k, 3) = 1 + else if (mtn_pos(k, 3, 2) >= z_cb(p)) then + keep_bubble(k) = 0 + else if (mtn_pos(k, 3, 2) < z_cb(-1)) then + keep_bubble(k) = 0 + end if + end if + + if (keep_bubble(k) == 1) then + ! 
Remove bubbles that are no longer in a liquid + cell = fd_number - buff_size + call s_locate_cell(mtn_pos(k,1:3,2), cell, mtn_s(k,1:3,2)) + + if (q_prim_vf(advxb)%sf(cell(1), cell(2), cell(3)) < (1._wp - lag_params%valmaxvoid)) then + keep_bubble(k) = 0 + end if + end if + end do + $:END_GPU_PARALLEL_LOOP() + + if (n_el_bubs_loc > 0) then + call nvtxStartRange("LAG-BC-DEV2HOST") + $:GPU_UPDATE(host='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, & + & intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, & + & gas_dmvdt, mtn_dposdt, mtn_dveldt, keep_bubble, n_el_bubs_loc, wrap_bubble_dir, wrap_bubble_loc]') + call nvtxEndRange + + newBubs = 0 + do k = 1, n_el_bubs_loc + if (keep_bubble(k) == 1) then + newBubs = newBubs + 1 + if (newBubs /= k) then + call s_copy_lag_bubble(newBubs, k) + wrap_bubble_dir(newBubs,:) = wrap_bubble_dir(k,:) + wrap_bubble_loc(newBubs,:) = wrap_bubble_loc(k,:) + end if + end if + end do + n_el_bubs_loc = newBubs + + ! 
Handle periodic wrapping of bubbles on same processor + do k = 1, n_el_bubs_loc + if (any(wrap_bubble_dir(k,:) == 1)) then + do i = 1, num_dims + if (wrap_bubble_dir(k, i) == 1) then + offset = glb_bounds(i)%end - glb_bounds(i)%beg + if (wrap_bubble_loc(k, i) == 1) then + do q = 1, 2 + mtn_pos(k, i, q) = mtn_pos(k, i, q) - offset + mtn_posPrev(k, i, q) = mtn_posPrev(k, i, q) - offset + end do + else if (wrap_bubble_loc(k, i) == -1) then + do q = 1, 2 + mtn_pos(k, i, q) = mtn_pos(k, i, q) + offset + mtn_posPrev(k, i, q) = mtn_posPrev(k, i, q) + offset + end do + end if + end if + end do + end if + end do + call nvtxStartRange("LAG-BC-HOST2DEV") + $:GPU_UPDATE(device='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, lag_id, gas_p, & + & gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, & + & gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, n_el_bubs_loc]') + call nvtxEndRange + end if + + $:GPU_PARALLEL_LOOP(private='[cell]') + do k = 1, n_el_bubs_loc + cell = fd_number - buff_size + call s_locate_cell(mtn_pos(k,1:3,2), cell, mtn_s(k,1:3,2)) + end do + + call nvtxEndRange + + end subroutine s_enforce_EL_bubbles_boundary_conditions + + !> This subroutine returns the computational coordinate of the cell for the given position. + !! @param pos Input coordinates + !! @param cell Computational coordinate of the cell + !! 
@param scoord Calculated particle coordinates subroutine s_locate_cell(pos, cell, scoord) + $:GPU_ROUTINE(function_name='s_locate_cell',parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: pos real(wp), dimension(3), intent(out) :: scoord integer, dimension(3), intent(inout) :: cell @@ -1095,7 +1520,7 @@ contains cell(1) = cell(1) - 1 end do - do while (pos(1) > x_cb(cell(1))) + do while (pos(1) >= x_cb(cell(1))) cell(1) = cell(1) + 1 end do @@ -1103,7 +1528,7 @@ contains cell(2) = cell(2) - 1 end do - do while (pos(2) > y_cb(cell(2))) + do while (pos(2) >= y_cb(cell(2))) cell(2) = cell(2) + 1 end do @@ -1111,7 +1536,7 @@ contains do while (pos(3) < z_cb(cell(3) - 1)) cell(3) = cell(3) - 1 end do - do while (pos(3) > z_cb(cell(3))) + do while (pos(3) >= z_cb(cell(3))) cell(3) = cell(3) + 1 end do end if @@ -1132,13 +1557,13 @@ contains end subroutine s_locate_cell - !> Transfer data into the temporal variables + !> This subroutine transfer data into the temporal variables. impure subroutine s_transfer_data_to_tmp() integer :: k $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc gas_p(k, 2) = gas_p(k, 1) gas_mv(k, 2) = gas_mv(k, 1) intfc_rad(k, 2) = intfc_rad(k, 1) @@ -1152,7 +1577,9 @@ contains end subroutine s_transfer_data_to_tmp - !> Determine if a bubble position lies within the current MPI subdomain including ghost cells + !> The purpose of this procedure is to determine if the global coordinates of the bubbles are present in the current MPI + !! processor (including ghost cells). + !! @param pos_part Spatial coordinates of the bubble function particle_in_domain(pos_part) logical :: particle_in_domain @@ -1162,22 +1589,23 @@ contains if (p == 0 .and. cyl_coord .neqv. .true.) then ! Defining a virtual z-axis that has the same dimensions as y-axis defined in the input file - particle_in_domain = ((pos_part(1) < x_cb(m + buff_size)) .and. (pos_part(1) >= x_cb(-buff_size - 1)) & - & .and. 
(pos_part(2) < y_cb(n + buff_size)) .and. (pos_part(2) >= y_cb(-buff_size - 1)) & - & .and. (pos_part(3) < lag_params%charwidth/2._wp) .and. (pos_part(3) >= & - & -lag_params%charwidth/2._wp)) + particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. (pos_part(1) >= x_cb(fd_number & + & - buff_size - 1)) .and. (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. (pos_part(2) & + & >= y_cb(fd_number - buff_size - 1)) .and. (pos_part(3) < lag_params%charwidth/2._wp) & + & .and. (pos_part(3) > -lag_params%charwidth/2._wp)) else ! cyl_coord - particle_in_domain = ((pos_part(1) < x_cb(m + buff_size)) .and. (pos_part(1) >= x_cb(-buff_size - 1)) & - & .and. (abs(pos_part(2)) < y_cb(n + buff_size)) .and. (abs(pos_part(2)) >= max(y_cb(-buff_size & - & - 1), 0._wp))) + particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. (pos_part(1) >= x_cb(fd_number & + & - buff_size - 1)) .and. (abs(pos_part(2)) < y_cb(n + buff_size - fd_number)) & + & .and. (abs(pos_part(2)) >= max(y_cb(fd_number - buff_size - 1), 0._wp))) end if ! 3D if (p > 0) then - particle_in_domain = ((pos_part(1) < x_cb(m + buff_size)) .and. (pos_part(1) >= x_cb(-buff_size - 1)) & - & .and. (pos_part(2) < y_cb(n + buff_size)) .and. (pos_part(2) >= y_cb(-buff_size - 1)) & - & .and. (pos_part(3) < z_cb(p + buff_size)) .and. (pos_part(3) >= z_cb(-buff_size - 1))) + particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. (pos_part(1) >= x_cb(fd_number & + & - buff_size - 1)) .and. (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. (pos_part(2) & + & >= y_cb(fd_number - buff_size - 1)) .and. (pos_part(3) < z_cb(p + buff_size - fd_number)) & + & .and. (pos_part(3) >= z_cb(fd_number - buff_size - 1))) end if ! 
For symmetric and wall boundary condition @@ -1204,7 +1632,9 @@ contains end function particle_in_domain - !> Determine if a Lagrangian bubble is within the physical domain excluding ghost cells + !> The purpose of this procedure is to determine if the lagrangian bubble is located in the physical domain. The ghost cells are + !! not part of the physical domain. + !! @param pos_part Spatial coordinates of the bubble function particle_in_domain_physical(pos_part) logical :: particle_in_domain_physical @@ -1220,7 +1650,11 @@ contains end function particle_in_domain_physical - !> Compute the gradient of a scalar field using second-order central differences on a non-uniform grid + !> The purpose of this procedure is to calculate the gradient of a scalar field along the x, y and z directions following a + !! second-order central difference considering uneven widths + !! @param q Input scalar field + !! @param dq Output gradient of q + !! @param dir Gradient spatial direction subroutine s_gradient_dir(q, dq, dir) real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:), intent(inout) :: q, dq @@ -1270,7 +1704,8 @@ contains end subroutine s_gradient_dir - !> Write Lagrangian bubble state data at each time step + !> Subroutine that writes on each time step the changes of the lagrangian bubbles. + !! @param qtime Current time impure subroutine s_write_lag_particles(qtime) real(wp), intent(in) :: qtime @@ -1281,7 +1716,34 @@ contains write (file_loc, '(A,I0,A)') 'lag_bubble_evol_', proc_rank, '.dat' file_loc = trim(case_dir) // '/D/' // trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + call my_inquire(trim(file_loc), file_exist) + + if (precision == 1) then + FMT = "(A16,A14,8A16)" + else + FMT = "(A24,A14,8A24)" + end if + + if (.not. 
file_exist) then + open (LAG_EVOL_ID, FILE=trim(file_loc), form='formatted', position='rewind') + write (LAG_EVOL_ID, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', 'coreVaporMass', 'coreVaporConcentration', & + & 'radius', 'interfaceVelocity', 'corePressure' + else + open (LAG_EVOL_ID, FILE=trim(file_loc), form='formatted', position='append') + end if + + end subroutine s_write_lag_particles + + !> @Brief Subroutine that opens the file to write the evolution of the lagrangian bubbles on each time step. + impure subroutine s_open_lag_bubble_evol() + + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + character(LEN=25) :: FMT + + write (file_loc, '(A,I0,A)') 'lag_bubble_evol_', proc_rank, '.dat' + file_loc = trim(case_dir) // '/D/' // trim(file_loc) + call my_inquire(trim(file_loc), file_exist) if (precision == 1) then FMT = "(A16,A14,8A16)" @@ -1290,13 +1752,25 @@ contains end if if (.not. file_exist) then - open (11, FILE=trim(file_loc), form='formatted', position='rewind') - write (11, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', 'coreVaporMass', 'coreVaporConcentration', 'radius', & - & 'interfaceVelocity', 'corePressure' + open (LAG_EVOL_ID, FILE=trim(file_loc), form='formatted', position='rewind') + write (LAG_EVOL_ID, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', 'coreVaporMass', 'coreVaporConcentration', & + & 'radius', 'interfaceVelocity', 'corePressure' else - open (11, FILE=trim(file_loc), form='formatted', position='append') + open (LAG_EVOL_ID, FILE=trim(file_loc), form='formatted', position='append') end if + end subroutine s_open_lag_bubble_evol + + !> Subroutine that writes on each time step the changes of the lagrangian bubbles. + !! 
@param q_time Current time + impure subroutine s_write_lag_bubble_evol(qtime) + + real(wp), intent(in) :: qtime + integer :: k, ios + character(LEN=25) :: FMT + character(LEN=path_len + 2*name_len) :: file_loc, path + logical :: file_exist + if (precision == 1) then FMT = "(F16.8,I14,8F16.8)" else @@ -1304,37 +1778,53 @@ contains end if ! Cycle through list - do k = 1, nBubs - write (11, FMT) qtime, lag_id(k, 1), mtn_pos(k, 1, 1), mtn_pos(k, 2, 1), mtn_pos(k, 3, 1), gas_mv(k, 1), gas_mv(k, & - & 1)/(gas_mv(k, 1) + gas_mg(k)), intfc_rad(k, 1), intfc_vel(k, 1), gas_p(k, 1) + do k = 1, n_el_bubs_loc + write (LAG_EVOL_ID, FMT) qtime, lag_id(k, 1), mtn_pos(k, 1, 1), mtn_pos(k, 2, 1), mtn_pos(k, 3, 1), gas_mv(k, 1), & + & gas_mv(k, 1)/(gas_mv(k, 1) + gas_mg(k)), intfc_rad(k, 1), intfc_vel(k, 1), gas_p(k, 1) end do - close (11) + end subroutine s_write_lag_bubble_evol - end subroutine s_write_lag_particles + impure subroutine s_close_lag_bubble_evol - !> Write void fraction statistics at each time step - impure subroutine s_write_void_evol(qtime) + close (LAG_EVOL_ID) + + end subroutine s_close_lag_bubble_evol + + subroutine s_open_void_evol - real(wp), intent(in) :: qtime - real(wp) :: volcell, voltot - real(wp) :: lag_void_max, lag_void_avg, lag_vol - real(wp) :: void_max_glb, void_avg_glb, vol_glb - integer :: i, j, k character(LEN=path_len + 2*name_len) :: file_loc logical :: file_exist if (proc_rank == 0) then write (file_loc, '(A)') 'voidfraction.dat' file_loc = trim(case_dir) // '/D/' // trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + call my_inquire(trim(file_loc), file_exist) if (.not. file_exist) then - open (12, FILE=trim(file_loc), form='formatted', position='rewind') + open (LAG_VOID_ID, FILE=trim(file_loc), form='formatted', position='rewind') + ! write (12, *) 'currentTime, averageVoidFraction, ', & 'maximumVoidFraction, totalParticlesVolume' write (12, *) + ! 
'The averageVoidFraction value does ', & 'not reflect the real void fraction in the cloud since the ', & 'cells + ! which do not have bubbles are not accounted' else - open (12, FILE=trim(file_loc), form='formatted', position='append') + open (LAG_VOID_ID, FILE=trim(file_loc), form='formatted', position='append') end if end if + end subroutine s_open_void_evol + + !> Subroutine that writes some useful statistics related to the volume fraction of the particles (void fraction) in the + !! computational domain on each time step. + !! @param qtime Current time + impure subroutine s_write_void_evol(qtime) + + real(wp), intent(in) :: qtime + real(wp) :: volcell, voltot + real(wp) :: lag_void_max, lag_void_avg, lag_vol + real(wp) :: void_max_glb, void_avg_glb, vol_glb + integer :: i, j, k + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + lag_void_max = 0._wp lag_void_avg = 0._wp lag_vol = 0._wp @@ -1370,13 +1860,19 @@ contains if (lag_vol > 0._wp) lag_void_avg = lag_void_avg/lag_vol if (proc_rank == 0) then - write (12, '(6X,4e24.8)') qtime, lag_void_avg, lag_void_max, voltot - close (12) + write (LAG_VOID_ID, '(6X,4e24.8)') qtime, lag_void_avg, lag_void_max, voltot end if end subroutine s_write_void_evol - !> Write restart files for the Lagrangian bubble solver + subroutine s_close_void_evol + + if (proc_rank == 0) close (LAG_VOID_ID) + + end subroutine s_close_void_evol + + !> Subroutine that writes the restarting files for the particles in the lagrangian solver. + !! @param t_step Current time step impure subroutine s_write_restart_lag_bubbles(t_step) ! 
Generic string used to store the address of a particular file @@ -1393,14 +1889,14 @@ contains integer(KIND=MPI_OFFSET_KIND) :: disp integer :: view integer, dimension(2) :: gsizes, lsizes, start_idx_part - integer, allocatable :: proc_bubble_counts(:) + integer, dimension(num_procs) :: part_order, part_ord_mpi + integer, dimension(num_procs) :: proc_bubble_counts real(wp), dimension(1:1,1:lag_io_vars) :: dummy - dummy = 0._wp bub_id = 0._wp - if (nBubs /= 0) then - do k = 1, nBubs + if (n_el_bubs_loc /= 0) then + do k = 1, n_el_bubs_loc if (particle_in_domain_physical(mtn_pos(k,1:3,1))) then bub_id = bub_id + 1 end if @@ -1409,8 +1905,6 @@ contains if (.not. parallel_io) return - allocate (proc_bubble_counts(num_procs)) - lsizes(1) = bub_id lsizes(2) = lag_io_vars @@ -1458,26 +1952,22 @@ contains if (bub_id > 0) then allocate (MPI_IO_DATA_lag_bubbles(max(1, bub_id),1:lag_io_vars)) - i = 1 - do k = 1, nBubs - if (particle_in_domain_physical(mtn_pos(k,1:3,1))) then - MPI_IO_DATA_lag_bubbles(i, 1) = real(lag_id(k, 1)) - MPI_IO_DATA_lag_bubbles(i,2:4) = mtn_pos(k,1:3,1) - MPI_IO_DATA_lag_bubbles(i,5:7) = mtn_posPrev(k,1:3,1) - MPI_IO_DATA_lag_bubbles(i,8:10) = mtn_vel(k,1:3,1) - MPI_IO_DATA_lag_bubbles(i, 11) = intfc_rad(k, 1) - MPI_IO_DATA_lag_bubbles(i, 12) = intfc_vel(k, 1) - MPI_IO_DATA_lag_bubbles(i, 13) = bub_R0(k) - MPI_IO_DATA_lag_bubbles(i, 14) = Rmax_stats(k) - MPI_IO_DATA_lag_bubbles(i, 15) = Rmin_stats(k) - MPI_IO_DATA_lag_bubbles(i, 16) = bub_dphidt(k) - MPI_IO_DATA_lag_bubbles(i, 17) = gas_p(k, 1) - MPI_IO_DATA_lag_bubbles(i, 18) = gas_mv(k, 1) - MPI_IO_DATA_lag_bubbles(i, 19) = gas_mg(k) - MPI_IO_DATA_lag_bubbles(i, 20) = gas_betaT(k) - MPI_IO_DATA_lag_bubbles(i, 21) = gas_betaC(k) - i = i + 1 - end if + do k = 1, n_el_bubs_loc + MPI_IO_DATA_lag_bubbles(k, 1) = real(lag_id(k, 1)) + MPI_IO_DATA_lag_bubbles(k,2:4) = mtn_pos(k,1:3,1) + MPI_IO_DATA_lag_bubbles(k,5:7) = mtn_posPrev(k,1:3,1) + MPI_IO_DATA_lag_bubbles(k,8:10) = mtn_vel(k,1:3,1) + 
MPI_IO_DATA_lag_bubbles(k, 11) = intfc_rad(k, 1) + MPI_IO_DATA_lag_bubbles(k, 12) = intfc_vel(k, 1) + MPI_IO_DATA_lag_bubbles(k, 13) = bub_R0(k) + MPI_IO_DATA_lag_bubbles(k, 14) = Rmax_stats(k) + MPI_IO_DATA_lag_bubbles(k, 15) = Rmin_stats(k) + MPI_IO_DATA_lag_bubbles(k, 16) = bub_dphidt(k) + MPI_IO_DATA_lag_bubbles(k, 17) = gas_p(k, 1) + MPI_IO_DATA_lag_bubbles(k, 18) = gas_mv(k, 1) + MPI_IO_DATA_lag_bubbles(k, 19) = gas_mg(k) + MPI_IO_DATA_lag_bubbles(k, 20) = gas_betaT(k) + MPI_IO_DATA_lag_bubbles(k, 21) = gas_betaC(k) end do call MPI_TYPE_CREATE_SUBARRAY(2, gsizes, lsizes, start_idx_part, MPI_ORDER_FORTRAN, mpi_p, view, ierr) @@ -1510,20 +2000,18 @@ contains call MPI_FILE_CLOSE(ifile, ierr) end if - - deallocate (proc_bubble_counts) #endif end subroutine s_write_restart_lag_bubbles - !> Compute the maximum and minimum radius of each bubble + !> This procedure calculates the maximum and minimum radius of each bubble. subroutine s_calculate_lag_bubble_stats() integer :: k $:GPU_PARALLEL_LOOP(private='[k]', reduction='[[Rmax_glb], [Rmin_glb]]', reductionOp='[MAX, MIN]', & & copy='[Rmax_glb, Rmin_glb]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k)) Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k)) Rmax_stats(k) = max(Rmax_stats(k), intfc_rad(k, 1)/bub_R0(k)) @@ -1533,17 +2021,15 @@ contains end subroutine s_calculate_lag_bubble_stats - !> Write the maximum and minimum radius statistics for each bubble - impure subroutine s_write_lag_bubble_stats() + impure subroutine s_open_lag_bubble_stats() - integer :: k character(LEN=path_len + 2*name_len) :: file_loc - character(len=20) :: FMT + character(LEN=20) :: FMT + logical :: file_exist write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat' file_loc = trim(case_dir) // '/D/' // trim(file_loc) - - $:GPU_UPDATE(host='[Rmax_glb, Rmin_glb]') + call my_inquire(trim(file_loc), file_exist) if (precision == 1) then FMT = "(A10,A14,5A16)" @@ -1551,68 +2037,89 @@ 
contains FMT = "(A10,A14,5A24)" end if - open (13, FILE=trim(file_loc), form='formatted', position='rewind') - write (13, FMT) 'proc_rank', 'particleID', 'x', 'y', 'z', 'Rmax_glb', 'Rmin_glb' - - if (precision == 1) then - FMT = "(I10,I14,5F16.8)" + if (.not. file_exist) then + open (LAG_STATS_ID, FILE=trim(file_loc), form='formatted', position='rewind') + write (LAG_STATS_ID, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb' else - FMT = "(I10,I14,5F24.16)" + open (LAG_STATS_ID, FILE=trim(file_loc), form='formatted', position='append') end if - do k = 1, nBubs - write (13, FMT) proc_rank, lag_id(k, 1), mtn_pos(k, 1, 1), mtn_pos(k, 2, 1), mtn_pos(k, 3, 1), Rmax_stats(k), & - & Rmin_stats(k) - end do + end subroutine s_open_lag_bubble_stats - close (13) + !> Subroutine that writes the maximum and minimum radius of each bubble. + impure subroutine s_write_lag_bubble_stats() - end subroutine s_write_lag_bubble_stats + integer :: k + character(LEN=path_len + 2*name_len) :: file_loc + character(LEN=20) :: FMT - !> Remove a specific Lagrangian bubble when dt becomes too small - impure subroutine s_remove_lag_bubble(bub_id) + $:GPU_UPDATE(host='[Rmax_glb, Rmin_glb]') - integer, intent(in) :: bub_id - integer :: i + if (precision == 1) then + FMT = "(I10,I14,5F16.8)" + else + FMT = "(I10,I14,5F24.16)" + end if - $:GPU_LOOP(parallelism='[seq]') - do i = bub_id, nBubs - 1 - lag_id(i, 1) = lag_id(i + 1, 1) - bub_R0(i) = bub_R0(i + 1) - Rmax_stats(i) = Rmax_stats(i + 1) - Rmin_stats(i) = Rmin_stats(i + 1) - gas_mg(i) = gas_mg(i + 1) - gas_betaT(i) = gas_betaT(i + 1) - gas_betaC(i) = gas_betaC(i + 1) - bub_dphidt(i) = bub_dphidt(i + 1) - gas_p(i,1:2) = gas_p(i + 1,1:2) - gas_mv(i,1:2) = gas_mv(i + 1,1:2) - intfc_rad(i,1:2) = intfc_rad(i + 1,1:2) - intfc_vel(i,1:2) = intfc_vel(i + 1,1:2) - mtn_pos(i,1:3,1:2) = mtn_pos(i + 1,1:3,1:2) - mtn_posPrev(i,1:3,1:2) = mtn_posPrev(i + 1,1:3,1:2) - mtn_vel(i,1:3,1:2) = mtn_vel(i + 1,1:3,1:2) - mtn_s(i,1:3,1:2) = mtn_s(i + 1,1:3,1:2) 
- intfc_draddt(i,1:lag_num_ts) = intfc_draddt(i + 1,1:lag_num_ts) - intfc_dveldt(i,1:lag_num_ts) = intfc_dveldt(i + 1,1:lag_num_ts) - gas_dpdt(i,1:lag_num_ts) = gas_dpdt(i + 1,1:lag_num_ts) - gas_dmvdt(i,1:lag_num_ts) = gas_dmvdt(i + 1,1:lag_num_ts) + do k = 1, n_el_bubs_loc + write (LAG_STATS_ID, FMT) proc_rank, lag_id(k, 1), mtn_pos(k, 1, 1), mtn_pos(k, 2, 1), mtn_pos(k, 3, 1), & + & Rmax_stats(k), Rmin_stats(k) end do - nBubs = nBubs - 1 - $:GPU_UPDATE(device='[nBubs]') - - end subroutine s_remove_lag_bubble + end subroutine s_write_lag_bubble_stats - !> Finalize the Lagrangian bubble solver + subroutine s_close_lag_bubble_stats + + close (LAG_STATS_ID) + + end subroutine s_close_lag_bubble_stats + + !> The purpose of this subroutine is to remove one specific particle if dt is too small. + !! @param bub_id Particle id + impure subroutine s_copy_lag_bubble(dest, src) + + integer, intent(in) :: src, dest + + bub_R0(dest) = bub_R0(src) + Rmax_stats(dest) = Rmax_stats(src) + Rmin_stats(dest) = Rmin_stats(src) + gas_mg(dest) = gas_mg(src) + gas_betaT(dest) = gas_betaT(src) + gas_betaC(dest) = gas_betaC(src) + bub_dphidt(dest) = bub_dphidt(src) + lag_id(dest, 1) = lag_id(src, 1) + gas_p(dest,1:2) = gas_p(src,1:2) + gas_mv(dest,1:2) = gas_mv(src,1:2) + intfc_rad(dest,1:2) = intfc_rad(src,1:2) + intfc_vel(dest,1:2) = intfc_vel(src,1:2) + mtn_vel(dest,1:3,1:2) = mtn_vel(src,1:3,1:2) + mtn_s(dest,1:3,1:2) = mtn_s(src,1:3,1:2) + mtn_pos(dest,1:3,1:2) = mtn_pos(src,1:3,1:2) + mtn_posPrev(dest,1:3,1:2) = mtn_posPrev(src,1:3,1:2) + intfc_draddt(dest,1:lag_num_ts) = intfc_draddt(src,1:lag_num_ts) + intfc_dveldt(dest,1:lag_num_ts) = intfc_dveldt(src,1:lag_num_ts) + gas_dpdt(dest,1:lag_num_ts) = gas_dpdt(src,1:lag_num_ts) + gas_dmvdt(dest,1:lag_num_ts) = gas_dmvdt(src,1:lag_num_ts) + mtn_dposdt(dest,1:3,1:lag_num_ts) = mtn_dposdt(src,1:3,1:lag_num_ts) + mtn_dveldt(dest,1:3,1:lag_num_ts) = mtn_dveldt(src,1:3,1:lag_num_ts) + + end subroutine s_copy_lag_bubble + + !> The purpose of 
this subroutine is to deallocate variables impure subroutine s_finalize_lagrangian_solver() integer :: i + if (lag_params%write_void_evol) call s_close_void_evol + if (lag_params%write_bubbles) call s_close_lag_bubble_evol() + if (lag_params%write_bubbles_stats) call s_close_lag_bubble_stats() + do i = 1, q_beta_idx @:DEALLOCATE(q_beta(i)%sf) + @:DEALLOCATE(kahan_comp(i)%sf) end do @:DEALLOCATE(q_beta) + @:DEALLOCATE(kahan_comp) ! Deallocating space @:DEALLOCATE(lag_id) @@ -1638,6 +2145,28 @@ contains @:DEALLOCATE(mtn_dposdt) @:DEALLOCATE(mtn_dveldt) + @:DEALLOCATE(keep_bubble) + @:DEALLOCATE(wrap_bubble_loc, wrap_bubble_dir) + + ! Deallocate pressure gradient arrays and FD coefficients + if (lag_params%vel_model > 0 .and. lag_params%pressure_force) then + @:DEALLOCATE(grad_p_x) + @:DEALLOCATE(fd_coeff_x_pgrad) + if (n > 0) then + @:DEALLOCATE(grad_p_y) + @:DEALLOCATE(fd_coeff_y_pgrad) + if (p > 0) then + @:DEALLOCATE(grad_p_z) + @:DEALLOCATE(fd_coeff_z_pgrad) + end if + end if + end if + + ! Deallocate cell list arrays + @:DEALLOCATE(cell_list_start) + @:DEALLOCATE(cell_list_count) + @:DEALLOCATE(cell_list_idx) + end subroutine s_finalize_lagrangian_solver end module m_bubbles_EL diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp index f1f80980b0..23bfc81b31 100644 --- a/src/simulation/m_bubbles_EL_kernels.fpp +++ b/src/simulation/m_bubbles_EL_kernels.fpp @@ -11,166 +11,272 @@ module m_bubbles_EL_kernels implicit none + ! Cell-centered pressure gradients (precomputed for translational motion) + real(wp), allocatable, dimension(:,:,:) :: grad_p_x, grad_p_y, grad_p_z + $:GPU_DECLARE(create='[grad_p_x, grad_p_y, grad_p_z]') + + ! 
Finite-difference coefficients for pressure gradient computation + real(wp), allocatable, dimension(:,:) :: fd_coeff_x_pgrad + real(wp), allocatable, dimension(:,:) :: fd_coeff_y_pgrad + real(wp), allocatable, dimension(:,:) :: fd_coeff_z_pgrad + $:GPU_DECLARE(create='[fd_coeff_x_pgrad, fd_coeff_y_pgrad, fd_coeff_z_pgrad]') + + ! Cell list for bubble-to-cell mapping (rebuilt each RK stage before smearing) + integer, allocatable, dimension(:,:,:) :: cell_list_start ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:,:,:) :: cell_list_count ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:) :: cell_list_idx ! (1:nBubs_glb) sorted bubble indices + $:GPU_DECLARE(create='[cell_list_start, cell_list_count, cell_list_idx]') + contains - !> Smear the Lagrangian bubble effects onto the Eulerian grid using the selected kernel - subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + !> The purpose of this subroutine is to smear the strength of the lagrangian bubbles into the Eulerian framework using different + !! approaches. + !! @param nBubs Number of lagrangian bubbles in the current domain + !! @param lbk_rad Radius of the bubbles + !! @param lbk_vel Interface velocity of the bubbles + !! @param lbk_s Computational coordinates of the bubbles + !! @param lbk_pos Spatial coordinates of the bubbles + !! 
@param updatedvar Eulerian variable to be updated + subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb,1:3,1:2), intent(in) :: lbk_s, lbk_pos real(wp), dimension(1:lag_params%nBubs_glb,1:2), intent(in) :: lbk_rad, lbk_vel type(scalar_field), dimension(:), intent(inout) :: updatedvar + type(scalar_field), dimension(:), intent(inout) :: kcomp smoothfunc:select case(lag_params%smooth_type) case (1) - call s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + call s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp) case (2) - call s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar) + call s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar, kcomp) end select smoothfunc end subroutine s_smoothfunction - !> Apply the delta kernel function to map bubble effects onto the containing cell - subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar) + !> Builds a sorted cell list mapping each interior cell (0:m,0:n,0:p) to its resident bubbles. Uses a counting-sort on the host + !! (O(nBubs + N_cells)). Must be called before s_gaussian each RK stage. + !! @param nBubs Number of lagrangian bubbles in the current domain + !! @param lbk_s Computational coordinates of the bubbles + subroutine s_build_cell_list(nBubs, lbk_s) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb,1:3,1:2), intent(in) :: lbk_s - real(wp), dimension(1:lag_params%nBubs_glb,1:2), intent(in) :: lbk_rad, lbk_vel - type(scalar_field), dimension(:), intent(inout) :: updatedvar - integer, dimension(3) :: cell - real(wp) :: strength_vel, strength_vol - real(wp) :: addFun1, addFun2, addFun3 - real(wp) :: volpart, Vol + integer :: l, ci, cj, ck, idx real(wp), dimension(3) :: s_coord - integer :: l - $:GPU_PARALLEL_LOOP(private='[l, s_coord, cell]') + ! Bring current bubble positions to host + + $:GPU_UPDATE(host='[lbk_s]') + + ! 
Pass 1: zero counts and count bubbles per cell + cell_list_count = 0 do l = 1, nBubs - volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp s_coord(1:3) = lbk_s(l,1:3,2) - call s_get_cell(s_coord, cell) - - strength_vol = volpart - strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) + ci = int(s_coord(1)) + cj = int(s_coord(2)) + ck = int(s_coord(3)) + ! Clamp to interior (bubbles should already be in [0:m,0:n,0:p]) + ci = max(0, min(ci, m)) + cj = max(0, min(cj, n)) + ck = max(0, min(ck, p)) + cell_list_count(ci, cj, ck) = cell_list_count(ci, cj, ck) + 1 + end do - if (num_dims == 2) then - Vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth - if (cyl_coord) Vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi - else - Vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) - end if + ! Prefix sum to compute start indices (1-based into cell_list_idx) + idx = 1 + do ck = 0, p + do cj = 0, n + do ci = 0, m + cell_list_start(ci, cj, ck) = idx + idx = idx + cell_list_count(ci, cj, ck) + end do + end do + end do - ! Update void fraction field - addFun1 = strength_vol/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar(1)%sf(cell(1), cell(2), cell(3)) = updatedvar(1)%sf(cell(1), cell(2), cell(3)) + real(addFun1, kind=stp) - - ! Update time derivative of void fraction - addFun2 = strength_vel/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar(2)%sf(cell(1), cell(2), cell(3)) = updatedvar(2)%sf(cell(1), cell(2), cell(3)) + real(addFun2, kind=stp) - - ! Product of two smeared functions Update void fraction * time derivative of void fraction - if (lag_params%cluster_type >= 4) then - addFun3 = (strength_vol*strength_vel)/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar(5)%sf(cell(1), cell(2), cell(3)) = updatedvar(5)%sf(cell(1), cell(2), cell(3)) + real(addFun3, kind=stp) - end if + ! 
Pass 2: place bubble indices into cell_list_idx Temporarily reuse cell_list_count as a running offset + cell_list_count = 0 + do l = 1, nBubs + s_coord(1:3) = lbk_s(l,1:3,2) + ci = int(s_coord(1)) + cj = int(s_coord(2)) + ck = int(s_coord(3)) + ci = max(0, min(ci, m)) + cj = max(0, min(cj, n)) + ck = max(0, min(ck, p)) + cell_list_idx(cell_list_start(ci, cj, ck) + cell_list_count(ci, cj, ck)) = l + cell_list_count(ci, cj, ck) = cell_list_count(ci, cj, ck) + 1 end do - $:END_GPU_PARALLEL_LOOP() - end subroutine s_deltafunc + ! Send cell list arrays to GPU + $:GPU_UPDATE(device='[cell_list_start, cell_list_count, cell_list_idx]') - !> Apply the Gaussian kernel function to smear bubble effects onto surrounding cells - subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + end subroutine s_build_cell_list + + !> Cell-centric delta-function smearing using the cell list (no GPU atomics). Each bubble only affects the cell it resides in. + !! The outer GPU loop iterates over interior cells and sums contributions from resident bubbles. + subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar, kcomp) integer, intent(in) :: nBubs - real(wp), dimension(1:lag_params%nBubs_glb,1:3,1:2), intent(in) :: lbk_s, lbk_pos + real(wp), dimension(1:lag_params%nBubs_glb,1:3,1:2), intent(in) :: lbk_s real(wp), dimension(1:lag_params%nBubs_glb,1:2), intent(in) :: lbk_rad, lbk_vel type(scalar_field), dimension(:), intent(inout) :: updatedvar - real(wp), dimension(3) :: center - integer, dimension(3) :: cell - real(wp) :: stddsv + type(scalar_field), dimension(:), intent(inout) :: kcomp real(wp) :: strength_vel, strength_vol - real(wp), dimension(3) :: nodecoord - real(wp) :: addFun1, addFun2, addFun3 - real(wp) :: func, func2, volpart - integer, dimension(3) :: cellaux - real(wp), dimension(3) :: s_coord - integer :: l, i, j, k - logical :: celloutside - integer :: smearGrid, smearGridz - - smearGrid = mapCells - (-mapCells) + 1 ! 
Include the cell that contains the bubble (3+1+3) - smearGridz = smearGrid - if (p == 0) smearGridz = 1 - - $:GPU_PARALLEL_LOOP(private='[nodecoord, l, s_coord, cell, center]', copyin='[smearGrid, smearGridz]') - do l = 1, nBubs - nodecoord(1:3) = 0 - center(1:3) = 0._wp - volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp - s_coord(1:3) = lbk_s(l,1:3,2) - center(1:2) = lbk_pos(l,1:2,2) - if (p > 0) center(3) = lbk_pos(l, 3, 2) - call s_get_cell(s_coord, cell) - call s_compute_stddsv(cell, volpart, stddsv) - - strength_vol = volpart - strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) - - $:GPU_LOOP(collapse=3,private='[cellaux, nodecoord]') - do i = 1, smearGrid - do j = 1, smearGrid - do k = 1, smearGridz - cellaux(1) = cell(1) + i - (mapCells + 1) - cellaux(2) = cell(2) + j - (mapCells + 1) - cellaux(3) = cell(3) + k - (mapCells + 1) - if (p == 0) cellaux(3) = 0 - - ! Check if the cells intended to smear the bubbles in are in the computational domain and redefine the cells - ! for symmetric boundary - call s_check_celloutside(cellaux, celloutside) - - if (.not. celloutside) then - nodecoord(1) = x_cc(cellaux(1)) - nodecoord(2) = y_cc(cellaux(2)) - if (p > 0) nodecoord(3) = z_cc(cellaux(3)) - call s_applygaussian(center, cellaux, nodecoord, stddsv, 0._wp, func) - if (lag_params%cluster_type >= 4) call s_applygaussian(center, cellaux, nodecoord, stddsv, 1._wp, func2) - - ! 
Relocate cells for bubbles intersecting symmetric boundaries - if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == BC_REFLECTIVE)) then - call s_shift_cell_symmetric_bc(cellaux, cell) - end if - else - func = 0._wp - func2 = 0._wp - cellaux(1) = cell(1) - cellaux(2) = cell(2) - cellaux(3) = cell(3) - if (p == 0) cellaux(3) = 0 + real(wp) :: volpart, Vol + real(wp) :: y_kahan, t_kahan + integer :: i, j, k, lb, bub_idx + + $:GPU_PARALLEL_LOOP(collapse=3, & + & private='[i, j, k, lb, bub_idx, volpart, Vol, strength_vel, strength_vol, y_kahan, t_kahan]') + do k = 0, p + do j = 0, n + do i = 0, m + ! Cell volume + if (num_dims == 2) then + Vol = dx(i)*dy(j)*lag_params%charwidth + if (cyl_coord) Vol = dx(i)*dy(j)*y_cc(j)*2._wp*pi + else + Vol = dx(i)*dy(j)*dz(k) + end if + + ! Loop over bubbles in this cell + $:GPU_LOOP(parallelism='[seq]') + do lb = cell_list_start(i, j, k), cell_list_start(i, j, k) + cell_list_count(i, j, k) - 1 + bub_idx = cell_list_idx(lb) + + volpart = 4._wp/3._wp*pi*lbk_rad(bub_idx, 2)**3._wp + strength_vol = volpart + strength_vel = 4._wp*pi*lbk_rad(bub_idx, 2)**2._wp*lbk_vel(bub_idx, 2) + + ! Kahan summation for void fraction + y_kahan = real(strength_vol/Vol, kind=wp) - kcomp(1)%sf(i, j, k) + t_kahan = updatedvar(1)%sf(i, j, k) + y_kahan + kcomp(1)%sf(i, j, k) = (t_kahan - updatedvar(1)%sf(i, j, k)) - y_kahan + updatedvar(1)%sf(i, j, k) = t_kahan + + ! Kahan summation for time derivative of void fraction + y_kahan = real(strength_vel/Vol, kind=wp) - kcomp(2)%sf(i, j, k) + t_kahan = updatedvar(2)%sf(i, j, k) + y_kahan + kcomp(2)%sf(i, j, k) = (t_kahan - updatedvar(2)%sf(i, j, k)) - y_kahan + updatedvar(2)%sf(i, j, k) = t_kahan + + ! 
Product of two smeared functions + if (lag_params%cluster_type >= 4) then + y_kahan = real((strength_vol*strength_vel)/Vol, kind=wp) - kcomp(5)%sf(i, j, k) + t_kahan = updatedvar(5)%sf(i, j, k) + y_kahan + kcomp(5)%sf(i, j, k) = (t_kahan - updatedvar(5)%sf(i, j, k)) - y_kahan + updatedvar(5)%sf(i, j, k) = t_kahan end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() - ! Update void fraction field - addFun1 = func*strength_vol - $:GPU_ATOMIC(atomic='update') - updatedvar(1)%sf(cellaux(1), cellaux(2), cellaux(3)) = updatedvar(1)%sf(cellaux(1), cellaux(2), & - & cellaux(3)) + real(addFun1, kind=stp) + end subroutine s_deltafunc - ! Update time derivative of void fraction - addFun2 = func*strength_vel - $:GPU_ATOMIC(atomic='update') - updatedvar(2)%sf(cellaux(1), cellaux(2), cellaux(3)) = updatedvar(2)%sf(cellaux(1), cellaux(2), & - & cellaux(3)) + real(addFun2, kind=stp) + !> Cell-centric gaussian smearing using the cell list (no GPU atomics). Each grid cell accumulates contributions from nearby + !! bubbles looked up via cell_list_start/count/idx. + subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp) - ! 
Product of two smeared functions Update void fraction * time derivative of void fraction - if (lag_params%cluster_type >= 4) then - addFun3 = func2*strength_vol*strength_vel - $:GPU_ATOMIC(atomic='update') - updatedvar(5)%sf(cellaux(1), cellaux(2), cellaux(3)) = updatedvar(5)%sf(cellaux(1), cellaux(2), & - & cellaux(3)) + real(addFun3, kind=stp) - end if + integer, intent(in) :: nBubs + real(wp), dimension(1:lag_params%nBubs_glb,1:3,1:2), intent(in) :: lbk_s, lbk_pos + real(wp), dimension(1:lag_params%nBubs_glb,1:2), intent(in) :: lbk_rad, lbk_vel + type(scalar_field), dimension(:), intent(inout) :: updatedvar + type(scalar_field), dimension(:), intent(inout) :: kcomp + real(wp), dimension(3) :: center, nodecoord, s_coord + integer, dimension(3) :: cell, cellijk + real(wp) :: stddsv, volpart + real(wp) :: strength_vel, strength_vol + real(wp) :: func, func2 + real(wp) :: y_kahan, t_kahan + integer :: i, j, k, di, dj, dk, lb, bub_idx + integer :: di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end + integer :: smear_x_beg, smear_x_end + integer :: smear_y_beg, smear_y_end + integer :: smear_z_beg, smear_z_end + + ! 
Extended grid range for smearing (includes buffer cells for MPI communication) + + smear_x_beg = -mapCells - 1 + smear_x_end = m + mapCells + 1 + smear_y_beg = merge(-mapCells - 1, 0, n > 0) + smear_y_end = merge(n + mapCells + 1, n, n > 0) + smear_z_beg = merge(-mapCells - 1, 0, p > 0) + smear_z_end = merge(p + mapCells + 1, p, p > 0) + + $:GPU_PARALLEL_LOOP(collapse=3, private='[i, j, k, di, dj, dk, lb, bub_idx, center, nodecoord, s_coord, cell, cellijk, & + & stddsv, volpart, strength_vel, strength_vol, func, func2, y_kahan, t_kahan, di_beg, di_end, dj_beg, & + & dj_end, dk_beg, dk_end]', copyin='[smear_x_beg, smear_x_end, smear_y_beg, smear_y_end, smear_z_beg, & + & smear_z_end]') + do k = smear_z_beg, smear_z_end + do j = smear_y_beg, smear_y_end + do i = smear_x_beg, smear_x_end + cellijk(1) = i + cellijk(2) = j + cellijk(3) = k + + nodecoord(1) = x_cc(i) + nodecoord(2) = y_cc(j) + nodecoord(3) = 0._wp + if (p > 0) nodecoord(3) = z_cc(k) + + ! Neighbor cell range clamped to interior [0:m, 0:n, 0:p] + di_beg = max(i - mapCells, 0) + di_end = min(i + mapCells, m) + dj_beg = max(j - mapCells, 0) + dj_end = min(j + mapCells, n) + dk_beg = max(k - mapCells, 0) + dk_end = min(k + mapCells, p) + + $:GPU_LOOP(parallelism='[seq]') + do dk = dk_beg, dk_end + $:GPU_LOOP(parallelism='[seq]') + do dj = dj_beg, dj_end + $:GPU_LOOP(parallelism='[seq]') + do di = di_beg, di_end + $:GPU_LOOP(parallelism='[seq]') + do lb = cell_list_start(di, dj, dk), cell_list_start(di, dj, dk) + cell_list_count(di, dj, dk) - 1 + bub_idx = cell_list_idx(lb) + + ! 
Bubble properties + volpart = 4._wp/3._wp*pi*lbk_rad(bub_idx, 2)**3._wp + s_coord(1:3) = lbk_s(bub_idx,1:3,2) + call s_get_cell(s_coord, cell) + call s_compute_stddsv(cell, volpart, stddsv) + + strength_vol = volpart + strength_vel = 4._wp*pi*lbk_rad(bub_idx, 2)**2._wp*lbk_vel(bub_idx, 2) + + center(1:2) = lbk_pos(bub_idx,1:2,2) + center(3) = 0._wp + if (p > 0) center(3) = lbk_pos(bub_idx, 3, 2) + + call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func) + + ! Kahan summation for void fraction + y_kahan = real(func*strength_vol, kind=wp) - kcomp(1)%sf(i, j, k) + t_kahan = updatedvar(1)%sf(i, j, k) + y_kahan + kcomp(1)%sf(i, j, k) = (t_kahan - updatedvar(1)%sf(i, j, k)) - y_kahan + updatedvar(1)%sf(i, j, k) = t_kahan + + ! Kahan summation for time derivative of void fraction + y_kahan = real(func*strength_vel, kind=wp) - kcomp(2)%sf(i, j, k) + t_kahan = updatedvar(2)%sf(i, j, k) + y_kahan + kcomp(2)%sf(i, j, k) = (t_kahan - updatedvar(2)%sf(i, j, k)) - y_kahan + updatedvar(2)%sf(i, j, k) = t_kahan + + if (lag_params%cluster_type >= 4) then + call s_applygaussian(center, cellijk, nodecoord, stddsv, 1._wp, func2) + y_kahan = real(func2*strength_vol*strength_vel, kind=wp) - kcomp(5)%sf(i, j, k) + t_kahan = updatedvar(5)%sf(i, j, k) + y_kahan + kcomp(5)%sf(i, j, k) = (t_kahan - updatedvar(5)%sf(i, j, k)) - y_kahan + updatedvar(5)%sf(i, j, k) = t_kahan + end if + end do + end do + end do end do end do end do @@ -179,7 +285,7 @@ contains end subroutine s_gaussian - !> Evaluate the Gaussian kernel at a grid node for a given bubble center + !> The purpose of this subroutine is to apply the gaussian kernel function for each bubble (Maeda and Colonius, 2018)). 
subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func) $:GPU_ROUTINE(function_name='s_applygaussian',parallelism='[seq]', cray_inline=True) @@ -190,8 +296,9 @@ contains real(wp), intent(in) :: stddsv real(wp), intent(in) :: strength_idx real(wp), intent(out) :: func + integer :: i real(wp) :: distance - real(wp) :: theta, dtheta, L2, dzp, Lz2 + real(wp) :: theta, dtheta, L2, dzp, Lz2, zc real(wp) :: Nr, Nr_count distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + (center(3) - nodecoord(3))**2._wp) @@ -222,28 +329,26 @@ contains & **(3._wp*(strength_idx + 1._wp)) end do else - !> 2D cartesian function: - ! We smear particles considering a virtual depth (lag_params%charwidth) - theta = 0._wp - Nr = ceiling(lag_params%charwidth/(y_cb(cellaux(2)) - y_cb(cellaux(2) - 1))) - Nr_count = 1._wp - mapCells*1._wp - dzp = y_cb(cellaux(2) + 1) - y_cb(cellaux(2)) - Lz2 = (center(3) - (dzp*(0.5_wp + Nr_count) - lag_params%charwidth/2._wp))**2._wp - distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + Lz2) - func = dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp - do while (Nr_count < Nr - 1._wp + ((mapCells - 1)*1._wp)) - Nr_count = Nr_count + 1._wp - Lz2 = (center(3) - (dzp*(0.5_wp + Nr_count) - lag_params%charwidth/2._wp))**2._wp + !> 2D cartesian function: Equation (48) from Madea and Colonius 2018 + ! We smear particles considering a virtual depth (lag_params%charwidth) with lag_params%charNz cells + dzp = (lag_params%charwidth/(lag_params%charNz + 1._wp)) + + func = 0._wp + do i = 0, lag_params%charNz + zc = (-lag_params%charwidth/2._wp + dzp*(0.5_wp + i)) ! 
Center of virtual cell i in z-direction + Lz2 = (center(3) - zc)**2._wp distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + Lz2) - func = func + dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv) & - & **(3._wp*(strength_idx + 1._wp)) + func = func + dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp end do end if end if end subroutine s_applygaussian - !> Check if the current cell is outside the computational domain including ghost cells + !> The purpose of this subroutine is to check if the current cell is outside the computational domain or not (including ghost + !! cells). + !! @param cellaux Tested cell to smear the bubble effect in. + !! @param celloutside If true, then cellaux is outside the computational domain. subroutine s_check_celloutside(cellaux, celloutside) $:GPU_ROUTINE(function_name='s_check_celloutside',parallelism='[seq]', cray_inline=True) @@ -275,7 +380,9 @@ contains end subroutine s_check_celloutside - !> Relocate cells that intersect a symmetric boundary + !> This subroutine relocates the current cell, if it intersects a symmetric boundary. + !! @param cell Cell of the current bubble + !! @param cellaux Cell to map the bubble effect in. subroutine s_shift_cell_symmetric_bc(cellaux, cell) $:GPU_ROUTINE(function_name='s_shift_cell_symmetric_bc', parallelism='[seq]', cray_inline=True) @@ -312,6 +419,9 @@ contains end subroutine s_shift_cell_symmetric_bc !> Calculates the standard deviation of the bubble being smeared in the Eulerian framework. + !! @param cell Cell where the bubble is located + !! @param volpart Volume of the bubble + !! @param stddsv Standard deviaton subroutine s_compute_stddsv(cell, volpart, stddsv) $:GPU_ROUTINE(function_name='s_compute_stddsv',parallelism='[seq]', cray_inline=True) @@ -338,7 +448,7 @@ contains end if !> Compute Standard deviaton - if (((volpart/charvol) > 0.5_wp*lag_params%valmaxvoid) .or. 
(lag_params%smooth_type == 1)) then + if ((volpart/charvol) > 0.5_wp*lag_params%valmaxvoid .or. (lag_params%smooth_type == 1)) then rad = (3._wp*volpart/(4._wp*pi))**(1._wp/3._wp) stddsv = 1._wp*lag_params%epsilonb*max(chardist, rad) else @@ -347,7 +457,11 @@ contains end subroutine s_compute_stddsv - !> Compute the characteristic cell volume + !> The purpose of this procedure is to calculate the characteristic cell volume + !! @param cellx x-direction cell index + !! @param celly y-direction cell index + !! @param cellz z-direction cell index + !! @param Charvol Characteristic volume subroutine s_get_char_vol(cellx, celly, cellz, Charvol) $:GPU_ROUTINE(function_name='s_get_char_vol',parallelism='[seq]', cray_inline=True) @@ -367,7 +481,9 @@ contains end subroutine s_get_char_vol - !> Convert bubble computational coordinates from real to integer cell indices + !> This subroutine transforms the computational coordinates of the bubble from real type into integer. + !! @param s_cell Computational coordinates of the bubble, real type + !! @param get_cell Computational coordinates of the bubble, integer type subroutine s_get_cell(s_cell, get_cell) $:GPU_ROUTINE(function_name='s_get_cell',parallelism='[seq]', cray_inline=True) @@ -383,4 +499,221 @@ contains end subroutine s_get_cell + !> Precomputes cell-centered pressure gradients (dp/dx, dp/dy, dp/dz) at all cell centers using finite-difference coefficients + !! of the specified order. This avoids scattered memory accesses to the pressure field when computing translational bubble + !! forces. + !! @param q_prim_vf Primitive variables (pressure is at index E_idx) + subroutine s_compute_pressure_gradients(q_prim_vf) + + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + integer :: i, j, k, r + + ! 
dp/dx at all cell centers + + $:GPU_PARALLEL_LOOP(private='[i, j, k, r]', collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + grad_p_x(i, j, k) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + grad_p_x(i, j, k) = grad_p_x(i, j, k) + q_prim_vf(E_idx)%sf(i + r, j, k)*fd_coeff_x_pgrad(r, i) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + ! dp/dy at all cell centers + if (n > 0) then + $:GPU_PARALLEL_LOOP(private='[i, j, k, r]', collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + grad_p_y(i, j, k) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + grad_p_y(i, j, k) = grad_p_y(i, j, k) + q_prim_vf(E_idx)%sf(i, j + r, k)*fd_coeff_y_pgrad(r, j) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + ! dp/dz at all cell centers + if (p > 0) then + $:GPU_PARALLEL_LOOP(private='[i, j, k, r]', collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + grad_p_z(i, j, k) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + grad_p_z(i, j, k) = grad_p_z(i, j, k) + q_prim_vf(E_idx)%sf(i, j, k + r)*fd_coeff_z_pgrad(r, k) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + end subroutine s_compute_pressure_gradients + + !! This function interpolates the velocity of Eulerian field at the position of the bubble. + !! @param pos Position of the bubble in directiion i + !! @param cell Computational coordinates of the bubble + !! @param i Direction of the velocity (1: x, 2: y, 3: z) + !! @param q_prim_vf Eulerian field with primitive variables + !! 
@return v Interpolated velocity at the position of the bubble + function f_interpolate_velocity(pos, cell, i, q_prim_vf) result(v) + + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: pos + integer, dimension(3), intent(in) :: cell + integer, intent(in) :: i + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + real(wp) :: v + real(wp), dimension(fd_order + 1) :: xi, eta, L + + if (fd_order == 2) then + if (i == 1) then + xi(1) = x_cc(cell(1) - 1) + eta(1) = q_prim_vf(momxb)%sf(cell(1) - 1, cell(2), cell(3)) + xi(2) = x_cc(cell(1)) + eta(2) = q_prim_vf(momxb)%sf(cell(1), cell(2), cell(3)) + xi(3) = x_cc(cell(1) + 1) + eta(3) = q_prim_vf(momxb)%sf(cell(1) + 1, cell(2), cell(3)) + else if (i == 2) then + xi(1) = y_cc(cell(2) - 1) + eta(1) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) - 1, cell(3)) + xi(2) = y_cc(cell(2)) + eta(2) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2), cell(3)) + xi(3) = y_cc(cell(2) + 1) + eta(3) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) + 1, cell(3)) + else if (i == 3) then + xi(1) = z_cc(cell(3) - 1) + eta(1) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) - 1) + xi(2) = z_cc(cell(3)) + eta(2) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3)) + xi(3) = z_cc(cell(3) + 1) + eta(3) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) + 1) + end if + + L(1) = ((pos - xi(2))*(pos - xi(3)))/((xi(1) - xi(2))*(xi(1) - xi(3))) + L(2) = ((pos - xi(1))*(pos - xi(3)))/((xi(2) - xi(1))*(xi(2) - xi(3))) + L(3) = ((pos - xi(1))*(pos - xi(2)))/((xi(3) - xi(1))*(xi(3) - xi(2))) + + v = L(1)*eta(1) + L(2)*eta(2) + L(3)*eta(3) + else if (fd_order == 4) then + if (i == 1) then + xi(1) = x_cc(cell(1) - 2) + eta(1) = q_prim_vf(momxb)%sf(cell(1) - 2, cell(2), cell(3)) + xi(2) = x_cc(cell(1) - 1) + eta(2) = q_prim_vf(momxb)%sf(cell(1) - 1, cell(2), cell(3)) + xi(3) = x_cc(cell(1)) + eta(3) = q_prim_vf(momxb)%sf(cell(1), cell(2), cell(3)) + xi(4) = x_cc(cell(1) + 1) + eta(4) = q_prim_vf(momxb)%sf(cell(1) + 1, cell(2), cell(3)) + xi(5) = x_cc(cell(1) 
+ 2)
+ eta(5) = q_prim_vf(momxb)%sf(cell(1) + 2, cell(2), cell(3))
+ else if (i == 2) then
+ xi(1) = y_cc(cell(2) - 2)
+ eta(1) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) - 2, cell(3))
+ xi(2) = y_cc(cell(2) - 1)
+ eta(2) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) - 1, cell(3))
+ xi(3) = y_cc(cell(2))
+ eta(3) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2), cell(3))
+ xi(4) = y_cc(cell(2) + 1)
+ eta(4) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) + 1, cell(3))
+ xi(5) = y_cc(cell(2) + 2)
+ eta(5) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) + 2, cell(3))
+ else if (i == 3) then
+ xi(1) = z_cc(cell(3) - 2)
+ eta(1) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) - 2)
+ xi(2) = z_cc(cell(3) - 1)
+ eta(2) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) - 1)
+ xi(3) = z_cc(cell(3))
+ eta(3) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3))
+ xi(4) = z_cc(cell(3) + 1)
+ eta(4) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) + 1)
+ xi(5) = z_cc(cell(3) + 2)
+ eta(5) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) + 2)
+ end if
+
+ L(1) = ((pos - xi(2))*(pos - xi(3))*(pos - xi(4))*(pos - xi(5)))/((xi(1) - xi(2))*(xi(1) - xi(3))*(xi(1) - xi(4)) &
+ & *(xi(1) - xi(5)))
+ L(2) = ((pos - xi(1))*(pos - xi(3))*(pos - xi(4))*(pos - xi(5)))/((xi(2) - xi(1))*(xi(2) - xi(3))*(xi(2) - xi(4)) &
+ & *(xi(2) - xi(5)))
+ L(3) = ((pos - xi(1))*(pos - xi(2))*(pos - xi(4))*(pos - xi(5)))/((xi(3) - xi(1))*(xi(3) - xi(2))*(xi(3) - xi(4)) &
+ & *(xi(3) - xi(5)))
+ L(4) = ((pos - xi(1))*(pos - xi(2))*(pos - xi(3))*(pos - xi(5)))/((xi(4) - xi(1))*(xi(4) - xi(2))*(xi(4) - xi(3)) &
+ & *(xi(4) - xi(5)))
+ L(5) = ((pos - xi(1))*(pos - xi(2))*(pos - xi(3))*(pos - xi(4)))/((xi(5) - xi(1))*(xi(5) - xi(2))*(xi(5) - xi(3)) &
+ & *(xi(5) - xi(4)))
+
+ v = L(1)*eta(1) + L(2)*eta(2) + L(3)*eta(3) + L(4)*eta(4) + L(5)*eta(5)
+ end if
+
+ end function f_interpolate_velocity
+
+ !! This function calculates the force on a bubble based on the pressure gradient, velocity, and drag model.
+ !! 
@param pos Position of the bubble in direction i + !! @param rad Radius of the bubble + !! @param rdot Radial velocity of the bubble + !! @param vel Velocity of the bubble + !! @param mg Mass of the gas in the bubble + !! @param mv Mass of the liquid in the bubble + !! @param Re Reynolds number + !! @param rho Density of the fluid + !! @param cell Computational coordinates of the bubble + !! @param i Direction of the velocity (1: x, 2: y, 3: z) + !! @param q_prim_vf Eulerian field with primitive variables + !! @return a Acceleration of the bubble in direction i + function f_get_bubble_force(pos, rad, rdot, vel, mg, mv, Re, rho, cell, i, q_prim_vf) result(force) + + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: pos, rad, rdot, mg, mv, Re, rho, vel + integer, dimension(3), intent(in) :: cell + integer, intent(in) :: i + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + real(wp) :: dp, vol, force + real(wp) :: v_rel + + if (fd_order > 1) then + v_rel = vel - f_interpolate_velocity(pos, cell, i, q_prim_vf) + else + v_rel = vel - q_prim_vf(momxb + i - 1)%sf(cell(1), cell(2), cell(3)) + end if + + force = 0._wp + + if (lag_params%drag_model == 1) then ! Free slip Stokes drag + force = force - (4._wp*pi*rad*v_rel)/Re + else if (lag_params%drag_model == 2) then ! No slip Stokes drag + force = force - (6._wp*pi*rad*v_rel)/Re + else if (lag_params%drag_model == 3) then ! Levich drag + force = force - (12._wp*pi*rad*v_rel)/Re + end if + + if (lag_pressure_force) then + ! 
Use precomputed cell-centered pressure gradients + if (i == 1) then + dp = grad_p_x(cell(1), cell(2), cell(3)) + else if (i == 2) then + dp = grad_p_y(cell(1), cell(2), cell(3)) + else if (i == 3) then + dp = grad_p_z(cell(1), cell(2), cell(3)) + end if + + vol = (4._wp/3._wp)*pi*(rad**3._wp) + force = force - vol*dp + end if + + if (lag_params%gravity_force) then + force = force + (mg + mv)*accel_bf(i) + end if + + end function f_get_bubble_force + end module m_bubbles_EL_kernels diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp index da1dcd9109..88153df8ee 100644 --- a/src/simulation/m_compute_levelset.fpp +++ b/src/simulation/m_compute_levelset.fpp @@ -20,7 +20,7 @@ module m_compute_levelset contains - !> Dispatch level-set distance and normal computations for all ghost points based on patch geometry type + !> @brief Dispatches level-set distance and normal computations for all ghost points based on their patch geometry type. impure subroutine s_apply_levelset(gps, num_gps) type(ghost_point), dimension(:), intent(inout) :: gps @@ -73,24 +73,27 @@ contains end subroutine s_apply_levelset - !> Compute the signed distance and outward normal from a ghost point to a circular immersed boundary + !> @brief Computes the signed distance and outward normal from a ghost point to a circular immersed boundary. 
subroutine s_circle_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') type(ghost_point), intent(inout) :: gp real(wp) :: radius, dist - real(wp), dimension(2) :: center + real(wp), dimension(2) :: center !< x and y coordinates in local IB frame real(wp), dimension(3) :: dist_vec integer :: i, j, ib_patch_id !< Loop index variables + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) radius = patch_ib(ib_patch_id)%radius - dist_vec(1) = x_cc(i) - patch_ib(ib_patch_id)%x_centroid - real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - dist_vec(2) = y_cc(j) - patch_ib(ib_patch_id)%y_centroid - real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + dist_vec(1) = x_cc(i) - patch_ib(ib_patch_id)%x_centroid - real(gp%x_periodicity, & + & wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + dist_vec(2) = y_cc(j) - patch_ib(ib_patch_id)%y_centroid - real(gp%y_periodicity, & + & wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) dist_vec(3) = 0._wp dist = sqrt(sum(dist_vec**2)) @@ -103,7 +106,7 @@ contains end subroutine s_circle_levelset - !> Compute the signed distance and outward normal from a ghost point to a 2D NACA airfoil surface + !> @brief Computes the signed distance and outward normal from a ghost point to a 2D NACA airfoil surface. 
subroutine s_airfoil_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -116,18 +119,19 @@ contains real(wp), dimension(1:2) :: center real(wp), dimension(1:3,1:3) :: rotation, inverse_rotation integer :: i, j, k, ib_patch_id !< Loop index variables + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:,:) rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix(:,:) offset(:) = patch_ib(ib_patch_id)%centroid_offset(:) - xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB - xy_local = matmul(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinate + xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] + xy_local = matmul(inverse_rotation, xy_local) xy_local = xy_local - offset ! airfoils are a patch that require a centroid offset if (xy_local(2) >= 0._wp) then @@ -182,7 +186,8 @@ contains end subroutine s_airfoil_levelset - !> Compute the signed distance and outward normal from a ghost point to a 3D extruded airfoil surface + !> @brief Computes the signed distance and outward normal from a ghost point to a 3D extruded airfoil surface including spanwise + !! end caps. 
subroutine s_3d_airfoil_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -196,14 +201,15 @@ contains real(wp), dimension(1:3,1:3) :: rotation, inverse_rotation real(wp) :: length_z integer :: i, j, k, l, ib_patch_id !< Loop index variables + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) l = gp%loc(3) - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) lz = patch_ib(ib_patch_id)%length_z inverse_rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:,:) rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix(:,:) @@ -213,7 +219,7 @@ contains z_min = -lz/2 xyz_local = [x_cc(i), y_cc(j), z_cc(l)] - center - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates + xyz_local = matmul(inverse_rotation, xyz_local) xyz_local = xyz_local - offset ! 
airfoils are a patch that require a centroid offset if (xyz_local(2) >= 0._wp) then @@ -296,14 +302,15 @@ contains integer :: i, j, k !< Loop index variables integer :: idx !< Shortest path direction indicator integer :: ib_patch_id !< patch ID + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) length_x = patch_ib(ib_patch_id)%length_x length_y = patch_ib(ib_patch_id)%length_y - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:,:) rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix(:,:) @@ -348,7 +355,8 @@ contains end subroutine s_rectangle_levelset - !> Compute the signed distance and outward normal from a ghost point to an elliptical immersed boundary + !> @brief Computes the signed distance and outward normal from a ghost point to an elliptical immersed boundary via a quadratic + !! projection. 
subroutine s_ellipse_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -363,14 +371,15 @@ contains integer :: i, j, k !< Loop index variables integer :: idx !< Shortest path direction indicator integer :: ib_patch_id !< patch ID + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) length_x = patch_ib(ib_patch_id)%length_x length_y = patch_ib(ib_patch_id)%length_y - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:,:) rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix(:,:) @@ -398,7 +407,7 @@ contains end subroutine s_ellipse_levelset - !> Compute the signed distance and outward normal from a ghost point to a cuboid immersed boundary + !> @brief Computes the signed distance and outward normal from a ghost point to the nearest face of a cuboid immersed boundary. 
subroutine s_cuboid_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -413,6 +422,7 @@ contains real(wp), dimension(1:3,1:3) :: rotation, inverse_rotation integer :: i, j, k !< Loop index variables integer :: ib_patch_id !< patch ID + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) @@ -422,9 +432,9 @@ contains length_y = patch_ib(ib_patch_id)%length_y length_z = patch_ib(ib_patch_id)%length_z - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) inverse_rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:,:) rotation(:,:) = patch_ib(ib_patch_id)%rotation_matrix(:,:) @@ -437,7 +447,7 @@ contains Back = -length_z/2 xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinate + xyz_local = matmul(inverse_rotation, xyz_local) dist_left = Left - xyz_local(1) dist_right = xyz_local(1) - Right @@ -485,7 +495,7 @@ contains end subroutine s_cuboid_levelset - !> Compute the signed distance and outward normal from a ghost point to a spherical immersed boundary + !> @brief Computes the signed distance and outward normal from a ghost point to a spherical immersed boundary. 
subroutine s_sphere_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -494,15 +504,16 @@ contains real(wp) :: radius, dist real(wp), dimension(3) :: dist_vec, center, periodicity integer :: i, j, k, ib_patch_id !< Loop index variables + ib_patch_id = gp%ib_patch_id i = gp%loc(1) j = gp%loc(2) k = gp%loc(3) radius = patch_ib(ib_patch_id)%radius - periodicity(1) = real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - periodicity(2) = real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - periodicity(3) = real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + periodicity(1) = real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + periodicity(2) = real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + periodicity(3) = real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) center(1) = patch_ib(ib_patch_id)%x_centroid center(2) = patch_ib(ib_patch_id)%y_centroid center(3) = patch_ib(ib_patch_id)%z_centroid @@ -521,7 +532,8 @@ contains end subroutine s_sphere_levelset - !> Compute the signed distance and outward normal from a ghost point to a cylindrical immersed boundary + !> @brief Computes the signed distance and outward normal from a ghost point to a cylindrical immersed boundary surface and end + !! caps. 
subroutine s_cylinder_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -542,9 +554,9 @@ contains k = gp%loc(3) radius = patch_ib(ib_patch_id)%radius - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) length(1) = patch_ib(ib_patch_id)%length_x length(2) = patch_ib(ib_patch_id)%length_y length(3) = patch_ib(ib_patch_id)%length_z @@ -570,7 +582,7 @@ contains end if xyz_local = [x_cc(i), y_cc(j), z_cc(k)] - center ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates + xyz_local = matmul(inverse_rotation, xyz_local) ! get distance to flat edge of cylinder side_pos = dot_product(xyz_local, dist_sides_vec) @@ -596,6 +608,7 @@ contains end subroutine s_cylinder_levelset !> The STL patch is a 2/3D geometry that is imported from an STL file. + !! @param gp Ghost point to compute levelset for subroutine s_model_levelset(gp) $:GPU_ROUTINE(parallelism='[seq]') @@ -618,12 +631,12 @@ contains center = 0._wp if (.not. f_is_default(patch_ib(patch_id)%x_centroid)) center(1) = patch_ib(patch_id)%x_centroid + real(gp%x_periodicity, & - & wp)*(x_domain%end - x_domain%beg) + & wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) if (.not. 
f_is_default(patch_ib(patch_id)%y_centroid)) center(2) = patch_ib(patch_id)%y_centroid + real(gp%y_periodicity, & - & wp)*(y_domain%end - y_domain%beg) + & wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) if (p > 0) then if (.not. f_is_default(patch_ib(patch_id)%z_centroid)) center(3) = patch_ib(patch_id)%z_centroid & - & + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + & + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) end if inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 086a218289..5161b244dc 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -30,19 +30,8 @@ module m_data_output & s_finalize_data_output_module integer :: ib_state_unit = -1 !< I/O unit for IB state binary file - real(wp), allocatable, dimension(:,:,:) :: icfl_sf !< ICFL stability criterion - real(wp), allocatable, dimension(:,:,:) :: vcfl_sf !< VCFL stability criterion - real(wp), allocatable, dimension(:,:,:) :: ccfl_sf !< CCFL stability criterion - real(wp), allocatable, dimension(:,:,:) :: Rc_sf !< Rc stability criterion real(wp), public, allocatable, dimension(:,:) :: c_mass - $:GPU_DECLARE(create='[icfl_sf, vcfl_sf, ccfl_sf, Rc_sf, c_mass]') - - real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids - real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids - real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids - real(wp) :: Rc_min_loc, Rc_min_glb !< Rc stability extrema on local and global grids - $:GPU_DECLARE(create='[icfl_max_loc, icfl_max_glb, vcfl_max_loc, vcfl_max_glb]') - $:GPU_DECLARE(create='[ccfl_max_loc, ccfl_max_glb, Rc_min_loc, Rc_min_glb]') + $:GPU_DECLARE(create='[c_mass]') !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps !> @{ @@ -57,6 +46,12 @@ module 
m_data_output contains !> Write data files. Dispatch subroutine that replaces procedure pointer. + !! @param q_cons_vf Conservative variables + !! @param q_T_sf Temperature scalar field + !! @param q_prim_vf Primitive variables + !! @param t_step Current time step + !! @param bc_type Boundary condition type + !! @param beta Eulerian void fraction from lagrangian bubbles impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -74,13 +69,17 @@ contains end subroutine s_write_data_files - !> Open the run-time information file and write the stability criteria table header + !> The purpose of this subroutine is to open a new or pre- existing run-time information file and append to it the basic header + !! information relevant to current simulation. In general, this requires generating a table header for those stability criteria + !! which will be written at every time-step. impure subroutine s_open_run_time_information_file character(LEN=name_len), parameter :: file_name = 'run_time.inf' !< Name of the run-time information file character(LEN=path_len + name_len) :: file_path !< Relative path to a file in the case directory character(LEN=8) :: file_date !< Creation date of the run-time information file + ! Opening the run-time information file + file_path = trim(case_dir) // '/' // trim(file_name) open (3, FILE=trim(file_path), form='formatted', STATUS='replace') @@ -98,25 +97,32 @@ contains write (3, '(A)') ''; write (3, '(A)') '' + ! Generating table header for the stability criteria to be outputted write (3, '(13X,A9,13X,A10,13X,A10,13X,A10)', advance="no") trim('Time-step'), trim('dt'), trim('Time'), trim('ICFL Max') if (viscous) then write (3, '(13X,A10,13X,A16)', advance="no") trim('VCFL Max'), trim('Rc Min') end if + if (bubbles_lagrange) then + write (3, '(13X,A10)', advance="no") trim('N Bubbles') + end if + write (3, *) ! 
new line end subroutine s_open_run_time_information_file - !> Open center-of-mass data files for writing + !> This opens a formatted data file where the root processor can write out the CoM information impure subroutine s_open_com_files() character(len=path_len + 3*name_len) :: file_path !< Relative path to the CoM file in the case directory integer :: i !< Generic loop iterator do i = 1, num_fluids + ! Generating the relative path to the CoM data file write (file_path, '(A,I0,A)') '/fluid', i, '_com.dat' file_path = trim(case_dir) // trim(file_path) + ! Creating the formatted data file and setting up its structure open (i + 120, file=trim(file_path), form='formatted', position='append', status='unknown') if (n == 0) then write (i + 120, '(A)') ' Non-Dimensional Time ' // ' Total Mass ' // ' x-loc ' // ' Total Volume ' @@ -133,17 +139,19 @@ contains end subroutine s_open_com_files - !> Open flow probe data files for writing + !> This opens a formatted data file where the root processor can write out flow probe information impure subroutine s_open_probe_files - character(LEN=path_len + 3*name_len) :: file_path !< Relative path to the probe data file in the case directory + character(LEN=path_len + 3*name_len) :: file_path !< Relative path to the grid and conservative variables data files integer :: i !< Generic loop iterator logical :: file_exist do i = 1, num_probes + ! Generating the relative path to the data file write (file_path, '(A,I0,A)') '/D/probe', i, '_prim.dat' file_path = trim(case_dir) // trim(file_path) + ! 
Creating the formatted data file and setting up its structure inquire (file=trim(file_path), exist=file_exist) if (file_exist) then @@ -164,7 +172,6 @@ contains end subroutine s_open_probe_files - !> Open the immersed boundary state file for binary output impure subroutine s_open_ib_state_file character(len=path_len + 2*name_len) :: file_loc @@ -177,7 +184,11 @@ contains end subroutine s_open_ib_state_file - !> Write stability criteria extrema to the run-time information file at the given time step + !> The goal of the procedure is to output to the run-time information file the stability criteria extrema in the entire + !! computational domain and at the given time-step. Moreover, the subroutine is also in charge of tracking these stability + !! criteria extrema over all time-steps. + !! @param q_prim_vf Cell-average primitive variables + !! @param t_step Current time step impure subroutine s_write_run_time_information(q_prim_vf, t_step) type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf @@ -191,19 +202,27 @@ contains real(wp), dimension(num_fluids) :: alpha !< Cell-avg. volume fraction real(wp), dimension(num_vels) :: vel !< Cell-avg. velocity #:endif - real(wp) :: vel_sum !< Cell-avg. velocity sum - real(wp) :: pres !< Cell-avg. pressure - real(wp) :: gamma !< Cell-avg. sp. heat ratio - real(wp) :: pi_inf !< Cell-avg. liquid stiffness function - real(wp) :: qv !< Cell-avg. internal energy reference value - real(wp) :: c !< Cell-avg. sound speed - real(wp) :: H !< Cell-avg. enthalpy - real(wp), dimension(2) :: Re !< Cell-avg. Reynolds numbers + real(wp) :: vel_sum !< Cell-avg. velocity sum + real(wp) :: pres !< Cell-avg. pressure + real(wp) :: gamma !< Cell-avg. sp. heat ratio + real(wp) :: pi_inf !< Cell-avg. liquid stiffness function + real(wp) :: qv !< Cell-avg. internal energy reference value + real(wp) :: c !< Cell-avg. sound speed + real(wp) :: H !< Cell-avg. enthalpy + real(wp), dimension(2) :: Re !< Cell-avg. 
Reynolds numbers integer :: j, k, l - + real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids + real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids + real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids + real(wp) :: Rc_min_loc, Rc_min_glb !< Rc stability extrema on local and global grids + real(wp) :: icfl, vcfl, Rc + + icfl_max_loc = 0._wp + vcfl_max_loc = 0._wp + Rc_min_loc = huge(1.0_wp) ! Computing Stability Criteria at Current Time-step - - $:GPU_PARALLEL_LOOP(collapse=3, private='[j, k, l, vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]') + $:GPU_PARALLEL_LOOP(collapse=3, private='[j, k, l, vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv, icfl, & + & vcfl, Rc]', reduction='[[icfl_max_loc, vcfl_max_loc], [Rc_min_loc]]', reductionOp='[max, min]') do l = 0, p do k = 0, n do j = 0, m @@ -212,49 +231,32 @@ contains call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv) if (viscous) then - call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf, vcfl_sf, Rc_sf) + call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl, vcfl, Rc) else - call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf) + call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl) end if + + icfl_max_loc = max(icfl_max_loc, icfl) + vcfl_max_loc = max(vcfl_max_loc, merge(vcfl, 0.0_wp, viscous)) + Rc_min_loc = min(Rc_min_loc, merge(Rc, huge(1.0_wp), viscous)) end do end do end do $:END_GPU_PARALLEL_LOOP() + ! 
end: Computing Stability Criteria at Current Time-step -#ifdef _CRAYFTN - $:GPU_UPDATE(host='[icfl_sf]') - - if (viscous) then - $:GPU_UPDATE(host='[vcfl_sf, Rc_sf]') - end if - - icfl_max_loc = maxval(icfl_sf) - - if (viscous) then - vcfl_max_loc = maxval(vcfl_sf) - Rc_min_loc = minval(Rc_sf) - end if -#else - #:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]') - icfl_max_loc = maxval(icfl_sf) - #:endcall GPU_PARALLEL - if (viscous .or. dummy) then - #:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]') - vcfl_max_loc = maxval(vcfl_sf) - Rc_min_loc = minval(Rc_sf) - #:endcall GPU_PARALLEL - end if -#endif - + ! Determining global stability criteria extrema at current time-step if (num_procs > 1) then - call s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, vcfl_max_loc, Rc_min_loc, icfl_max_glb, vcfl_max_glb, & - & Rc_min_glb) + call s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, vcfl_max_loc, Rc_min_loc, n_el_bubs_loc, icfl_max_glb, & + & vcfl_max_glb, Rc_min_glb, n_el_bubs_glb) else icfl_max_glb = icfl_max_loc if (viscous) vcfl_max_glb = vcfl_max_loc if (viscous) Rc_min_glb = Rc_min_loc + if (bubbles_lagrange) n_el_bubs_glb = n_el_bubs_loc end if + ! Determining the stability criteria extrema over all the time-steps if (icfl_max_glb > icfl_max) icfl_max = icfl_max_glb if (viscous) then @@ -262,6 +264,7 @@ contains if (Rc_min_glb < Rc_min) Rc_min = Rc_min_glb end if + ! Outputting global stability criteria extrema at current time-step if (proc_rank == 0) then write (3, '(13X,I9,13X,F10.6,13X,F10.6,13X,F10.6)', advance="no") t_step, dt, mytime, icfl_max_glb @@ -269,6 +272,10 @@ contains write (3, '(13X,F10.6,13X,ES16.6)', advance="no") vcfl_max_glb, Rc_min_glb end if + if (bubbles_lagrange) then + write (3, '(13X,I10)', advance="no") n_el_bubs_glb + end if + write (3, *) ! new line if (.not. 
f_approx_equal(icfl_max_glb, icfl_max_glb)) then @@ -286,13 +293,25 @@ contains call s_mpi_abort('VCFL is greater than 1.0. Exiting.') end if end if + + if (bubbles_lagrange) then + if (n_el_bubs_glb == 0) then + call s_mpi_abort('No Lagrangian bubbles remain in the domain. Exiting.') + end if + end if end if call s_mpi_barrier() end subroutine s_write_run_time_information - !> Write grid and conservative variable data files in serial format + !> The goal of this subroutine is to output the grid and conservative variables data files for given time-step. + !! @param q_cons_vf Cell-average conservative variables + !! @param q_T_sf Temperature scalar field + !! @param q_prim_vf Cell-average primitive variables + !! @param t_step Current time-step + !! @param bc_type Boundary condition type + !! @param beta Eulerian void fraction from lagrangian bubbles impure subroutine s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -301,14 +320,18 @@ contains integer, intent(in) :: t_step type(scalar_field), intent(inout), optional :: beta type(integer_field), dimension(1:num_dims,-1:1), intent(in) :: bc_type - character(LEN=path_len + 2*name_len) :: t_step_dir !< Relative path to the current time-step directory - character(LEN=path_len + 3*name_len) :: file_path !< Relative path to the grid and conservative variables data files - logical :: file_exist !< Logical used to check existence of current time-step directory + character(LEN=path_len + 2*name_len) :: t_step_dir + character(LEN=path_len + 3*name_len) :: file_path !< Relative path to the grid and conservative variables data files + logical :: file_exist !< Logical used to check existence of current time-step directory character(LEN=15) :: FMT integer :: i, j, k, l, r - real(wp) :: gamma, lit_gamma, pi_inf, qv !< Temporary EOS params + real(wp) :: gamma, lit_gamma, pi_inf, qv !< Temporary EOS params + + ! 
Creating or overwriting the time-step root directory write (t_step_dir, '(A,I0,A,I0)') trim(case_dir) // '/p_all' + + ! Creating or overwriting the current time-step directory write (t_step_dir, '(a,i0,a,i0)') trim(case_dir) // '/p_all/p', proc_rank, '/', t_step file_path = trim(t_step_dir) // '/.' @@ -316,11 +339,13 @@ contains if (file_exist) call s_delete_directory(trim(t_step_dir)) call s_create_directory(trim(t_step_dir)) + ! Writing the grid data file in the x-direction file_path = trim(t_step_dir) // '/x_cb.dat' open (2, FILE=trim(file_path), form='unformatted', STATUS='new') write (2) x_cb(-1:m); close (2) + ! Writing the grid data files in the y- and z-directions if (n > 0) then file_path = trim(t_step_dir) // '/y_cb.dat' @@ -335,6 +360,7 @@ contains end if end if + ! Writing the conservative variables data files do i = 1, sys_size write (file_path, '(A,I0,A)') trim(t_step_dir) // '/q_cons_vf', i, '.dat' @@ -378,6 +404,11 @@ contains ! Writing the IB markers if (ib) then call s_write_serial_ib_data(t_step) + ! write (file_path, '(A,I0,A)') trim(t_step_dir)//'/ib.dat' + + ! open (2, FILE=trim(file_path), & FORM='unformatted', & STATUS='new') + + ! write (2) ib_markers%sf(0:m, 0:n, 0:p); close (2) end if gamma = gammas(1) @@ -391,6 +422,7 @@ contains FMT = "(2F40.14)" end if + ! writing an output directory write (t_step_dir, '(A,I0,A,I0)') trim(case_dir) // '/D' file_path = trim(t_step_dir) // '/.' @@ -409,9 +441,10 @@ contains end if end if + ! 1D if (n == 0 .and. p == 0) then if (model_eqns == 2 .and. (.not. igr)) then - do i = 1, sys_size + do i = 1, sys_size ! TODO: check if sys_size is correct write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir) // '/prim.', i, '.', proc_rank, '.', t_step, '.dat' open (2, FILE=trim(file_path)) @@ -471,6 +504,7 @@ contains FMT = "(3F40.14)" end if + ! 2D if ((n > 0) .and. 
(p == 0)) then do i = 1, sys_size write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir) // '/cons.', i, '.', proc_rank, '.', t_step, '.dat' @@ -528,7 +562,7 @@ contains end if if (prim_vars_wrt .and. (.not. igr)) then - do i = 1, sys_size + do i = 1, sys_size ! TODO: check if sys_size is correct write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir) // '/prim.', i, '.', proc_rank, '.', t_step, '.dat' open (2, FILE=trim(file_path)) @@ -555,6 +589,7 @@ contains FMT = "(4F40.14)" end if + ! 3D if (p > 0) then do i = 1, sys_size write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir) // '/cons.', i, '.', proc_rank, '.', t_step, '.dat' @@ -622,7 +657,7 @@ contains end if if (prim_vars_wrt .and. (.not. igr)) then - do i = 1, sys_size + do i = 1, sys_size ! TODO: check if sys_size is correct write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir) // '/prim.', i, '.', proc_rank, '.', t_step, '.dat' open (2, FILE=trim(file_path)) @@ -648,7 +683,11 @@ contains end subroutine s_write_serial_data_files - !> Write grid and conservative variable data files in parallel via MPI I/O + !> The goal of this subroutine is to output the grid and conservative variables data files for given time-step. + !! @param q_cons_vf Cell-average conservative variables + !! @param t_step Current time-step + !! @param bc_type Boundary condition type + !! @param beta Eulerian void fraction from lagrangian bubbles impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -669,6 +708,7 @@ contains character(len=10) :: t_step_string integer :: i !< Generic loop iterator integer :: alt_sys !< Altered system size for the lagrangian subgrid bubble model + ! Down sampling variables integer :: m_ds, n_ds, p_ds integer :: m_glb_ds, n_glb_ds, p_glb_ds @@ -687,6 +727,7 @@ contains if (file_per_process) then call s_int_to_str(t_step, t_step_string) + ! 
Initialize MPI data I/O if (down_sample) then call s_initialize_mpi_data_ds(q_cons_temp_ds) else @@ -708,8 +749,10 @@ contains call s_mpi_barrier() call DelayFileAccess(proc_rank) + ! Initialize MPI data I/O call s_initialize_mpi_data(q_cons_vf) + ! Open the file to write all flow variables write (file_loc, '(I0,A,i7.7,A)') t_step, '_', proc_rank, '.dat' file_loc = trim(case_dir) // '/restart_data/lustre_' // trim(t_step_string) // trim(mpiiofs) // trim(file_loc) inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -719,17 +762,20 @@ contains call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) if (down_sample) then + ! Size of local arrays data_size = (m_ds + 3)*(n_ds + 3)*(p_ds + 3) m_glb_save = m_glb_ds + 1 n_glb_save = n_glb_ds + 1 p_glb_save = p_glb_ds + 1 else + ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) m_glb_save = m_glb + 1 n_glb_save = n_glb + 1 p_glb_save = p_glb + 1 end if + ! Resize some integers so MPI can write even the biggest files m_MOK = int(m_glb_save + 1, MPI_OFFSET_KIND) n_MOK = int(n_glb_save + 1, MPI_OFFSET_KIND) p_MOK = int(p_glb_save + 1, MPI_OFFSET_KIND) @@ -739,11 +785,13 @@ contains NVARS_MOK = int(sys_size, MPI_OFFSET_KIND) if (bubbles_euler) then - do i = 1, sys_size + ! Write the data for each variable + do i = 1, sys_size ! TODO: check if sys_size is correct var_MOK = int(i, MPI_OFFSET_KIND) call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, mpi_io_p, status, ierr) end do + ! Write pb and mv for non-polytropic qbmm if (qbmm .and. .not. polytropic) then do i = sys_size + 1, sys_size + 2*nb*nnode var_MOK = int(i, MPI_OFFSET_KIND) @@ -769,6 +817,8 @@ contains call MPI_FILE_CLOSE(ifile, ierr) else + ! 
Initialize MPI data I/O + if (ib) then call s_initialize_mpi_data(q_cons_vf, ib_markers) else if (present(beta)) then @@ -785,8 +835,10 @@ contains end if call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) + ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) + ! Resize some integers so MPI can write even the biggest files m_MOK = int(m_glb + 1, MPI_OFFSET_KIND) n_MOK = int(n_glb + 1, MPI_OFFSET_KIND) p_MOK = int(p_glb + 1, MPI_OFFSET_KIND) @@ -796,18 +848,22 @@ contains NVARS_MOK = int(alt_sys, MPI_OFFSET_KIND) if (bubbles_euler) then - do i = 1, sys_size + ! Write the data for each variable + do i = 1, sys_size ! TODO: check if sys_size is correct var_MOK = int(i, MPI_OFFSET_KIND) + ! Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), 'native', mpi_info_int, ierr) call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, mpi_io_p, status, ierr) end do + ! Write pb and mv for non-polytropic qbmm if (qbmm .and. .not. polytropic) then do i = sys_size + 1, sys_size + 2*nb*nnode var_MOK = int(i, MPI_OFFSET_KIND) + ! Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), 'native', mpi_info_int, ierr) @@ -818,6 +874,7 @@ contains do i = 1, sys_size ! TODO: check if sys_size is correct var_MOK = int(i, MPI_OFFSET_KIND) + ! Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), 'native', mpi_info_int, ierr) @@ -825,9 +882,11 @@ contains end do end if + ! Correction for the lagrangian subgrid bubble model if (present(beta)) then var_MOK = int(sys_size + 1, MPI_OFFSET_KIND) + ! 
Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(sys_size + 1), 'native', mpi_info_int, ierr) @@ -836,21 +895,33 @@ contains call MPI_FILE_CLOSE(ifile, ierr) + ! Write ib data if (ib) then call s_write_parallel_ib_data(t_step) + ! write (file_loc, '(A)') 'ib.dat' file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) call + ! MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & mpi_info_int, ifile, ierr) + + ! var_MOK = int(sys_size + 1, MPI_OFFSET_KIND) disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1 + + ! int(t_step/t_step_save)) + + ! call MPI_FILE_SET_VIEW(ifile, disp, MPI_INTEGER, MPI_IO_IB_DATA%view, & 'native', mpi_info_int, ierr) call + ! MPI_FILE_WRITE_ALL(ifile, MPI_IO_IB_DATA%var%sf, data_size, & MPI_INTEGER, status, ierr) call + ! MPI_FILE_CLOSE(ifile, ierr) end if end if #endif end subroutine s_write_parallel_data_files - !> Write immersed boundary marker data to a serial (per-processor) unformatted file + !> @brief Writes immersed boundary marker data to a serial (per-processor) unformatted file. subroutine s_write_serial_ib_data(time_step) integer, intent(in) :: time_step character(LEN=path_len + 2*name_len) :: file_path character(LEN=path_len + 2*name_len) :: t_step_dir + ! Creating or overwriting the time-step root directory + write (t_step_dir, '(A,I0,A,I0)') trim(case_dir) // '/p_all' write (t_step_dir, '(a,i0,a,i0)') trim(case_dir) // '/p_all/p', proc_rank, '/', time_step write (file_path, '(A,I0,A)') trim(t_step_dir) // '/ib_data.dat' @@ -862,7 +933,7 @@ contains end subroutine s_write_serial_ib_data - !> Write immersed boundary marker data in parallel using MPI I/O + !> @brief Writes immersed boundary marker data in parallel using MPI I/O. 
subroutine s_write_parallel_ib_data(time_step) integer, intent(in) :: time_step @@ -877,6 +948,7 @@ contains $:GPU_UPDATE(host='[ib_markers%sf]') + ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) m_MOK = int(m_glb + 1, MPI_OFFSET_KIND) n_MOK = int(n_glb + 1, MPI_OFFSET_KIND) @@ -899,7 +971,7 @@ contains end subroutine s_write_parallel_ib_data - !> Dispatch immersed boundary data output to the serial or parallel writer + !> @brief Dispatches immersed boundary data output to the serial or parallel writer. subroutine s_write_ib_data_file(time_step) integer, intent(in) :: time_step @@ -912,7 +984,7 @@ contains end subroutine s_write_ib_data_file - !> Write IB state records to D/ib_state.dat (rank 0 only) + !> @brief Writes IB state records to D/ib_state.dat. Must be called only on rank 0. impure subroutine s_write_ib_state_file() integer :: i @@ -924,7 +996,9 @@ contains end subroutine s_write_ib_state_file - !> Write center-of-mass data at the current time step + !> This writes a formatted data file where the root processor can write out the CoM information + !! @param t_step Current time-step + !! @param c_mass_in Center of mass information impure subroutine s_write_com_files(t_step, c_mass_in) integer, intent(in) :: t_step @@ -932,6 +1006,8 @@ contains integer :: i !< Generic loop iterator real(wp) :: nondim_time !< Non-dimensional time + ! Non-dimensional time calculation + if (t_step_old /= dflt_int) then nondim_time = real(t_step + t_step_old, wp)*dt else @@ -957,7 +1033,10 @@ contains end subroutine s_write_com_files - !> Write flow probe data at the current time step + !> This writes a formatted data file for the flow probe information + !! @param t_step Current time-step + !! @param q_cons_vf Conservative variables + !! @param accel_mag Acceleration magnitude information impure subroutine s_write_probe_files(t_step, q_cons_vf, accel_mag) integer, intent(in) :: t_step @@ -1004,6 +1083,7 @@ contains T = dflt_T_guess + ! 
Non-dimensional time calculation if (time_stepper == 23) then nondim_time = mytime else @@ -1015,6 +1095,7 @@ contains end if do i = 1, num_probes + ! Zeroing out flow variables for all processors rho = 0._wp do s = 1, num_vels vel(s) = 0._wp @@ -1041,6 +1122,7 @@ contains end do damage_state = 0._wp + ! Find probe location in terms of indices on a specific processor if (n == 0) then if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then do s = -1, m @@ -1494,17 +1576,21 @@ contains end subroutine s_write_probe_files - !> Write footer with stability criteria extrema and run-time to the information file, then close it + !> The goal of this subroutine is to write to the run-time information file basic footer information applicable to the current + !! computation and to close the file when done. The footer contains the stability criteria extrema over all of the time-steps + !! and the simulation run-time. impure subroutine s_close_run_time_information_file real(wp) :: run_time !< Run-time of the simulation + ! 
Writing the footer of and closing the run-time information file + write (3, '(A)') ' ' write (3, '(A)') '' write (3, '(A,F9.6)') 'ICFL Max: ', icfl_max if (viscous) write (3, '(A,F9.6)') 'VCFL Max: ', vcfl_max - if (viscous) write (3, '(A,F10.6)') 'Rc Min: ', Rc_min + if (viscous) write (3, '(A,ES16.6)') 'Rc Min: ', Rc_min call cpu_time(run_time) @@ -1518,7 +1604,7 @@ contains !> Closes communication files impure subroutine s_close_com_files() - integer :: i !< Generic loop iterator + integer :: i do i = 1, num_fluids close (i + 120) @@ -1529,7 +1615,7 @@ contains !> Closes probe files impure subroutine s_close_probe_files - integer :: i !< Generic loop iterator + integer :: i do i = 1, num_probes close (i + 30) @@ -1537,28 +1623,25 @@ contains end subroutine s_close_probe_files - !> Close the immersed boundary state file impure subroutine s_close_ib_state_file close (ib_state_unit) end subroutine s_close_ib_state_file - !> Initialize the data output module + !> The computation of parameters, the allocation of memory, the association of pointers and/or the execution of any other + !! procedures that are necessary to setup the module. impure subroutine s_initialize_data_output_module integer :: i, m_ds, n_ds, p_ds + ! 
Allocating/initializing ICFL, VCFL, CCFL and Rc stability criteria + if (run_time_info) then - @:ALLOCATE(icfl_sf(0:m, 0:n, 0:p)) icfl_max = 0._wp - if (viscous) then - @:ALLOCATE(vcfl_sf(0:m, 0:n, 0:p)) - @:ALLOCATE(Rc_sf (0:m, 0:n, 0:p)) - vcfl_max = 0._wp - Rc_min = 1.e3_wp + Rc_min = 1.e12_wp end if end if @@ -1588,13 +1671,6 @@ contains @:DEALLOCATE(c_mass) end if - if (run_time_info) then - @:DEALLOCATE(icfl_sf) - if (viscous) then - @:DEALLOCATE(vcfl_sf, Rc_sf) - end if - end if - if (down_sample) then do i = 1, sys_size deallocate (q_cons_temp_ds(i)%sf) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 079d36ebc2..c0d3d9c2ae 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -14,6 +14,7 @@ module m_global_parameters use m_derived_types use m_helper_basic + ! $:USE_GPU_MODULE() implicit none @@ -26,8 +27,10 @@ module m_global_parameters character(LEN=path_len) :: case_dir !< Case folder location logical :: run_time_info !< Run-time output flag integer :: t_step_old !< Existing IC/grid folder + ! Computational Domain Parameters integer :: proc_rank !< Rank of the local processor + !> @name Number of cells in the x-, y- and z-directions, respectively !> @{ integer :: m, n, p @@ -51,20 +54,22 @@ module m_global_parameters !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively !> @{ real(wp), target, allocatable, dimension(:) :: x_cb, y_cb, z_cb + type(bounds_info), dimension(3) :: glb_bounds !> @} !> @name Cell-center (CC) locations in the x-, y- and z-directions, respectively !> @{ real(wp), target, allocatable, dimension(:) :: x_cc, y_cc, z_cc !> @} - ! 
type(bounds_info) :: x_domain, y_domain, z_domain !< Locations of the domain bounds in the x-, y- and z-coordinate directions + !> @name Cell-width distributions in the x-, y- and z-directions, respectively !> @{ real(wp), target, allocatable, dimension(:) :: dx, dy, dz !> @} real(wp) :: dt !< Size of the time-step - $:GPU_DECLARE(create='[x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz, dt, m, n, p]') + + $:GPU_DECLARE(create='[x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz, dt, m, n, p, glb_bounds]') !> @name Starting time-step iteration, stopping time-step iteration and the number of time-step iterations between successive !! solution backups, respectively @@ -81,6 +86,7 @@ module m_global_parameters logical :: cfl_adap_dt, cfl_const_dt, cfl_dt integer :: t_step_print !< Number of time-steps between printouts + ! Simulation Algorithm Parameters integer :: model_eqns !< Multicomponent flow model #:if MFC_CASE_OPTIMIZATION @@ -206,10 +212,12 @@ module m_global_parameters integer :: relax_model !< Relaxation model real(wp) :: palpha_eps !< trigger parameter for the p relaxation procedure, phase change model real(wp) :: ptgalpha_eps !< trigger parameter for the pTg relaxation procedure, phase change model + $:GPU_DECLARE(create='[relax, relax_model, palpha_eps, ptgalpha_eps]') - integer :: num_bc_patches - logical :: bc_io + integer :: num_bc_patches + logical :: bc_io + logical, dimension(3) :: periodic_bc !> @name Boundary conditions (BC) in the x-, y- and z-directions, respectively !> @{ type(int_bounds_info) :: bc_x, bc_y, bc_z @@ -221,18 +229,27 @@ module m_global_parameters #elif defined(MFC_OpenMP) $:GPU_DECLARE(create='[bc_x, bc_y, bc_z]') #endif - type(bounds_info) :: x_domain, y_domain, z_domain - $:GPU_DECLARE(create='[x_domain, y_domain, z_domain]') - real(wp) :: x_a, y_a, z_a - real(wp) :: x_b, y_b, z_b - logical :: parallel_io !< Format of the data files - logical :: file_per_process !< shared file or not when using parallel io - integer :: precision !< 
Precision of output files - logical :: down_sample !< down sample the output files + + logical :: parallel_io !< Format of the data files + logical :: file_per_process !< shared file or not when using parallel io + integer :: precision !< Precision of output files + logical :: down_sample !< down sample the output files $:GPU_DECLARE(create='[down_sample]') - integer, allocatable, dimension(:) :: proc_coords !< Processor coordinates in MPI_CART_COMM - integer, allocatable, dimension(:) :: start_idx !< Starting cell-center index of local processor in global grid + integer, allocatable, dimension(:) :: proc_coords !< Processor coordinates in MPI_CART_COMM + type(bounds_info), allocatable, dimension(:) :: pcomm_coords + $:GPU_DECLARE(create='[pcomm_coords]') + !! Coordinates for EL particle transfer + + type(bounds_info), allocatable, dimension(:) :: pcomm_coords_ghost + $:GPU_DECLARE(create='[pcomm_coords_ghost]') + !! Coordinates for EL particle transfer + + type(int_bounds_info), dimension(3) :: nidx !< Indices for neighboring processors + integer, allocatable, dimension(:,:,:) :: neighbor_ranks + !! Neighbor ranks + + integer, allocatable, dimension(:) :: start_idx !< Starting cell-center index of local processor in global grid type(mpi_io_var), public :: MPI_IO_DATA type(mpi_io_ib_var), public :: MPI_IO_IB_DATA type(mpi_io_airfoil_ib_var), public :: MPI_IO_airfoil_IB_DATA @@ -293,7 +310,9 @@ module m_global_parameters $:GPU_DECLARE(create='[Re_size, Re_size_max, Re_idx]') - ! WENO averaging flag: use arithmetic mean or unaltered WENO-reconstructed cell-boundary values + ! The WENO average (WA) flag regulates whether the calculation of any cell- average spatial derivatives is carried out in each + ! cell by utilizing the arithmetic mean of the left and right, WENO-reconstructed, cell-boundary values or simply, the unaltered + ! left and right, WENO-reconstructed, cell- boundary values. 
!> @{ real(wp) :: wa_flg !> @} @@ -311,24 +330,46 @@ module m_global_parameters $:GPU_DECLARE(create='[dir_idx, dir_flg, dir_idx_tau]') + !> The number of cells that are necessary to be able to store enough boundary conditions data to march the solution in the + !! physical computational domain to the next time-step. integer :: buff_size !< Number of ghost cells for boundary condition storage + $:GPU_DECLARE(create='[buff_size]') - integer :: shear_num !< Number of shear stress components - integer, dimension(3) :: shear_indices !< Indices of the stress components that represent shear stress - integer :: shear_BC_flip_num !< Number of shear stress components to reflect for boundary conditions + integer, allocatable :: beta_vars(:) !< Indices of variables to communicate for bubble/particle coupling + + $:GPU_DECLARE(create='[beta_vars]') + + integer :: shear_num !! Number of shear stress components + integer, dimension(3) :: shear_indices !< Indices of the stress components that represent shear stress + integer :: shear_BC_flip_num !< Number of shear stress components to reflect for boundary conditions + !> Indices of shear stress components to reflect for boundary conditions. Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, + !! [indices]) integer, dimension(3, 2) :: shear_BC_flip_indices !< Shear stress BC reflection indices (1:3, 1:shear_BC_flip_num) + $:GPU_DECLARE(create='[shear_num, shear_indices, shear_BC_flip_num, shear_BC_flip_indices]') ! END: Simulation Algorithm Parameters ! Fluids Physical Parameters + !> Database of the physical parameters of each of the fluids that is present in the flow. These include the stiffened gas + !! equation of state parameters, and the Reynolds numbers. type(physical_parameters), dimension(num_fluids_max) :: fluid_pp !< Stiffened gas EOS parameters and Reynolds numbers per fluid + ! 
Subgrid Bubble Parameters type(subgrid_bubble_physical_parameters) :: bub_pp - integer :: fd_order !< Finite-difference order for CoM and flow probe derivatives - integer :: fd_number !< Finite-difference half-stencil size: MAX(1, fd_order/2) + + ! Subgrid Particle Parameters + type(subgrid_particle_physical_parameters) :: particle_pp + + !> The order of the finite-difference (fd) approximations of the first-order derivatives that need to be evaluated when the CoM + !! or flow probe data files are to be written at each time step + integer :: fd_order + + !> The finite-difference number is given by MAX(1, fd_order/2). Essentially, it is a measure of the half-size of the + !! finite-difference stencil for the selected order of accuracy. + integer :: fd_number $:GPU_DECLARE(create='[fd_order, fd_number]') logical :: probe_wrt @@ -352,6 +393,9 @@ module m_global_parameters type(ib_patch_parameters), dimension(num_patches_max) :: patch_ib !< Immersed boundary patch parameters type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l integer :: Np + !! Database of the immersed boundary patch parameters for each of the patches employed in the configuration of the initial + !! condition. Note that the maximum allowable number of patches, num_patches_max, may be changed in the module + !! m_derived_types.f90. $:GPU_DECLARE(create='[ib, num_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l]') !> @} @@ -429,6 +473,12 @@ module m_global_parameters $:GPU_DECLARE(create='[R0ref, p0ref, rho0ref, T0ref, ss, pv, vd, mu_l, mu_v, mu_g, gam_v, gam_g, M_v, M_g, cp_v, cp_g, R_v, R_g]') !> @} + !> @name Solid particle physical parameters + !> @{ + real(wp) :: cp_particle, rho0ref_particle + $:GPU_DECLARE(create='[rho0ref_particle, cp_particle]') + !> @} + !> @name Acoustic acoustic_source parameters !> @{ logical :: acoustic_source !< Acoustic source switch @@ -470,16 +520,31 @@ module m_global_parameters !> @name lagrangian subgrid bubble parameters !> @{! 
- logical :: bubbles_lagrange !< Lagrangian subgrid bubble model switch - type(bubbles_lagrange_parameters) :: lag_params !< Lagrange bubbles' parameters - $:GPU_DECLARE(create='[bubbles_lagrange, lag_params]') + logical :: bubbles_lagrange !< Lagrangian subgrid bubble model switch + logical :: particles_lagrange !< Lagrangian subgrid particle model switch + type(bubbles_lagrange_parameters) :: lag_params !< Lagrange bubbles' parameters + integer :: n_el_bubs_loc, n_el_bubs_glb !< Number of Lagrangian bubbles (local and global) + integer :: n_el_particles_loc, n_el_particles_glb !< Number of Lagrangian bubbles (local and global) + logical :: moving_lag_bubbles + logical :: moving_lag_particles + logical :: lag_pressure_force + logical :: lag_gravity_force + integer :: lag_vel_model, lag_drag_model + $:GPU_DECLARE(create='[bubbles_lagrange, lag_params, n_el_bubs_loc, n_el_bubs_glb]') + $:GPU_DECLARE(create='[particles_lagrange, n_el_particles_loc, n_el_particles_glb]') + $:GPU_DECLARE(create='[moving_lag_particles]') + $:GPU_DECLARE(create='[moving_lag_bubbles, lag_vel_model, lag_drag_model]') + $:GPU_DECLARE(create='[lag_pressure_force, lag_gravity_force]') !> @} real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) $:GPU_DECLARE(create='[Bx0]') logical :: fft_wrt + !> AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional + !! is false logical :: dummy !< AMDFlang workaround for case-optimization + GPU-kernel bug + !> @name Continuum damage model parameters !> @{! real(wp) :: tau_star !< Stress threshold for continuum damage modeling @@ -502,6 +567,7 @@ contains impure subroutine s_assign_default_values_to_user_inputs integer :: i, j !< Generic loop iterator + ! Logistics case_dir = '.' @@ -596,6 +662,7 @@ contains num_bc_patches = 0 bc_io = .false. + periodic_bc = .false. 
bc_x%beg = dflt_int; bc_x%end = dflt_int bc_y%beg = dflt_int; bc_y%end = dflt_int @@ -608,9 +675,9 @@ contains #:endfor #:endfor - x_domain%beg = dflt_real; x_domain%end = dflt_real - y_domain%beg = dflt_real; y_domain%end = dflt_real - z_domain%beg = dflt_real; z_domain%end = dflt_real + glb_bounds(1)%beg = dflt_real; glb_bounds(1)%end = dflt_real + glb_bounds(2)%beg = dflt_real; glb_bounds(2)%end = dflt_real + glb_bounds(3)%beg = dflt_real; glb_bounds(3)%end = dflt_real ! Fluids physical parameters do i = 1, num_fluids_max @@ -645,6 +712,10 @@ contains bub_pp%R_v = dflt_real; R_v = dflt_real bub_pp%R_g = dflt_real; R_g = dflt_real + ! Subgrid particle parameters + particle_pp%rho0ref_particle = dflt_real + particle_pp%cp_particle = dflt_real + ! Tait EOS rhoref = dflt_real pref = dflt_real @@ -774,10 +845,29 @@ contains lag_params%massTransfer_model = .false. lag_params%write_bubbles = .false. lag_params%write_bubbles_stats = .false. + lag_params%write_void_evol = .false. lag_params%nBubs_glb = dflt_int + lag_params%vel_model = dflt_int + lag_params%drag_model = dflt_int + lag_params%pressure_force = .true. + lag_params%gravity_force = .false. lag_params%epsilonb = 1._wp lag_params%charwidth = dflt_real + lag_params%charNz = dflt_int lag_params%valmaxvoid = dflt_real + lag_params%input_path = 'input/lag_bubbles.dat' + lag_params%nParticles_glb = dflt_int + lag_params%qs_drag_model = dflt_int + lag_params%stokes_drag = dflt_int + lag_params%added_mass_model = dflt_int + lag_params%interpolation_order = dflt_int + lag_params%collision_force = .false. + + moving_lag_bubbles = .false. + lag_vel_model = dflt_int + + particles_lagrange = .false. + moving_lag_particles = .false. ! 
Continuum damage model tau_star = dflt_real @@ -837,7 +927,8 @@ contains end subroutine s_assign_default_values_to_user_inputs - !> Initialize the global parameters module + !> The computation of parameters, the allocation of memory, the association of pointers and/or the execution of any other + !! procedures that are necessary to setup the module. impure subroutine s_initialize_global_parameters_module integer :: i, j, k @@ -845,7 +936,6 @@ contains #:if not MFC_CASE_OPTIMIZATION ! Determining the degree of the WENO polynomials - if (recon_type == WENO_TYPE) then weno_polyn = (weno_order - 1)/2 if (teno) then @@ -863,7 +953,9 @@ contains $:GPU_UPDATE(device='[igr, igr_order, igr_iter_solver]') #:endif - ! Initialize counts: viscous fluids, surface-tension interfaces, curvature interfaces + ! Initializing the number of fluids for which viscous effects will be non-negligible, the number of distinctive material + ! interfaces for which surface tension will be important and also, the number of fluids for which the physical and geometric + ! curvatures of the interfaces will be computed Re_size = 0 Re_size_max = 0 @@ -883,19 +975,21 @@ contains sys_size = adv_idx%end ! Volume Fraction Model - else + else ! Fully 3D cylindrical grid ! Annotating structure of the state and flux vectors belonging to the system of equations defined by the selected number ! of spatial dimensions and the volume fraction model if (model_eqns == 2) then - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = num_fluids - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each direction mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation if (igr) then - ! IGR: volume fractions after energy (N-1 for N fluids; skipped when num_fluids=1) - adv_idx%beg = E_idx + 1 ! Alpha for fluid 1 + ! Volume fractions are stored in the indices immediately following the energy equation. 
IGR tracks a total of + ! (N-1) volume fractions for N fluids, hence the "-1" in adv_idx%end. If num_fluids = 1 then adv_idx%end < + ! adv_idx%beg, which skips all loops over the volume fractions since there is no volume fraction to track + adv_idx%beg = E_idx + 1 adv_idx%end = E_idx + num_fluids - 1 else ! Volume fractions are stored in the indices immediately following the energy equation. WENO/MUSCL + Riemann @@ -978,11 +1072,11 @@ contains sys_size = B_idx%end end if else if (model_eqns == 3) then - cont_idx%beg = 1 + cont_idx%beg = 1 ! one continuity equation cont_idx%end = num_fluids - mom_idx%beg = cont_idx%end + 1 + mom_idx%beg = cont_idx%end + 1 ! one momentum equation in each direction mom_idx%end = cont_idx%end + num_vels - E_idx = mom_idx%end + 1 + E_idx = mom_idx%end + 1 ! one energy equation adv_idx%beg = E_idx + 1 adv_idx%end = E_idx + num_fluids alf_idx = adv_idx%end @@ -1029,7 +1123,8 @@ contains end if end if - ! Count fluids with non-negligible viscous effects (Re > 0) + ! Determining the number of fluids for which the shear and the volume Reynolds numbers, e.g. viscous effects, are + ! important do i = 1, num_fluids if (fluid_pp(i)%Re(1) > 0) Re_size(1) = Re_size(1) + 1 if (fluid_pp(i)%Re(2) > 0) Re_size(2) = Re_size(2) + 1 @@ -1122,6 +1217,16 @@ contains ! END: Volume Fraction Model + if (bubbles_lagrange) then + @:ALLOCATE(beta_vars(1:3)) + beta_vars(1:3) = [1, 2, 5] + $:GPU_UPDATE(device='[beta_vars]') + else if (particles_lagrange) then + @:ALLOCATE(beta_vars(1:8)) + beta_vars(1:8) = [1, 2, 3, 4, 5, 6, 7, 8] + $:GPU_UPDATE(device='[beta_vars]') + end if + if (chemistry) then species_idx%beg = sys_size + 1 species_idx%end = sys_size + num_species @@ -1134,7 +1239,10 @@ contains else if (bubbles_lagrange) then allocate (MPI_IO_DATA%view(1:sys_size + 1)) allocate (MPI_IO_DATA%var(1:sys_size + 1)) - else + else if (particles_lagrange) then + allocate (MPI_IO_DATA%view(1:sys_size + 1)) + allocate (MPI_IO_DATA%var(1:sys_size + 1)) + else ! 
Fully 3D cylindrical grid allocate (MPI_IO_DATA%view(1:sys_size)) allocate (MPI_IO_DATA%var(1:sys_size)) end if @@ -1155,9 +1263,16 @@ contains allocate (MPI_IO_DATA%var(i)%sf(0:m,0:n,0:p)) MPI_IO_DATA%var(i)%sf => null() end do + else if (particles_lagrange) then + do i = 1, sys_size + 1 + allocate (MPI_IO_DATA%var(i)%sf(0:m,0:n,0:p)) + MPI_IO_DATA%var(i)%sf => null() + end do end if - ! Configure WENO averaging flag (arithmetic mean vs. unaltered values) + ! Configuring the WENO average flag that will be used to regulate whether any spatial derivatives are to computed in each + ! cell by using the arithmetic mean of left and right, WENO-reconstructed, cell-boundary values or otherwise, the unaltered + ! left and right, WENO-reconstructed, cell-boundary values wa_flg = 0._wp; if (weno_avg) wa_flg = 1._wp $:GPU_UPDATE(device='[wa_flg]') @@ -1169,20 +1284,16 @@ contains if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m,0:n,0:p)) Np = 0 - if (elasticity) then - fd_number = max(1, fd_order/2) - end if - + if (elasticity) fd_number = max(1, fd_order/2) if (mhd) then ! TODO merge with above; waiting for hyperelasticity PR fd_number = max(1, fd_order/2) end if - - if (probe_wrt) then - fd_number = max(1, fd_order/2) - end if + if (probe_wrt) fd_number = max(1, fd_order/2) + if (bubbles_lagrange) fd_number = max(1, fd_order/2) + if (particles_lagrange) fd_number = max(1, fd_order/2) call s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, igr_order, buff_size, idwint, idwbuff, viscous, & - & bubbles_lagrange, m, n, p, num_dims, igr, ib) + & bubbles_lagrange, particles_lagrange, m, n, p, num_dims, igr, ib, fd_number) $:GPU_UPDATE(device='[idwint, idwbuff]') ! Configuring Coordinate Direction Indexes @@ -1295,6 +1406,8 @@ contains #:endif allocate (proc_coords(1:num_dims)) + @:ALLOCATE(pcomm_coords(1:num_dims)) + @:ALLOCATE(pcomm_coords_ghost(1:num_dims)) if (parallel_io .neqv. .true.) 
return @@ -1327,6 +1440,14 @@ contains end if deallocate (proc_coords) + + @:DEALLOCATE(pcomm_coords) + @:DEALLOCATE(pcomm_coords_ghost) + + if (bubbles_lagrange .or. particles_lagrange) then + @:DEALLOCATE(beta_vars) + end if + if (parallel_io) then deallocate (start_idx) @@ -1334,6 +1455,10 @@ contains do i = 1, sys_size + 1 MPI_IO_DATA%var(i)%sf => null() end do + else if (particles_lagrange) then + do i = 1, sys_size + 1 + MPI_IO_DATA%var(i)%sf => null() + end do else do i = 1, sys_size MPI_IO_DATA%var(i)%sf => null() @@ -1355,6 +1480,10 @@ contains if (p == 0) return @:DEALLOCATE(z_cb, z_cc, dz) + if (allocated(neighbor_ranks)) then + @:DEALLOCATE(neighbor_ranks) + end if + end subroutine s_finalize_global_parameters_module end module m_global_parameters diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index a7d8e45eee..b394e25e58 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -32,26 +32,26 @@ module m_ib_patches integer :: smooth_patch_id real(wp) :: smooth_coeff $:GPU_DECLARE(create='[smooth_patch_id, smooth_coeff]') - ! These variables are analogous in both meaning and use to the similarly named components in the ic_patch_parameters type (see - ! m_derived_types.f90 for additional details). They are employed as a means to more concisely perform the actions necessary to - ! lay out a particular patch on the grid. + !! These variables are analogous in both meaning and use to the similarly named components in the ic_patch_parameters type (see + !! m_derived_types.f90 for additional details). They are employed as a means to more concisely perform the actions necessary to + !! lay out a particular patch on the grid. real(wp) :: cart_x, cart_y, cart_z real(wp) :: sph_phi $:GPU_DECLARE(create='[cart_x, cart_y, cart_z, sph_phi]') - ! Variables to be used to hold cell locations in Cartesian coordinates if 3D simulation is using cylindrical coordinates + !! 
Variables to be used to hold cell locations in Cartesian coordinates if 3D simulation is using cylindrical coordinates type(bounds_info) :: x_boundary, y_boundary, z_boundary $:GPU_DECLARE(create='[x_boundary, y_boundary, z_boundary]') - ! These variables combine the centroid and length parameters associated with a particular patch to yield the locations of the - ! patch boundaries in the x-, y- and z-coordinate directions. They are used as a means to concisely perform the actions - ! necessary to lay out a particular patch on the grid. + !! These variables combine the centroid and length parameters associated with a particular patch to yield the locations of the + !! patch boundaries in the x-, y- and z-coordinate directions. They are used as a means to concisely perform the actions + !! necessary to lay out a particular patch on the grid. character(len=5) :: istr !< string to store int to string result for error checking contains - !> Apply all immersed boundary patch geometries to mark interior cells in the IB marker array + !> @brief Applies all immersed boundary patch geometries to mark interior cells in the IB marker array. impure subroutine s_apply_ib_patches(ib_markers) type(integer_field), intent(inout) :: ib_markers @@ -112,21 +112,26 @@ contains end subroutine s_apply_ib_patches - !> Mark cells inside a circular immersed boundary + !> The circular patch is a 2D geometry that may be used, for example, in creating a bubble or a droplet. The geometry of the + !! patch is well-defined when its centroid and radius are provided. Note that the circular patch DOES allow for the smoothing of + !! its boundary. + !! @param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids + !! 
@param ib True if this patch is an immersed boundary subroutine s_ib_circle(patch_id, ib_markers, xp, yp) integer, intent(in) :: patch_id integer, intent(in) :: xp, yp !< integers containing the periodicity projection information type(integer_field), intent(inout) :: ib_markers - real(wp), dimension(1:2) :: center + real(wp), dimension(1:2) :: center !< x and y coordinates in local IB frame real(wp) :: radius integer :: i, j, il, ir, jl, jr !< Generic loop iterators integer :: encoded_patch_id ! Transferring the circular patch's radius, centroid, smearing patch identity and smearing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) radius = patch_ib(patch_id)%radius ! encode the periodicity information into the patch_id @@ -140,7 +145,8 @@ contains call get_bounding_indices(center(1) - radius, center(1) + radius, x_cc, il, ir) call get_bounding_indices(center(2) - radius, center(2) + radius, y_cc, jl, jr) - ! Assign primitive variables if circle covers cell and patch has write permission + ! Checking whether the circle covers a particular cell in the domain and verifying whether the current patch has permission + ! to write to that cell. If both queries check out, the primitive variables of the current patch are assigned to this cell. $:GPU_PARALLEL_LOOP(private='[i, j]', copyin='[encoded_patch_id, center, radius]', collapse=2) do j = jl, jr @@ -154,23 +160,25 @@ contains end subroutine s_ib_circle - !> Mark cells inside a 2D NACA 4-digit airfoil immersed boundary + !> @brief Marks cells inside a 2D NACA 4-digit airfoil immersed boundary using upper and lower surface grids. + !! 
@param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids subroutine s_ib_airfoil(patch_id, ib_markers, xp, yp) integer, intent(in) :: patch_id type(integer_field), intent(inout) :: ib_markers - integer, intent(in) :: xp, yp !< integers containing the periodicity projection information + integer, intent(in) :: xp, yp !< integers containing the periodicity projection information real(wp) :: f, ca_in, pa, ma, ta real(wp) :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c - integer :: i, j, k, il, ir, jl, jr + integer :: i, j, k, il, ir, jl, jr !< Generic loop iterators integer :: Np1, Np2 integer :: encoded_patch_id - real(wp), dimension(1:3) :: xy_local, offset !< x and y coordinates in local IB frame - real(wp), dimension(1:2) :: center !< x and y coordinates in local IB frame + real(wp), dimension(1:3) :: xy_local, offset !< x and y coordinates in local IB frame + real(wp), dimension(1:2) :: center !< x and y coordinates in local IB frame real(wp), dimension(1:3,1:3) :: inverse_rotation - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) ca_in = patch_ib(patch_id)%c pa = patch_ib(patch_id)%p ma = patch_ib(patch_id)%m @@ -187,7 +195,7 @@ contains @:ALLOCATE(airfoil_grid_u(1:Np)) @:ALLOCATE(airfoil_grid_l(1:Np)) - ! TODO :: The below instantiations are already handled by the loop below + ! TODO :: The below instantiations are already handles by the loop below airfoil_grid_u(1)%x = 0._wp airfoil_grid_u(1)%y = 0._wp @@ -195,7 +203,7 @@ contains airfoil_grid_l(1)%y = 0._wp do i = 1, Np1 + Np2 - 1 - ! TODO :: This allocates the upper and lower airfoil arrays, and does not need to be performed each time the IB + ! 
TODO :: This allocated the upper and lower airfoil arrays, and does not need to be performed each time the IB ! markers are updated. Place this as a separate subroutine. if (i <= Np1) then xc = i*(pa*ca_in/Np1) @@ -257,9 +265,9 @@ contains & ca_in, airfoil_grid_u, airfoil_grid_l]', collapse=2) do j = jl, jr do i = il, ir - xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB - xy_local = matmul(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinates - xy_local = xy_local - offset ! airfoils are a patch that require a centroid offset + xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] + xy_local = matmul(inverse_rotation, xy_local) + xy_local = xy_local - offset if (xy_local(1) >= 0._wp .and. xy_local(1) <= ca_in) then xa = xy_local(1)/ca_in @@ -277,11 +285,13 @@ contains end do if (f_approx_equal(airfoil_grid_u(k)%x, xy_local(1))) then if (xy_local(2) <= airfoil_grid_u(k)%y) then + !!IB ib_markers%sf(i, j, 0) = encoded_patch_id end if else f = (airfoil_grid_u(k)%x - xy_local(1))/(airfoil_grid_u(k)%x - airfoil_grid_u(k - 1)%x) if (xy_local(2) <= ((1._wp - f)*airfoil_grid_u(k)%y + f*airfoil_grid_u(k - 1)%y)) then + !!IB ib_markers%sf(i, j, 0) = encoded_patch_id end if end if @@ -292,12 +302,14 @@ contains end do if (f_approx_equal(airfoil_grid_l(k)%x, xy_local(1))) then if (xy_local(2) >= airfoil_grid_l(k)%y) then + !!IB ib_markers%sf(i, j, 0) = encoded_patch_id end if else f = (airfoil_grid_l(k)%x - xy_local(1))/(airfoil_grid_l(k)%x - airfoil_grid_l(k - 1)%x) if (xy_local(2) >= ((1._wp - f)*airfoil_grid_l(k)%y + f*airfoil_grid_l(k - 1)%y)) then + !!IB ib_markers%sf(i, j, 0) = encoded_patch_id end if end if @@ -309,7 +321,10 @@ contains end subroutine s_ib_airfoil - !> Mark cells inside a 3D extruded NACA 4-digit airfoil immersed boundary with finite span + !> @brief Marks cells inside a 3D extruded NACA 4-digit airfoil immersed boundary with finite span. + !! 
@param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids + !! @param ib True if this patch is an immersed boundary subroutine s_ib_3D_airfoil(patch_id, ib_markers, xp, yp, zp) integer, intent(in) :: patch_id @@ -322,9 +337,9 @@ contains real(wp), dimension(1:3) :: xyz_local, center, offset !< x, y, z coordinates in local IB frame real(wp), dimension(1:3,1:3) :: inverse_rotation - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) lz = patch_ib(patch_id)%length_z ca_in = patch_ib(patch_id)%c pa = patch_ib(patch_id)%p @@ -418,8 +433,8 @@ contains do i = il, ir xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), & & z_cc(l) - center(3)] ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates - xyz_local = xyz_local - offset ! airfoils are a patch that require a centroid offset + xyz_local = matmul(inverse_rotation, xyz_local) + xyz_local = xyz_local - offset if (xyz_local(3) >= z_min .and. xyz_local(3) <= z_max) then if (xyz_local(1) >= 0._wp .and. 
xyz_local(1) <= ca_in) then @@ -436,6 +451,7 @@ contains else f = (airfoil_grid_u(k)%x - xyz_local(1))/(airfoil_grid_u(k)%x - airfoil_grid_u(k - 1)%x) if (xyz_local(2) <= ((1._wp - f)*airfoil_grid_u(k)%y + f*airfoil_grid_u(k - 1)%y)) then + !!IB ib_markers%sf(i, j, l) = encoded_patch_id end if end if @@ -446,12 +462,14 @@ contains end do if (f_approx_equal(airfoil_grid_l(k)%x, xyz_local(1))) then if (xyz_local(2) >= airfoil_grid_l(k)%y) then + !!IB ib_markers%sf(i, j, l) = encoded_patch_id end if else f = (airfoil_grid_l(k)%x - xyz_local(1))/(airfoil_grid_l(k)%x - airfoil_grid_l(k - 1)%x) if (xyz_local(2) >= ((1._wp - f)*airfoil_grid_l(k)%y + f*airfoil_grid_l(k - 1)%y)) then + !!IB ib_markers%sf(i, j, l) = encoded_patch_id end if end if @@ -465,23 +483,29 @@ contains end subroutine s_ib_3D_airfoil - !> Mark cells inside a rectangular immersed boundary + !> The rectangular patch is a 2D geometry that may be used, for example, in creating a solid boundary, or pre-/post- shock + !! region, in alignment with the axes of the Cartesian coordinate system. The geometry of such a patch is well- defined when its + !! centroid and lengths in the x- and y- coordinate directions are provided. Please note that the rectangular patch DOES NOT + !! allow for the smoothing of its boundaries. + !! @param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids + !! 
@param ib True if this patch is an immersed boundary subroutine s_ib_rectangle(patch_id, ib_markers, xp, yp) integer, intent(in) :: patch_id type(integer_field), intent(inout) :: ib_markers integer, intent(in) :: xp, yp !< integers containing the periodicity projection information - integer :: i, j, il, ir, jl, jr !< generic loop iterators + integer :: i, j, il, ir, jl, jr !< Generic loop iterators integer :: encoded_patch_id - real(wp) :: corner_distance !< Equation of state parameters + real(wp) :: corner_distance real(wp), dimension(1:3) :: xy_local !< x and y coordinates in local IB frame real(wp), dimension(1:2) :: length, center !< x and y coordinates in local IB frame real(wp), dimension(1:3,1:3) :: inverse_rotation ! Transferring the rectangle's centroid and length information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) length(1) = patch_ib(patch_id)%length_x length(2) = patch_ib(patch_id)%length_y inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) @@ -498,7 +522,9 @@ contains call get_bounding_indices(center(1) - corner_distance, center(1) + corner_distance, x_cc, il, ir) call get_bounding_indices(center(2) - corner_distance, center(2) + corner_distance, y_cc, jl, jr) - ! Assign primitive variables if rectangle covers cell and patch has write permission + ! Checking whether the rectangle covers a particular cell in the domain and verifying whether the current patch has the + ! permission to write to that cell. If both queries check out, the primitive variables of the current patch are assigned to + ! this cell. 
$:GPU_PARALLEL_LOOP(private='[i, j, xy_local]', copyin='[encoded_patch_id, center, length, inverse_rotation, x_cc, & & y_cc]', collapse=2) do j = jl, jr @@ -518,12 +544,18 @@ contains end subroutine s_ib_rectangle - !> Mark cells inside a spherical immersed boundary + !> The spherical patch is a 3D geometry that may be used, for example, in creating a bubble or a droplet. The patch geometry is + !! well-defined when its centroid and radius are provided. Please note that the spherical patch DOES allow for the smoothing of + !! its boundary. + !! @param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids + !! @param ib True if this patch is an immersed boundary subroutine s_ib_sphere(patch_id, ib_markers, xp, yp, zp) integer, intent(in) :: patch_id type(integer_field), intent(inout) :: ib_markers integer, intent(in) :: xp, yp, zp !< integers containing the periodicity projection information + ! Generic loop iterators integer :: i, j, k integer :: il, ir, jl, jr, kl, kr @@ -531,13 +563,14 @@ contains real(wp) :: radius real(wp), dimension(1:3) :: center - ! Variables to initialize the pressure field that corresponds to the bubble-collapse test case found in Tiwari et al. (2013) + !! Variables to initialize the pressure field that corresponds to the bubble-collapse test case found in Tiwari et al. + !! (2013) ! 
Transferring spherical patch's radius, centroid, smoothing patch identity and smoothing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) radius = patch_ib(patch_id)%radius ! encode the periodicity information into the patch_id @@ -560,6 +593,7 @@ contains do k = kl, kr do j = jl, jr do i = il, ir + ! do i = -gp_layers, m+gp_layers if (grid_geometry == 3) then call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k)) else @@ -577,7 +611,12 @@ contains end subroutine s_ib_sphere - !> Mark cells inside a cuboidal immersed boundary + !> The cuboidal patch is a 3D geometry that may be used, for example, in creating a solid boundary, or pre-/post-shock region, + !! which is aligned with the axes of the Cartesian coordinate system. The geometry of such a patch is well- defined when its + !! centroid and lengths in the x-, y- and z-coordinate directions are provided. Please notice that the cuboidal patch DOES NOT + !! allow for the smearing of its boundaries. + !! @param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids subroutine s_ib_cuboid(patch_id, ib_markers, xp, yp, zp) integer, intent(in) :: patch_id @@ -591,9 +630,9 @@ contains ! 
Transferring the cuboid's centroid and length information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) length(1) = patch_ib(patch_id)%length_x length(2) = patch_ib(patch_id)%length_y length(3) = patch_ib(patch_id)%length_z @@ -630,7 +669,7 @@ contains cart_z = z_cc(k) end if xyz_local = [x_cc(i), cart_y, cart_z] - center ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates + xyz_local = matmul(inverse_rotation, xyz_local) if (-0.5*length(1) <= xyz_local(1) .and. 0.5*length(1) >= xyz_local(1) .and. -0.5*length(2) <= xyz_local(2) & & .and. 0.5*length(2) >= xyz_local(2) .and. -0.5*length(3) <= xyz_local(3) .and. 0.5*length(3) & @@ -645,7 +684,13 @@ contains end subroutine s_ib_cuboid - !> Mark cells inside a cylindrical immersed boundary + !> The cylindrical patch is a 3D geometry that may be used, for example, in setting up a cylindrical solid boundary confinement, + !! like a blood vessel. The geometry of this patch is well-defined when the centroid, the radius and the length along the + !! cylinder's axis, parallel to the x-, y- or z-coordinate direction, are provided. Please note that the cylindrical patch DOES + !! allow for the smoothing of its lateral boundary. + !! @param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids + !! 
@param ib True if this patch is an immersed boundary subroutine s_ib_cylinder(patch_id, ib_markers, xp, yp, zp) integer, intent(in) :: patch_id @@ -660,9 +705,9 @@ contains ! Transferring the cylindrical patch's centroid, length, radius, - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) length(1) = patch_ib(patch_id)%length_x length(2) = patch_ib(patch_id)%length_y length(3) = patch_ib(patch_id)%length_z @@ -698,7 +743,7 @@ contains cart_z = z_cc(k) end if xyz_local = [x_cc(i), cart_y, cart_z] - center ! get coordinate frame centered on IB - xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates + xyz_local = matmul(inverse_rotation, xyz_local) if (((.not. f_is_default(length(1)) .and. xyz_local(2)**2 + xyz_local(3)**2 <= radius**2 .and. & & -0.5_wp*length(1) <= xyz_local(1) .and. 0.5_wp*length(1) >= xyz_local(1)) & @@ -716,7 +761,7 @@ contains end subroutine s_ib_cylinder - !> Mark cells inside a 2D elliptical immersed boundary + !> @brief Marks cells inside a 2D elliptical immersed boundary defined by semi-axis lengths and rotation. subroutine s_ib_ellipse(patch_id, ib_markers, xp, yp) integer, intent(in) :: patch_id @@ -731,8 +776,8 @@ contains ! 
Transferring the ellipse's centroid and length information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) ellipse_coeffs(1) = 0.5_wp*patch_ib(patch_id)%length_x ellipse_coeffs(2) = 0.5_wp*patch_ib(patch_id)%length_y inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) @@ -769,6 +814,8 @@ contains end subroutine s_ib_ellipse !> The STL patch is a 2D geometry that is imported from an STL file. + !! @param patch_id is the patch identifier + !! @param ib_markers Array to track patch ids subroutine s_ib_model(patch_id, ib_markers, xp, yp) integer, intent(in) :: patch_id @@ -786,8 +833,8 @@ contains real(wp), dimension(1:3,1:3) :: inverse_rotation, rotation center = 0._wp - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) rotation(:,:) = patch_ib(patch_id)%rotation_matrix(:,:) offset(:) = patch_ib(patch_id)%centroid_offset(:) @@ -846,6 +893,8 @@ contains end subroutine s_ib_model !> The STL patch is a 3D geometry that is imported from an STL file. + !! @param patch_id is the patch identifier + !! 
@param ib_markers Array to track patch ids subroutine s_ib_3d_model(patch_id, ib_markers, xp, yp, zp) integer, intent(in) :: patch_id @@ -862,9 +911,9 @@ contains real(wp), dimension(1:3) :: bbox_min, bbox_max, local_corner, world_corner center = 0._wp - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) offset(:) = patch_ib(patch_id)%centroid_offset(:) spc = patch_ib(patch_id)%model_spc @@ -932,7 +981,7 @@ contains end subroutine s_ib_3d_model - !> Compute a rotation matrix for converting to the rotating frame of the boundary + !> Subroutine that computes a rotation matrix for converting to the rotating frame of the boundary subroutine s_update_ib_rotation_matrix(patch_id) integer, intent(in) :: patch_id @@ -978,7 +1027,7 @@ contains end subroutine s_update_ib_rotation_matrix - !> Convert cylindrical (r, theta) coordinates to Cartesian (y, z) + !> @brief Converts cylindrical (r, theta) coordinates to Cartesian (y, z) and stores in module variables. subroutine s_convert_cylindrical_to_cartesian_coord(cyl_y, cyl_z) $:GPU_ROUTINE(parallelism='[seq]') @@ -990,7 +1039,7 @@ contains end subroutine s_convert_cylindrical_to_cartesian_coord - !> Convert a 3D cylindrical coordinate vector (x, r, theta) to Cartesian (x, y, z) + !> @brief Converts a 3D cylindrical coordinate vector (x, r, theta) to Cartesian (x, y, z). 
pure function f_convert_cyl_to_cart(cyl) result(cart) $:GPU_ROUTINE(parallelism='[seq]') @@ -1002,7 +1051,7 @@ contains end function f_convert_cyl_to_cart - !> Convert cylindrical coordinates (x, r) to the spherical azimuthal angle phi + !> @brief Converts cylindrical coordinates (x, r) to the spherical azimuthal angle phi and stores in a module variable. subroutine s_convert_cylindrical_to_spherical_coord(cyl_x, cyl_y) $:GPU_ROUTINE(parallelism='[seq]') @@ -1052,7 +1101,7 @@ contains end subroutine get_bounding_indices - !> Encode the patch ID with a unique offset containing periodicity information + !> @brief encodes the patch id with a unique offset that contains information on how the IB marker wraps periodically subroutine s_encode_patch_periodicity(patch_id, x_periodicity, y_periodicity, z_periodicity, encoded_patch_id) integer, intent(in) :: patch_id, x_periodicity, y_periodicity, z_periodicity @@ -1070,7 +1119,7 @@ contains end subroutine s_encode_patch_periodicity - !> Decode the encoded ID to recover the original patch ID and periodicity + !> @brief decodes the encoded id to get out the original id and the way in which it is periodic subroutine s_decode_patch_periodicity(encoded_patch_id, patch_id, x_periodicity, y_periodicity, z_periodicity) $:GPU_ROUTINE(parallelism='[seq]') @@ -1096,7 +1145,7 @@ contains end subroutine s_decode_patch_periodicity - !> Determine the periodic wrapping bounds in each direction + !> @brief Determines if we should wrap periodically subroutine s_get_periodicities(xp_lower, xp_upper, yp_lower, yp_upper, zp_lower, zp_upper) integer, intent(out) :: xp_lower, xp_upper, yp_lower, yp_upper @@ -1104,33 +1153,26 @@ contains ! check domain wraps in x, y - #:for X in [('x'), ('y')] - ! check for periodicity - - if (bc_${X}$%beg == BC_PERIODIC) then - ${X}$p_lower = -1 - ${X}$p_upper = 1 - else - ! 
if it is not periodic, then both elements are 0 - ${X}$p_lower = 0 - ${X}$p_upper = 0 + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (num_dims >= ${ID}$) then + ! check for periodicity + if (bc_${X}$%beg == BC_PERIODIC) then + ${X}$p_lower = -1 + ${X}$p_upper = 1 + else + ! if it is not periodic, then both elements are 0 + ${X}$p_lower = 0 + ${X}$p_upper = 0 + end if end if #:endfor - ! z only if 3D - if (present(zp_lower) .and. p /= 0) then - if (bc_z%beg == BC_PERIODIC) then - zp_lower = -1 - zp_upper = 1 - else - zp_lower = 0 - zp_upper = 0 - end if - end if - end subroutine s_get_periodicities !> Archimedes spiral function + !! @param myth Angle + !! @param offset Thickness + !! @param a Starting position pure elemental function f_r(myth, offset, a) $:GPU_ROUTINE(parallelism='[seq]') diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 785bf2dec3..eb241db76a 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -47,7 +47,7 @@ contains if (p > 0) then @:ALLOCATE(ib_markers%sf(-buff_size:m+buff_size, -buff_size:n+buff_size, -buff_size:p+buff_size)) - else + else ! we do not have an analytic moment of inertia calculation and need to approximate it directly via a sum @:ALLOCATE(ib_markers%sf(-buff_size:m+buff_size, -buff_size:n+buff_size, 0:0)) end if @@ -79,10 +79,11 @@ contains $:GPU_UPDATE(device='[patch_ib(1:num_ibs)]') ! GPU routines require updated cell centers - $:GPU_UPDATE(device='[num_ibs, x_cc, y_cc, dx, dy, x_domain, y_domain]') + $:GPU_UPDATE(device='[num_ibs, x_cc, y_cc, dx, dy]') if (p /= 0) then - $:GPU_UPDATE(device='[z_cc, dz, z_domain]') + $:GPU_UPDATE(device='[z_cc, dz]') end if + $:GPU_UPDATE(device='[glb_bounds]') ! allocate STL models call s_instantiate_STL_models() @@ -102,7 +103,7 @@ contains if (moving_immersed_boundary_flag) then call s_mpi_allreduce_integer_sum(num_gps, max_num_gps) max_num_gps = min(max_num_gps*2, (m + 1)*(n + 1)*(p + 1)) - else + else ! 
we do not have an analytic moment of inertia calculation and need to approximate it directly via a sum max_num_gps = num_gps end if @@ -111,7 +112,6 @@ contains @:ALLOCATE(ghost_points(1:max_num_gps)) $:GPU_ENTER_DATA(copyin='[ghost_points]') - ! Ghost-cell IBM, Tseng & Ferziger JCP (2003), Mittal & Iaccarino ARFM (2005) call s_find_ghost_points(ghost_points) call s_apply_levelset(ghost_points, num_gps) @@ -122,14 +122,16 @@ contains end subroutine s_ibm_setup - !> Update the conservative variables at the ghost points + !> Subroutine that updates the conservative variables at the ghost points + !! @param pb_in Internal bubble pressure + !! @param mv_in Mass of vapor in bubble subroutine s_ibm_correct_state(q_cons_vf, q_prim_vf, pb_in, mv_in) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf !< Primitive Variables type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf !< Primitive Variables real(stp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), optional, intent(inout) :: pb_in, mv_in integer :: i, j, k, l, q, r !< Iterator variables - integer :: patch_id !< Patch ID of ghost point + integer :: patch_id real(wp) :: rho, gamma, pi_inf, dyn_pres !< Mixture variables real(wp), dimension(2) :: Re_K real(wp) :: G_K @@ -151,7 +153,7 @@ contains real(wp), dimension(nb*nmom) :: nmom_IP real(wp), dimension(nb*nnode) :: presb_IP, massv_IP #:endif - ! Primitive variables at the image point associated with a ghost point, interpolated from surrounding fluid cells. + !! Primitive variables at the image point associated with a ghost point, interpolated from surrounding fluid cells. real(wp), dimension(3) :: norm !< Normal vector from GP to IP real(wp), dimension(3) :: physical_loc !< Physical loc of GP @@ -164,7 +166,6 @@ contains type(ghost_point) :: innerp ! 
set the Moving IBM interior conservative variables - $:GPU_PARALLEL_LOOP(private='[i, j, k, patch_id, rho]', copyin='[E_idx, momxb]', collapse=3) do l = 0, p do k = 0, n @@ -240,10 +241,11 @@ contains q_prim_vf(E_idx)%sf(j, k, l) = 0._wp $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids - ! Pressure correction for moving IB: accounts for acceleration of IB surface + ! Se the pressure inside a moving immersed boundary based upon the pressure of the image point. + ! acceleration, and normal vector direction q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, & & l) + pres_IP/(1._wp - 2._wp*abs(gp%levelset*alpha_rho_IP(q)/pres_IP) & - & *dot_product(patch_ib(patch_id) %force/patch_ib(patch_id)%mass, gp%levelset_norm)) + & *dot_product(patch_ib(patch_id)%force/patch_ib(patch_id)%mass, gp%levelset_norm)) end do end if @@ -371,7 +373,8 @@ contains end subroutine s_ibm_correct_state - !> Compute the image points for each ghost point + !> Function that computes the image points for each ghost point + !! 
@param ghost_points_in Ghost Points impure subroutine s_compute_image_points(ghost_points_in) type(ghost_point), dimension(num_gps), intent(inout) :: ghost_points_in @@ -384,7 +387,7 @@ contains type(ghost_point) :: gp integer :: q, dim !< Iterator variables integer :: i, j, k, l !< Location indexes - integer :: patch_id !< IB Patch ID + integer :: patch_id integer :: dir integer :: index logical :: bounds_error @@ -450,12 +453,12 @@ contains print *, [x_cc(i), y_cc(j), z_cc(k)] end if print *, "We are searching in dimension ", dim, " for image point at ", ghost_points_in(q)%ip_loc(:) - print *, "Domain size: ", [x_cc(-buff_size), y_cc(-buff_size), z_cc(-buff_size)] + print *, "Domain size: " print *, "x: ", x_cc(-buff_size), " to: ", x_cc(m + buff_size - 1) print *, "y: ", y_cc(-buff_size), " to: ", y_cc(n + buff_size - 1) if (p /= 0) print *, "z: ", z_cc(-buff_size), " to: ", z_cc(p + buff_size - 1) print *, "Image point is located approximately ", & - & (ghost_points_in(q)%loc(dim) - ghost_points_in(q) %ip_loc(dim))/(s_cc(1) - s_cc(0)), & + & (ghost_points_in(q)%loc(dim) - ghost_points_in(q)%ip_loc(dim))/(s_cc(1) - s_cc(0)), & & " grid cells away" print *, "Levelset ", dist, " and Norm: ", norm(:) print *, & @@ -480,7 +483,7 @@ contains end subroutine s_compute_image_points - !> Count the number of ghost points for memory allocation + !> Subroutine that finds the number of ghost points, used for allocating memory. 
subroutine s_find_num_ghost_points(num_gps_out) integer, intent(out) :: num_gps_out @@ -525,7 +528,7 @@ contains end subroutine s_find_num_ghost_points - !> Locate all ghost points in the domain + !> Function that finds the ghost points subroutine s_find_ghost_points(ghost_points_in) type(ghost_point), dimension(num_gps), intent(inout) :: ghost_points_in @@ -541,8 +544,7 @@ contains if (p == 0) gp_layers_z = 0 $:GPU_PARALLEL_LOOP(private='[i, j, k, ii, jj, kk, is_gp, local_idx, patch_id, encoded_patch_id, xp, yp, zp]', & - & copyin='[count, count_i, x_domain, y_domain, z_domain]', firstprivate='[gp_layers, gp_layers_z]', & - & collapse=3) + & copyin='[count, count_i, glb_bounds]', firstprivate='[gp_layers, gp_layers_z]', collapse=3) do i = 0, m do j = 0, n do k = 0, p @@ -562,7 +564,7 @@ contains if (is_gp) then $:GPU_ATOMIC(atomic='capture') - count = count + 1 + count = count + 1 ! increment the count of total cells in the boundary local_idx = count $:END_GPU_ATOMIC_CAPTURE() @@ -576,26 +578,26 @@ contains ghost_points_in(local_idx)%z_periodicity = zp ghost_points_in(local_idx)%slip = patch_ib(patch_id)%slip - if ((x_cc(i) - dx(i)) < x_domain%beg) then + if ((x_cc(i) - dx(i)) < glb_bounds(1)%beg) then ghost_points_in(local_idx)%DB(1) = -1 - else if ((x_cc(i) + dx(i)) > x_domain%end) then + else if ((x_cc(i) + dx(i)) > glb_bounds(1)%end) then ghost_points_in(local_idx)%DB(1) = 1 else ghost_points_in(local_idx)%DB(1) = 0 end if - if ((y_cc(j) - dy(j)) < y_domain%beg) then + if ((y_cc(j) - dy(j)) < glb_bounds(2)%beg) then ghost_points_in(local_idx)%DB(2) = -1 - else if ((y_cc(j) + dy(j)) > y_domain%end) then + else if ((y_cc(j) + dy(j)) > glb_bounds(2)%end) then ghost_points_in(local_idx)%DB(2) = 1 else ghost_points_in(local_idx)%DB(2) = 0 end if if (p /= 0) then - if ((z_cc(k) - dz(k)) < z_domain%beg) then + if ((z_cc(k) - dz(k)) < glb_bounds(3)%beg) then ghost_points_in(local_idx)%DB(3) = -1 - else if ((z_cc(k) + dz(k)) > z_domain%end) then + else if ((z_cc(k) + 
dz(k)) > glb_bounds(3)%end) then ghost_points_in(local_idx)%DB(3) = 1 else ghost_points_in(local_idx)%DB(3) = 0 @@ -610,7 +612,7 @@ contains end subroutine s_find_ghost_points - !> Compute the interpolation coefficients for image points + !> Function that computes the interpolation coefficients of image points subroutine s_compute_interpolation_coeffs(ghost_points_in) type(ghost_point), dimension(num_gps), intent(inout) :: ghost_points_in @@ -715,10 +717,27 @@ contains end subroutine s_compute_interpolation_coeffs - !> Interpolate primitive variables to a ghost point's image point using bilinear or trilinear interpolation + !> Function that uses the interpolation coefficients and the current state at the cell centers in order to estimate the state at + !! the image point + !! @param gp Ghost point data structure + !> @brief Interpolates primitive variables from the fluid domain to a ghost point's image point using bilinear or trilinear + !! interpolation. + !! @param alpha_rho_IP Partial density at image point + !! @param alpha_IP Volume fraction at image point + !! @param pres_IP Pressure at image point + !! @param vel_IP Velocity at image point + !! @param c_IP Speed of sound at image point + !! @param r_IP Bubble radius at image point + !! @param v_IP Bubble radial velocity at image point + !! @param pb_IP Bubble pressure at image point + !! @param mv_IP Bubble vapor mass at image point + !! @param nmom_IP Bubble moment at image point + !! @param pb_in Internal bubble pressure array + !! @param mv_in Mass of vapor in bubble array + !! @param presb_IP Bubble node pressure at image point + !! 
@param massv_IP Bubble node vapor mass at image point subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, & - - & nmom_IP, pb_in, mv_in, presb_IP, massv_IP) + & nmom_IP, pb_in, mv_in, presb_IP, massv_IP) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf !< Primitive Variables real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(in) :: pb_in, mv_in @@ -879,9 +898,11 @@ contains end subroutine s_update_mib - !> Compute pressure and viscous forces and torques on immersed bodies via volume integration + !> @brief Computes pressure and viscous forces and torques on immersed bodies via a volume integration method. subroutine s_compute_ib_forces(q_prim_vf, fluid_pp) + ! real(wp), dimension(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end), + ! intent(in) :: pressure type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf type(physical_parameters), dimension(1:num_fluids), intent(in) :: fluid_pp integer :: gp_id, i, j, k, l, q, ib_idx, fluid_idx @@ -967,7 +988,7 @@ contains viscous_stress_div(1,1:3) = (viscous_stress_div_2(1,1:3) - viscous_stress_div_1(1, & & 1:3))/(2._wp*dx) ! get x derivative of the first-row of viscous stress tensor local_force_contribution(1:3) = local_force_contribution(1:3) + viscous_stress_div(1, & - & 1:3) ! add the x components of the divergence to the force + & 1:3) ! 
add the y components of the divergence to the force call s_compute_viscous_stress_tensor(viscous_stress_div_1, q_prim_vf, dynamic_viscosity, i, j - 1, k) call s_compute_viscous_stress_tensor(viscous_stress_div_2, q_prim_vf, dynamic_viscosity, i, j + 1, k) @@ -1031,7 +1052,7 @@ contains end subroutine s_compute_ib_forces - !> Finalize the IBM module + !> Subroutine to deallocate memory reserved for the IBM module impure subroutine s_finalize_ibm_module() @:DEALLOCATE(ib_markers%sf) @@ -1093,13 +1114,14 @@ contains ! rotate the centroid offset back into the local coords of the IB patch_ib(ib_marker)%centroid_offset = matmul(patch_ib(ib_marker)%rotation_matrix_inverse, & & patch_ib(ib_marker)%centroid_offset) - else + else ! we do not have an analytic moment of inertia calculation and need to approximate it directly via a sum patch_ib(ib_marker)%centroid_offset(:) = [0._wp, 0._wp, 0._wp] end if end subroutine s_compute_centroid_offset !> Computes the moment of inertia for an immersed boundary + !! @param ib_marker Immersed boundary marker index subroutine s_compute_moment_of_inertia(ib_marker, axis) real(wp), dimension(3), intent(in) :: axis !< the axis about which we compute the moment. Only required in 3D. @@ -1114,7 +1136,7 @@ contains ! if the object is not actually rotating at this time, return a dummy value and exit patch_ib(ib_marker)%moment = 1._wp return - else + else ! we do not have an analytic moment of inertia calculation and need to approximate it directly via a sum normal_axis = axis/sqrt(sum(axis)) end if @@ -1132,8 +1154,8 @@ contains else ! we do not have an analytic moment of inertia calculation and need to approximate it directly via a sum count = 0 moment = 0._wp - cell_volume = (x_cc(1) - x_cc(0))*(y_cc(1) - y_cc(0)) - ! computed without grid stretching. Update in the loop to perform with stretching + cell_volume = (x_cc(1) - x_cc(0))*(y_cc(1) - y_cc(0)) & + & ! computed without grid stretching. 
Update in the loop to perform with stretching if (p /= 0) then cell_volume = cell_volume*(z_cc(1) - z_cc(0)) end if @@ -1177,47 +1199,35 @@ contains end subroutine s_compute_moment_of_inertia - !> Wrap immersed boundary positions across periodic domain boundaries + !> @brief Checks for periodic boundary conditions in all directions, and if so, moves patch location if it left the domain subroutine s_wrap_periodic_ibs() integer :: patch_id do patch_id = 1, num_ibs ! check domain wraps in x, y, - #:for X in [('x'), ('y')] - ! check for periodicity - if (bc_${X}$%beg == BC_PERIODIC) then - ! check if the boundary has left the domain, and then correct - if (patch_ib(patch_id)%${X}$_centroid < ${X}$_domain%beg) then - ! if the boundary exited "left", wrap it back around to the "right" - patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid + (${X}$_domain%end & - & - ${X}$_domain%beg) - else if (patch_ib(patch_id)%${X}$_centroid > ${X}$_domain%end) then - ! if the boundary exited "right", wrap it back around to the "left" - patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid - (${X}$_domain%end & - & - ${X}$_domain%beg) + #:for X, ID in [('x', 1), ('y', 2), ('z',3)] + if (num_dims >= ${ID}$) then + ! check for periodicity + if (bc_${X}$%beg == BC_PERIODIC) then + ! check if the boundary has left the domain, and then correct + if (patch_ib(patch_id)%${X}$_centroid < glb_bounds(${ID}$)%beg) then + ! if the boundary exited "left", wrap it back around to the "right" + patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid + (glb_bounds(${ID}$)%end & + & - glb_bounds(${ID}$)%beg) + else if (patch_ib(patch_id)%${X}$_centroid > glb_bounds(${ID}$)%end) then + ! if the boundary exited "right", wrap it back around to the "left" + patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid - (glb_bounds(${ID}$)%end & + & - glb_bounds(${ID}$)%beg) + end if end if end if #:endfor - - if (p /= 0) then - ! 
check for periodicity - if (bc_z%beg == BC_PERIODIC) then - ! check if the boundary has left the domain, and then correct - if (patch_ib(patch_id)%z_centroid < z_domain%beg) then - ! if the boundary exited "left", wrap it back around to the "right" - patch_ib(patch_id)%z_centroid = patch_ib(patch_id)%z_centroid + (z_domain%end - z_domain%beg) - else if (patch_ib(patch_id)%z_centroid > z_domain%end) then - ! if the boundary exited "right", wrap it back around to the "left" - patch_ib(patch_id)%z_centroid = patch_ib(patch_id)%z_centroid - (z_domain%end - z_domain%beg) - end if - end if - end if end do end subroutine s_wrap_periodic_ibs - !> Compute the cross product c = a x b of two 3D vectors + !> @brief Computes the cross product c = a x b of two 3D vectors. subroutine s_cross_product(a, b, c) $:GPU_ROUTINE(parallelism='[seq]') diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index a2b6c84326..880b596ca6 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -22,14 +22,37 @@ module m_mpi_proxy implicit none + !> This variable is utilized to pack and send the buffer of the immersed boundary markers, for a single computational domain + !! boundary at the time, to the relevant neighboring processor. integer, private, allocatable, dimension(:) :: ib_buff_send !< IB marker send buffer for halo exchange + + !> q_cons_buff_recv is utilized to receive and unpack the buffer of the immersed boundary markers, for a single computational + !! domain boundary at the time, from the relevant neighboring processor. integer, private, allocatable, dimension(:) :: ib_buff_recv !< IB marker receive buffer for halo exchange integer :: i_halo_size $:GPU_DECLARE(create='[i_halo_size]') + integer, dimension(-1:1,-1:1,-1:1) :: p_send_counts, p_recv_counts + integer, dimension(:,:,:,:), allocatable :: p_send_ids + character(len=1), dimension(:), allocatable :: p_send_buff, p_recv_buff + integer :: p_buff_size, p_var_size + !! 
EL Bubbles communication variables + integer, parameter :: MAX_NEIGHBORS = 27 + integer :: send_requests(MAX_NEIGHBORS), recv_requests(MAX_NEIGHBORS) + integer :: recv_offsets(MAX_NEIGHBORS) + integer :: neighbor_list(MAX_NEIGHBORS, 3) + integer :: n_neighbors + $:GPU_DECLARE(create='[p_send_counts]') + integer, allocatable :: force_send_counts(:), force_recv_counts(:) + integer, allocatable :: force_send_ids(:,:) + integer, allocatable :: flat_send_ids(:) + real(wp), allocatable :: force_send_vals(:,:,:) + real(wp), allocatable :: flat_send_vals(:) + $:GPU_DECLARE(create='[force_send_counts, force_send_ids, force_send_vals]') + contains - !> Initialize the MPI proxy module + !> @brief Allocates immersed boundary communication buffers for MPI halo exchanges. subroutine s_initialize_mpi_proxy_module() #ifdef MFC_MPI @@ -52,6 +75,81 @@ contains end subroutine s_initialize_mpi_proxy_module + !! This subroutine initializes the MPI buffers and variables required for the particle communication. + !! @param lag_num_ts Number of stages in time-stepping scheme + subroutine s_initialize_particles_mpi(lag_num_ts) + + integer :: i, j, k + integer :: real_size, int_size, nReal, lag_num_ts + integer :: ierr !< Generic flag used to identify and report MPI errors + +#ifdef MFC_MPI + call MPI_Pack_size(1, mpi_p, MPI_COMM_WORLD, real_size, ierr) + call MPI_Pack_size(1, MPI_INTEGER, MPI_COMM_WORLD, int_size, ierr) + nReal = 7 + 16*2 + 10*lag_num_ts + p_var_size = nReal*real_size + int_size + p_buff_size = lag_params%nBubs_glb*p_var_size + @:ALLOCATE(p_send_buff(0:p_buff_size), p_recv_buff(0:p_buff_size)) + @:ALLOCATE(p_send_ids(nidx(1)%beg:nidx(1)%end, nidx(2)%beg:nidx(2)%end, nidx(3)%beg:nidx(3)%end, 0:lag_params%nBubs_glb)) + ! 
First, collect all neighbor information + n_neighbors = 0 + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + if (abs(i) + abs(j) + abs(k) /= 0) then + n_neighbors = n_neighbors + 1 + neighbor_list(n_neighbors, 1) = i + neighbor_list(n_neighbors, 2) = j + neighbor_list(n_neighbors, 3) = k + end if + end do + end do + end do +#endif + + end subroutine s_initialize_particles_mpi + + !! This subroutine initializes the MPI buffers and variables required for the particle communication. + !! @param lag_num_ts Number of stages in time-stepping scheme + subroutine s_initialize_solid_particles_mpi(lag_num_ts) + + integer :: i, j, k + integer :: real_size, int_size, nReal, lag_num_ts + integer :: ierr !< Generic flag used to identify and report MPI errors + +#ifdef MFC_MPI + call MPI_Pack_size(1, mpi_p, MPI_COMM_WORLD, real_size, ierr) + call MPI_Pack_size(1, MPI_INTEGER, MPI_COMM_WORLD, int_size, ierr) + nReal = 7 + 13*2 + 7*lag_num_ts + p_var_size = (nReal*real_size + int_size) + p_buff_size = lag_params%nParticles_glb*p_var_size + @:ALLOCATE(p_send_buff(0:p_buff_size), p_recv_buff(0:p_buff_size)) + @:ALLOCATE(p_send_ids(nidx(1)%beg:nidx(1)%end, nidx(2)%beg:nidx(2)%end, nidx(3)%beg:nidx(3)%end, & + & 0:lag_params%nParticles_glb)) + ! 
First, collect all neighbor information + n_neighbors = 0 + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + if (abs(i) + abs(j) + abs(k) /= 0) then + n_neighbors = n_neighbors + 1 + neighbor_list(n_neighbors, 1) = i + neighbor_list(n_neighbors, 2) = j + neighbor_list(n_neighbors, 3) = k + end if + end do + end do + end do + @:ALLOCATE(force_send_counts(0:num_procs-1)) + @:ALLOCATE(force_recv_counts(0:num_procs-1)) + @:ALLOCATE(force_send_ids(0:num_procs-1, 1:lag_params%nParticles_glb)) + @:ALLOCATE(force_send_vals(0:num_procs-1, 1:lag_params%nParticles_glb, 1:3)) + @:ALLOCATE(flat_send_ids(1:lag_params%nParticles_glb)) + @:ALLOCATE(flat_send_vals(1:3*lag_params%nParticles_glb)) +#endif + + end subroutine s_initialize_solid_particles_mpi + !> Since only the processor with rank 0 reads and verifies the consistency of user inputs, these are initially not available to !! the other processors. Then, the purpose of this subroutine is to distribute the user inputs to the remaining processors in !! the communicator. 
@@ -92,8 +190,8 @@ contains & 'bc_z%grcbc_in', 'bc_z%grcbc_out', 'bc_z%grcbc_vel_out', & & 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension', & & 'shear_stress', 'bulk_stress', 'bubbles_lagrange', & - & 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt', & - & 'hyper_cleaning', 'ib_state_wrt'] + & 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt', & + & 'hyper_cleaning', 'ib_state_wrt', 'particles_lagrange' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -109,17 +207,40 @@ contains if (bubbles_lagrange) then #:for VAR in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', & - & 'write_bubbles', 'write_bubbles_stats'] + & 'write_bubbles', 'write_bubbles_stats', 'write_void_evol', 'pressure_force', & + & 'gravity_force'] + call MPI_BCAST(lag_params%${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #:endfor + + #:for VAR in ['solver_approach', 'cluster_type', 'smooth_type', 'nBubs_glb', 'vel_model', & + & 'drag_model', 'charNz'] + call MPI_BCAST(lag_params%${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + #:endfor + + #:for VAR in ['epsilonb','charwidth','valmaxvoid'] + call MPI_BCAST(lag_params%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + + call MPI_BCAST(lag_params%input_path, len(lag_params%input_path), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + end if + + if (particles_lagrange) then + #:for VAR in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', & + & 'write_bubbles', 'write_bubbles_stats', 'write_void_evol', 'pressure_force', & + & 'gravity_force', 'collision_force'] call MPI_BCAST(lag_params%${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor - #:for VAR in ['solver_approach', 'cluster_type', 'smooth_type', 'nBubs_glb'] + #:for VAR in ['solver_approach', 'cluster_type', 'smooth_type', 'nParticles_glb', 'vel_model', & + & 'drag_model', 'qs_drag_model', 'stokes_drag', 'added_mass_model', 'interpolation_order'] call MPI_BCAST(lag_params%${VAR}$, 1, 
MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor #:for VAR in ['epsilonb','charwidth','valmaxvoid'] call MPI_BCAST(lag_params%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) #:endfor + + call MPI_BCAST(lag_params%input_path, len(lag_params%input_path), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) end if #:for VAR in [ 'dt','weno_eps','teno_CT','pref','rhoref','R0ref','Web','Ca', 'sigma', & @@ -128,9 +249,7 @@ contains & 'bc_y%vb1','bc_y%vb2','bc_y%vb3','bc_y%ve1','bc_y%ve2','bc_y%ve3', & & 'bc_z%vb1','bc_z%vb2','bc_z%vb3','bc_z%ve1','bc_z%ve2','bc_z%ve3', & & 'bc_x%pres_in','bc_x%pres_out','bc_y%pres_in','bc_y%pres_out', 'bc_z%pres_in','bc_z%pres_out', & - & 'x_domain%beg', 'x_domain%end', 'y_domain%beg', 'y_domain%end', & - & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', & - & 'z_b', 't_stop', 't_save', 'cfl_target', 'Bx0', 'alf_factor', & + & 't_stop', 't_save', 'cfl_target', 'Bx0', 'alf_factor', & & 'tau_star', 'cont_damage_s', 'alpha_bar', 'adap_dt_tol', & & 'ic_eps', 'ic_beta', 'hyper_cleaning_speed', & & 'hyper_cleaning_tau' ] @@ -179,6 +298,12 @@ contains #:endfor end if + if (particles_lagrange) then + #:for VAR in [ 'rho0ref_particle','cp_particle'] + call MPI_BCAST(particle_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + do i = 1, num_fluids_max #:for VAR in ['bc_x%alpha_rho_in','bc_x%alpha_in','bc_y%alpha_rho_in','bc_y%alpha_in','bc_z%alpha_rho_in', & & 'bc_z%alpha_in'] @@ -233,11 +358,1096 @@ contains call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + ! 
Extra BC Variable + call MPI_BCAST(periodic_bc, 3, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #endif end subroutine s_mpi_bcast_user_inputs - !> Broadcast random phase numbers from rank 0 to all MPI processes + !> @brief Packs, exchanges, and unpacks immersed boundary marker buffers between neighboring MPI ranks. + subroutine s_mpi_sendrecv_ib_buffers(ib_markers, mpi_dir, pbc_loc) + + type(integer_field), intent(inout) :: ib_markers + integer, intent(in) :: mpi_dir, pbc_loc + integer :: i, j, k, l, r, q !< Generic loop iterators + integer :: buffer_counts(1:3), buffer_count + type(int_bounds_info) :: boundary_conditions(1:3) + integer :: beg_end(1:2), grid_dims(1:3) + integer :: dst_proc, src_proc, recv_tag, send_tag + logical :: beg_end_geq_0, qbmm_comm + integer :: pack_offset, unpack_offset + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + call nvtxStartRange("IB-MARKER-COMM-PACKBUF") + + buffer_counts = (/buff_size*(n + 1)*(p + 1), buff_size*(m + 2*buff_size + 1)*(p + 1), & + & buff_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1)/) + + buffer_count = buffer_counts(mpi_dir) + boundary_conditions = (/bc_x, bc_y, bc_z/) + beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/) + beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0 + + ! Implements: pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg] -1 (=0) 1 + ! -> [0,0] [1,0] | 0 1 [0,0] [end,beg] +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end] +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] + ! [beg,end] + + send_tag = f_logical_to_int(.not. 
f_xor(beg_end_geq_0, pbc_loc == 1)) + recv_tag = f_logical_to_int(pbc_loc == 1) + + dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0))) + src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1)) + + grid_dims = (/m, n, p/) + + pack_offset = 0 + if (f_xor(pbc_loc == 1, beg_end_geq_0)) then + pack_offset = grid_dims(mpi_dir) - buff_size + 1 + end if + + unpack_offset = 0 + if (pbc_loc == 1) then + unpack_offset = grid_dims(mpi_dir) + buff_size + 1 + end if + + ! Pack Buffer to Send + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j, k, l, r]') + do l = 0, p + do k = 0, n + do j = 0, buff_size - 1 + r = (j + buff_size*(k + (n + 1)*l)) + ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j, k, l, r]') + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)*(k + buff_size*l)) + ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + $:GPU_PARALLEL_LOOP(collapse=3,private='[j, k, l, r]') + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*l)) + ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange ! Packbuf + + #:for rdma_mpi in [False, True] + if (rdma_mpi .eqv. ${'.true.' 
if rdma_mpi else '.false.'}$) then + #:if rdma_mpi + #:call GPU_HOST_DATA(use_device_addr='[ib_buff_send, ib_buff_recv]') + call nvtxStartRange("IB-MARKER-SENDRECV-RDMA") + call MPI_SENDRECV(ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, ib_buff_recv, & + & buffer_count, MPI_INTEGER, src_proc, recv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + call nvtxEndRange + #:endcall GPU_HOST_DATA + $:GPU_WAIT() + #:else + call nvtxStartRange("IB-MARKER-DEV2HOST") + $:GPU_UPDATE(host='[ib_buff_send]') + call nvtxEndRange + + call nvtxStartRange("IB-MARKER-SENDRECV-NO-RMDA") + call MPI_SENDRECV(ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, ib_buff_recv, buffer_count, & + & MPI_INTEGER, src_proc, recv_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + call nvtxEndRange + + call nvtxStartRange("IB-MARKER-HOST2DEV") + $:GPU_UPDATE(device='[ib_buff_recv]') + call nvtxEndRange + #:endif + end if + #:endfor + + ! Unpack Received Buffer + call nvtxStartRange("IB-MARKER-COMM-UNPACKBUF") + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j, k, l, r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + r = (j + buff_size*((k + 1) + (n + 1)*l)) + ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j, k, l, r]') + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + buff_size*l)) + ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + ! 
Unpacking buffer from bc_z%beg + $:GPU_PARALLEL_LOOP(collapse=3,private='[j, k, l, r]') + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)*((k + buff_size) + (n + 2*buff_size + 1)*(l & + & + buff_size))) + ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange +#endif + + end subroutine s_mpi_sendrecv_ib_buffers + + !> This subroutine adds particles to the transfer list for the MPI communication. + !! @param nPart Current LOCAL number of particles + !! @param pos Current position of each particle + !! @param posPrev Previous position of each particle (optional, not used + !! for communication of initial condition) + impure subroutine s_add_particles_to_transfer_list(nBub, pos, posPrev, include_ghost) + + real(wp), dimension(:,:) :: pos, posPrev + integer :: bubID, nbub + integer :: i, j, k + logical, optional, intent(in) :: include_ghost + + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + p_send_counts(i, j, k) = 0 + end do + end do + end do + + do k = 1, nbub + if (f_crosses_boundary(k, 1, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 0, 0) + if (n > 0) then + if (f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, -1, 0) + call s_add_particle_to_direction(k, 0, -1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, -1, -1) + call s_add_particle_to_direction(k, 0, -1, -1) + call s_add_particle_to_direction(k, -1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, -1, 1) + call s_add_particle_to_direction(k, 0, -1, 1) + call 
s_add_particle_to_direction(k, -1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else if (f_crosses_boundary(k, 2, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 1, 0) + call s_add_particle_to_direction(k, 0, 1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 1, -1) + call s_add_particle_to_direction(k, 0, 1, -1) + call s_add_particle_to_direction(k, -1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 1, 1) + call s_add_particle_to_direction(k, 0, 1, 1) + call s_add_particle_to_direction(k, -1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + end if + end if + else if (f_crosses_boundary(k, 1, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 0, 0) + if (n > 0) then + if (f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, -1, 0) + call s_add_particle_to_direction(k, 0, -1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, -1, -1) + call s_add_particle_to_direction(k, 0, -1, -1) + call s_add_particle_to_direction(k, 1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, -1, 1) + call s_add_particle_to_direction(k, 0, -1, 1) + call 
s_add_particle_to_direction(k, 1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else if (f_crosses_boundary(k, 2, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 1, 0) + call s_add_particle_to_direction(k, 0, 1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 1, -1) + call s_add_particle_to_direction(k, 0, 1, -1) + call s_add_particle_to_direction(k, 1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 1, 1) + call s_add_particle_to_direction(k, 0, 1, 1) + call s_add_particle_to_direction(k, 1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + end if + end if + else if (f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, -1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, -1, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, -1, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else if (f_crosses_boundary(k, 2, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 1, -1) + call 
s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 1, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 0, -1) + else if (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + end do + + contains ! f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost + + logical function f_crosses_boundary(particle_id, dir, loc, pos, posPrev, include_ghost) + + integer, intent(in) :: particle_id, dir, loc + real(wp), dimension(:,:), intent(in) :: pos + real(wp), dimension(:,:), optional, intent(in) :: posPrev + logical, optional, intent(in) :: include_ghost + + if (present(include_ghost) .and. include_ghost) then + if (loc == -1) then ! Beginning of the domain + if (nidx(dir)%beg == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = pos(particle_id, dir) < pcomm_coords_ghost(dir)%beg + else if (loc == 1) then ! End of the domain + if (nidx(dir)%end == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = pos(particle_id, dir) > pcomm_coords_ghost(dir)%end + end if + else + if (loc == -1) then ! Beginning of the domain + if (nidx(dir)%beg == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = (posPrev(particle_id, dir) >= pcomm_coords(dir)%beg .and. pos(particle_id, & + & dir) < pcomm_coords(dir)%beg) + else if (loc == 1) then ! End of the domain + if (nidx(dir)%end == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = (posPrev(particle_id, dir) <= pcomm_coords(dir)%end .and. 
pos(particle_id, & + & dir) > pcomm_coords(dir)%end) + end if + end if + + end function f_crosses_boundary + + subroutine s_add_particle_to_direction(particle_id, dir_x, dir_y, dir_z) + + integer, intent(in) :: particle_id, dir_x, dir_y, dir_z + + p_send_ids(dir_x, dir_y, dir_z, p_send_counts(dir_x, dir_y, dir_z)) = particle_id + p_send_counts(dir_x, dir_y, dir_z) = p_send_counts(dir_x, dir_y, dir_z) + 1 + + end subroutine s_add_particle_to_direction + + end subroutine s_add_particles_to_transfer_list + + !> This subroutine performs the MPI communication for lagrangian particles/ bubbles. + !! @param bub_R0 Initial radius of each bubble + !! @param Rmax_stats Maximum radius of each bubble + !! @param Rmin_stats Minimum radius of each bubble + !! @param gas_mg Mass of gas in each bubble + !! @param gas_betaT Heat flux model coefficient for each bubble + !! @param gas_betaC mass flux model coefficient for each bubble + !! @param bub_dphidt Subgrid velocity potential for each bubble + !! @param lag_id Global and local ID of each bubble + !! @param gas_p Pressure of the gas in each bubble + !! @param gas_mv Mass of vapor in each bubble + !! @param rad Radius of each bubble + !! @param rvel Radial velocity of each bubble + !! @param pos Position of each bubble + !! @param posPrev Previous position of each bubble + !! @param vel Velocity of each bubble + !! @param scoord Cell index in real format of each bubble + !! @param drad Radial velocity of each bubble + !! @param drvel Radial acceleration of each bubble + !! @param dgasp Time derivative of gas pressure in each bubble + !! @param dgasmv Time derivative of vapor mass in each bubble + !! @param dpos Time derivative of position of each bubble + !! @param dvel Time derivative of velocity of each bubble + !! @param lag_num_ts Number of stages in time-stepping scheme + !! 
@param nBubs Local number of bubbles + impure subroutine s_mpi_sendrecv_particles(bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, lag_id, & + & gas_p, gas_mv, rad, rvel, pos, posPrev, vel, scoord, drad, drvel, dgasp, dgasmv, dpos, dvel, lag_num_ts, nbubs, dest) + + real(wp), dimension(:) :: bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt + integer, dimension(:,:) :: lag_id + real(wp), dimension(:,:) :: gas_p, gas_mv, rad, rvel, drad, drvel, dgasp, dgasmv + real(wp), dimension(:,:,:) :: pos, posPrev, vel, scoord, dpos, dvel + integer :: position, bub_id, lag_num_ts, tag, partner, send_tag, recv_tag, nbubs, p_recv_size, dest + integer :: i, j, k, l, q, r + integer :: req_send, req_recv, ierr !< Generic flag used to identify and report MPI errors + integer :: send_count, send_offset, recv_count, recv_offset + +#ifdef MFC_MPI + ! Phase 1: Exchange particle counts using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives first + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_counts(i, j, k), 1, MPI_INTEGER, partner, recv_tag, MPI_COMM_WORLD, recv_requests(recv_count), & + & ierr) + end do + + ! Post all sends + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + send_count = send_count + 1 + call MPI_Isend(p_send_counts(i, j, k), 1, MPI_INTEGER, partner, send_tag, MPI_COMM_WORLD, send_requests(send_count), & + & ierr) + end do + + ! 
Wait for all count exchanges to complete + if (recv_count > 0) then + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + end if + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if + + ! Phase 2: Exchange particle data using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives for particle data first + recv_offset = 1 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0) then + partner = neighbor_ranks(i, j, k) + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_buff(recv_offset), p_recv_size, MPI_PACKED, partner, recv_tag, MPI_COMM_WORLD, & + & recv_requests(recv_count), ierr) + recv_offsets(l) = recv_offset + recv_offset = recv_offset + p_recv_size + end if + end do + + ! Pack and send particle data + send_offset = 0 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_send_counts(i, j, k) > 0 .and. abs(i) + abs(j) + abs(k) /= 0 .and. abs(i) + abs(j) + abs(k) /= 0) then + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + ! 
Pack data for sending + position = 0 + do q = 0, p_send_counts(i, j, k) - 1 + bub_id = p_send_ids(i, j, k, q) + + call MPI_Pack(lag_id(bub_id, 1), 1, MPI_INTEGER, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(bub_R0(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmax_stats(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(Rmin_stats(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(gas_mg(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(gas_betaT(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(gas_betaC(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(bub_dphidt(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + do r = 1, 2 + call MPI_Pack(gas_p(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(gas_mv(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(rad(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(rvel(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(pos(bub_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(posPrev(bub_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(vel(bub_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(scoord(bub_id,:,r), 3, mpi_p, 
p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + end do + do r = 1, lag_num_ts + call MPI_Pack(drad(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + call MPI_Pack(drvel(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(dgasp(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(dgasmv(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(dpos(bub_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(dvel(bub_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + end do + end do + + send_count = send_count + 1 + call MPI_Isend(p_send_buff(send_offset), position, MPI_PACKED, partner, send_tag, MPI_COMM_WORLD, & + & send_requests(send_count), ierr) + send_offset = send_offset + position + end if + end do + + ! Wait for all recvs for contiguous data to complete + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + + ! Process received data as it arrives + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0 .and. abs(i) + abs(j) + abs(k) /= 0) then + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_offset = recv_offsets(l) + + position = 0 + ! 
Unpack received data + do q = 0, p_recv_counts(i, j, k) - 1 + nbubs = nbubs + 1 + bub_id = nbubs + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, lag_id(bub_id, 1), 1, MPI_INTEGER, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, bub_R0(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmax_stats(bub_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmin_stats(bub_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_mg(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaT(bub_id), 1, mpi_p, MPI_COMM_WORLD, & + & ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaC(bub_id), 1, mpi_p, MPI_COMM_WORLD, & + & ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, bub_dphidt(bub_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + do r = 1, 2 + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_p(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_mv(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rad(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rvel(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, pos(bub_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, posPrev(bub_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, vel(bub_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), 
p_recv_size, position, scoord(bub_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + end do + do r = 1, lag_num_ts + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drad(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drvel(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasp(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasmv(bub_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dpos(bub_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dvel(bub_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + end do + lag_id(bub_id, 2) = bub_id + end do + recv_offset = recv_offset + p_recv_size + end if + end do + + ! Wait for all sends to complete + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if +#endif + + if (any(periodic_bc)) then + call s_wrap_particle_positions(pos, posPrev, nbubs, dest) + end if + + end subroutine s_mpi_sendrecv_particles + + !> This subroutine performs the MPI communication for lagrangian particles/ particles. + !! @param particle_R0 Initial radius of each particle + !! @param Rmax_stats Maximum radius of each particle + !! @param Rmin_stats Minimum radius of each particle + !! @param particle_mass Mass of each particle + !! @param f_p Force on each particle + !! @param lag_id Global and local ID of each particle + !! @param rad Radius of each particle + !! @param pos Position of each particle + !! @param posPrev Previous position of each particle + !! @param vel Velocity of each particle + !! @param scoord Cell index in real format of each particle + !! @param drad Time derivative of particle's radius + !! 
@param dpos Time derivative of position of each particle + !! @param dvel Time derivative of velocity of each particle + !! @param lag_num_ts Number of stages in time-stepping scheme + !! @param nParticles Local number of particles + impure subroutine s_mpi_sendrecv_solid_particles(p_owner_rank, particle_R0, Rmax_stats, Rmin_stats, particle_mass, f_p, & + & lag_id, rad, pos, posPrev, vel, scoord, drad, dpos, dvel, lag_num_ts, nParticles, dest) + + integer, dimension(:) :: p_owner_rank + real(wp), dimension(:) :: particle_R0, Rmax_stats, Rmin_stats, particle_mass + real(wp), dimension(:,:) :: f_p + integer, dimension(:,:) :: lag_id + real(wp), dimension(:,:) :: rad, drad + real(wp), dimension(:,:,:) :: pos, posPrev, vel, scoord, dpos, dvel + integer :: position, particle_id, lag_num_ts, tag, partner, send_tag, recv_tag, nParticles, p_recv_size, dest + integer :: i, j, k, l, q, r + integer :: req_send, req_recv, ierr !< Generic flag used to identify and report MPI errors + integer :: send_count, send_offset, recv_count, recv_offset + +#ifdef MFC_MPI + ! Phase 1: Exchange particle counts using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives first + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_counts(i, j, k), 1, MPI_INTEGER, partner, recv_tag, MPI_COMM_WORLD, recv_requests(recv_count), & + & ierr) + end do + + ! Post all sends + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + send_count = send_count + 1 + call MPI_Isend(p_send_counts(i, j, k), 1, MPI_INTEGER, partner, send_tag, MPI_COMM_WORLD, send_requests(send_count), & + & ierr) + end do + + ! 
Wait for all count exchanges to complete + if (recv_count > 0) then + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + end if + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if + + ! Phase 2: Exchange particle data using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives for particle data first + recv_offset = 1 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0) then + partner = neighbor_ranks(i, j, k) + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_buff(recv_offset), p_recv_size, MPI_PACKED, partner, recv_tag, MPI_COMM_WORLD, & + & recv_requests(recv_count), ierr) + recv_offsets(l) = recv_offset + recv_offset = recv_offset + p_recv_size + end if + end do + + ! Pack and send particle data + send_offset = 0 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_send_counts(i, j, k) > 0 .and. abs(i) + abs(j) + abs(k) /= 0 .and. abs(i) + abs(j) + abs(k) /= 0) then + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + ! 
Pack data for sending + position = 0 + do q = 0, p_send_counts(i, j, k) - 1 + particle_id = p_send_ids(i, j, k, q) + + call MPI_Pack(lag_id(particle_id, 1), 1, MPI_INTEGER, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(particle_R0(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmax_stats(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmin_stats(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(particle_mass(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(f_p(particle_id,:), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, & + & ierr) + ! call MPI_Pack(gas_betaT(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, + ! MPI_COMM_WORLD, ierr) call MPI_Pack(gas_betaC(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, + ! position, MPI_COMM_WORLD, ierr) call MPI_Pack(bub_dphidt(particle_id), 1, mpi_p, p_send_buff(send_offset), + ! p_buff_size, position, MPI_COMM_WORLD, ierr) + do r = 1, 2 + ! call MPI_Pack(gas_p(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, + ! MPI_COMM_WORLD, ierr) call MPI_Pack(gas_mv(particle_id, r), 1, mpi_p, p_send_buff(send_offset), + ! p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(rad(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + ! call MPI_Pack(rvel(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, + ! 
MPI_COMM_WORLD, ierr) + call MPI_Pack(pos(particle_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(posPrev(particle_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(vel(particle_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(scoord(particle_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + end do + do r = 1, lag_num_ts + call MPI_Pack(drad(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + ! call MPI_Pack(drvel(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, + ! MPI_COMM_WORLD, ierr) call MPI_Pack(dgasp(particle_id, r), 1, mpi_p, p_send_buff(send_offset), + ! p_buff_size, position, MPI_COMM_WORLD, ierr) call MPI_Pack(dgasmv(particle_id, r), 1, mpi_p, + ! p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dpos(particle_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + call MPI_Pack(dvel(particle_id,:,r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, & + & MPI_COMM_WORLD, ierr) + end do + end do + + send_count = send_count + 1 + call MPI_Isend(p_send_buff(send_offset), position, MPI_PACKED, partner, send_tag, MPI_COMM_WORLD, & + & send_requests(send_count), ierr) + send_offset = send_offset + position + end if + end do + + ! Wait for all recvs for contiguous data to complete + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + + ! Process received data as it arrives + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0 .and. 
abs(i) + abs(j) + abs(k) /= 0) then + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_offset = recv_offsets(l) + + position = 0 + ! Unpack received data + do q = 0, p_recv_counts(i, j, k) - 1 + nParticles = nParticles + 1 + particle_id = nParticles + + p_owner_rank(particle_id) = neighbor_ranks(i, j, k) + + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, lag_id(particle_id, 1), 1, MPI_INTEGER, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, particle_R0(particle_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmax_stats(particle_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmin_stats(particle_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, particle_mass(particle_id), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, f_p(particle_id,:), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaT(particle_id), 1, mpi_p, + ! MPI_COMM_WORLD, ierr) call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaC(particle_id), + ! 1, mpi_p, MPI_COMM_WORLD, ierr) call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, + ! bub_dphidt(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + do r = 1, 2 + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_p(particle_id, r), 1, mpi_p, + ! MPI_COMM_WORLD, ierr) call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_mv(particle_id, + ! r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rad(particle_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rvel(particle_id, r), 1, mpi_p, + ! 
MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, pos(particle_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, posPrev(particle_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, vel(particle_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, scoord(particle_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + end do + do r = 1, lag_num_ts + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drad(particle_id, r), 1, mpi_p, & + & MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drvel(particle_id, r), 1, mpi_p, + ! MPI_COMM_WORLD, ierr) call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasp(particle_id, + ! r), 1, mpi_p, MPI_COMM_WORLD, ierr) call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, + ! dgasmv(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dpos(particle_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dvel(particle_id,:,r), 3, mpi_p, & + & MPI_COMM_WORLD, ierr) + end do + lag_id(particle_id, 2) = particle_id + end do + recv_offset = recv_offset + p_recv_size + end if + end do + + ! 
Wait for all sends to complete + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if +#endif + + if (any(periodic_bc)) then + call s_wrap_particle_positions(pos, posPrev, nParticles, dest) + end if + + end subroutine s_mpi_sendrecv_solid_particles + + !> This resets the collision force buffers + impure subroutine s_reset_force_buffers() + + force_send_counts = 0 + force_recv_counts = 0 + force_send_ids = 0 + force_send_vals = 0._wp + + $:GPU_UPDATE(device='[force_send_counts, force_send_ids, force_send_vals]') + + end subroutine s_reset_force_buffers + + !> This adds the forces to the buffer arrays for mpi transfer + impure subroutine s_add_force_to_send_buffer(dest_rank, gid, force) + + $:GPU_ROUTINE(function_name='s_add_force_to_send_buffer', parallelism='[seq]') + + integer, intent(in) :: dest_rank, gid + real(wp), intent(in), dimension(3) :: force + integer :: idx + + $:GPU_ATOMIC(atomic='capture') + force_send_counts(dest_rank) = force_send_counts(dest_rank) + 1 + idx = force_send_counts(dest_rank) + $:END_GPU_ATOMIC_CAPTURE() + + force_send_ids(dest_rank, idx) = gid + force_send_vals(dest_rank, idx, 1) = force(1) + force_send_vals(dest_rank, idx, 2) = force(2) + force_send_vals(dest_rank, idx, 3) = force(3) + + end subroutine s_add_force_to_send_buffer + + !> This communicates the collision forces across neighbor mpi ranks + impure subroutine s_transfer_collision_forces(total_recv, force_recv_ids, force_recv_vals) + + integer, intent(inout) :: total_recv + integer, intent(inout) :: force_recv_ids(:) + real(wp), intent(inout) :: force_recv_vals(:) + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + integer :: i, j, k, l, idx, total_send, recv_tag, send_tag, partner, recv_count, send_count + integer :: send_displs(0:num_procs - 1), recv_displs(0:num_procs - 1) + integer :: sendcounts_vals(0:num_procs - 1), recvcounts_vals(0:num_procs - 1) + integer :: 
senddispls_vals(0:num_procs - 1), recvdispls_vals(0:num_procs - 1) + ! Local request arrays sized for 2 requests per neighbor (IDs + values) + integer :: coll_send_requests(2*MAX_NEIGHBORS), coll_recv_requests(2*MAX_NEIGHBORS) + + $:GPU_UPDATE(host='[force_send_counts, force_send_ids, force_send_vals]') + + ! Phase 1: Exchange force counts with neighbors only + send_count = 0 + recv_count = 0 + + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + recv_count = recv_count + 1 + call MPI_Irecv(force_recv_counts(partner), 1, MPI_INTEGER, partner, recv_tag, MPI_COMM_WORLD, & + & recv_requests(recv_count), ierr) + + send_count = send_count + 1 + call MPI_Isend(force_send_counts(partner), 1, MPI_INTEGER, partner, send_tag, MPI_COMM_WORLD, & + & send_requests(send_count), ierr) + end do + + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + + ! Compute displacements + send_displs(0) = 0 + recv_displs(0) = 0 + do i = 1, num_procs - 1 + send_displs(i) = send_displs(i - 1) + force_send_counts(i - 1) + recv_displs(i) = recv_displs(i - 1) + force_recv_counts(i - 1) + end do + + do i = 0, num_procs - 1 + sendcounts_vals(i) = 3*force_send_counts(i) + recvcounts_vals(i) = 3*force_recv_counts(i) + senddispls_vals(i) = 3*send_displs(i) + recvdispls_vals(i) = 3*recv_displs(i) + end do + + total_send = sum(force_send_counts) + total_recv = sum(force_recv_counts) + + ! 
Flatten send buffers + idx = 1 + do i = 0, num_procs - 1 + do j = 1, force_send_counts(i) + flat_send_ids(idx) = force_send_ids(i, j) + flat_send_vals(3*(idx - 1) + 1) = force_send_vals(i, j, 1) + flat_send_vals(3*(idx - 1) + 2) = force_send_vals(i, j, 2) + flat_send_vals(3*(idx - 1) + 3) = force_send_vals(i, j, 3) + idx = idx + 1 + end do + end do + + ! Phase 2: Exchange force data with neighbors only + send_count = 0 + recv_count = 0 + + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + if (force_recv_counts(partner) > 0) then + recv_count = recv_count + 1 + call MPI_Irecv(force_recv_ids(recv_displs(partner) + 1), force_recv_counts(partner), MPI_INTEGER, partner, & + & recv_tag, MPI_COMM_WORLD, coll_recv_requests(recv_count), ierr) + recv_count = recv_count + 1 + call MPI_Irecv(force_recv_vals(recvdispls_vals(partner) + 1), recvcounts_vals(partner), mpi_p, partner, & + & recv_tag + 1, MPI_COMM_WORLD, coll_recv_requests(recv_count), ierr) + end if + + if (force_send_counts(partner) > 0) then + send_count = send_count + 1 + call MPI_Isend(flat_send_ids(send_displs(partner) + 1), force_send_counts(partner), MPI_INTEGER, partner, & + & send_tag, MPI_COMM_WORLD, coll_send_requests(send_count), ierr) + send_count = send_count + 1 + call MPI_Isend(flat_send_vals(senddispls_vals(partner) + 1), sendcounts_vals(partner), mpi_p, partner, & + & send_tag + 1, MPI_COMM_WORLD, coll_send_requests(send_count), ierr) + end if + end do + + call MPI_Waitall(recv_count, coll_recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + call MPI_Waitall(send_count, coll_send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) +#else + total_recv = 0 +#endif + + end subroutine s_transfer_collision_forces + + !! This function returns a unique tag for each neighbor based on its position relative to the current process. + !! 
@param i, j, k Indices of the neighbor in the range [-1, 1] + !! @return tag Unique integer tag for the neighbor + integer function neighbor_tag(i, j, k) result(tag) + + integer, intent(in) :: i, j, k + + tag = (k + 1)*9 + (j + 1)*3 + (i + 1) + + end function neighbor_tag + + subroutine s_wrap_particle_positions(pos, posPrev, nbubs, dest) + + real(wp), dimension(:,:,:) :: pos, posPrev + integer :: nbubs, dest + integer :: i, q + real(wp) :: offset + + do i = 1, nbubs + if (periodic_bc(1)) then + offset = glb_bounds(1)%end - glb_bounds(1)%beg + if (pos(i, 1, dest) > x_cb(m + buff_size)) then + do q = 1, 2 + pos(i, 1, q) = pos(i, 1, q) - offset + posPrev(i, 1, q) = posPrev(i, 1, q) - offset + end do + end if + if (pos(i, 1, dest) < x_cb(-1 - buff_size)) then + do q = 1, 2 + pos(i, 1, q) = pos(i, 1, q) + offset + posPrev(i, 1, q) = posPrev(i, 1, q) + offset + end do + end if + end if + + if (periodic_bc(2)) then + offset = glb_bounds(2)%end - glb_bounds(2)%beg + if (pos(i, 2, dest) > y_cb(n + buff_size)) then + do q = 1, 2 + pos(i, 2, q) = pos(i, 2, q) - offset + posPrev(i, 2, q) = posPrev(i, 2, q) - offset + end do + end if + if (pos(i, 2, dest) < y_cb(-buff_size - 1)) then + do q = 1, 2 + pos(i, 2, q) = pos(i, 2, q) + offset + posPrev(i, 2, q) = posPrev(i, 2, q) + offset + end do + end if + end if + + if (periodic_bc(3)) then + offset = glb_bounds(3)%end - glb_bounds(3)%beg + if (pos(i, 3, dest) > z_cb(p + buff_size)) then + do q = 1, 2 + pos(i, 3, q) = pos(i, 3, q) - offset + posPrev(i, 3, q) = posPrev(i, 3, q) - offset + end do + end if + if (pos(i, 3, dest) < z_cb(-1 - buff_size)) then + do q = 1, 2 + pos(i, 3, q) = pos(i, 3, q) + offset + posPrev(i, 3, q) = posPrev(i, 3, q) + offset + end do + end if + end if + end do + + end subroutine s_wrap_particle_positions + + !> @brief Broadcasts random phase numbers from rank 0 to all MPI processes. 
impure subroutine s_mpi_send_random_number(phi_rn, num_freq) integer, intent(in) :: num_freq @@ -245,13 +1455,12 @@ contains #ifdef MFC_MPI integer :: ierr !< Generic flag used to identify and report MPI errors - call MPI_BCAST(phi_rn, num_freq, mpi_p, 0, MPI_COMM_WORLD, ierr) #endif end subroutine s_mpi_send_random_number - !> Finalize the MPI proxy module + !> @brief Deallocates immersed boundary MPI communication buffers. subroutine s_finalize_mpi_proxy_module() #ifdef MFC_MPI diff --git a/src/simulation/m_particles_EL.fpp b/src/simulation/m_particles_EL.fpp new file mode 100644 index 0000000000..1674f07c6f --- /dev/null +++ b/src/simulation/m_particles_EL.fpp @@ -0,0 +1,2645 @@ +!> +!! @file m_particles_EL.fpp +!! @brief Contains module m_particles_EL + +#:include 'macros.fpp' + +!> @brief Euler-Lagrange solid particle solver with two-way coupling. +!! +!! Tracks non-deformable solid particles in compressible flow using Gaussian volume-averaging (Maeda & Colonius, J. Computational +!! Physics, 361, 2018). Supports multiple drag correlations, soft-sphere DEM collisions, pressure gradient and added mass forces. +!! Derived from the m_bubbles_EL module. Kernel functions are in m_particles_EL_kernels. 
+module m_particles_EL + + use m_global_parameters !< Definitions of the global parameters + use m_mpi_proxy !< Message passing interface (MPI) module proxy + use m_particles_EL_kernels !< Definitions of the kernel functions + use m_variables_conversion !< State variables type conversion procedures + use m_compile_specific + use m_boundary_common + use m_helper_basic !< Functions to compare floating point numbers + use m_sim_helpers + use m_helper + use m_mpi_common + use m_ibm + + implicit none + + real(wp) :: next_write_time + integer, allocatable, dimension(:,:) :: lag_part_id !< Global and local IDs + integer, allocatable, dimension(:) :: gid_to_local + real(wp), allocatable, dimension(:) :: particle_R0 !< Initial particle radius + real(wp), allocatable, dimension(:) :: Rmax_stats_part !< Maximum radius + real(wp), allocatable, dimension(:) :: Rmin_stats_part !< Minimum radius + $:GPU_DECLARE(create='[lag_part_id, gid_to_local, particle_R0, Rmax_stats_part, Rmin_stats_part]') + + real(wp), allocatable, dimension(:) :: particle_mass !< Particle Mass + $:GPU_DECLARE(create='[particle_mass]') + real(wp), allocatable, dimension(:) :: p_AM !< Particle Added Mass + $:GPU_DECLARE(create='[p_AM]') + + integer, allocatable, dimension(:) :: p_owner_rank !< MPI rank that owns this particle + $:GPU_DECLARE(create='[p_owner_rank]') + + integer, allocatable, dimension(:) :: linked_list !< particle cell linked list + $:GPU_DECLARE(create='[linked_list]') + + integer, allocatable, dimension(:,:,:) :: particle_head !< particle heads at each cell + $:GPU_DECLARE(create='[particle_head]') + + ! Particle state arrays use dimensions (nParticles_glb, component, stage): component: 1=x, 2=y, 3=z for position/velocity stage: + ! 1=committed state at current time level, 2=intermediate RK stage value + + ! (nPart, 1 -> actual val or 2 -> temp val) + real(wp), allocatable, dimension(:,:) :: particle_rad !< Particle radius + $:GPU_DECLARE(create='[particle_rad]') + + ! 
(nPart, 1-> x or 2->y or 3 ->z, 1 -> actual or 2 -> temporal val) + real(wp), allocatable, dimension(:,:,:) :: particle_pos !< Particle's position + real(wp), allocatable, dimension(:,:,:) :: particle_posPrev !< Particle's previous position + real(wp), allocatable, dimension(:,:,:) :: particle_vel !< Particle's velocity + real(wp), allocatable, dimension(:,:,:) :: particle_s !< Particle's computational cell position in real format + $:GPU_DECLARE(create='[particle_pos, particle_posPrev, particle_vel, particle_s]') + ! (nPart, 1-> x or 2->y or 3 ->z, time-stage) + real(wp), allocatable, dimension(:,:) :: particle_draddt !< Time derivative of particle's radius + real(wp), allocatable, dimension(:,:,:) :: particle_dposdt !< Time derivative of the particle's position + real(wp), allocatable, dimension(:,:,:) :: particle_dveldt !< Time derivative of the particle's velocity + $:GPU_DECLARE(create='[particle_draddt, particle_dposdt, particle_dveldt]') + + integer, private :: lag_num_ts !< Number of time stages in the time-stepping scheme + $:GPU_DECLARE(create='[lag_num_ts]') + + real(wp) :: Rmax_glb, Rmin_glb !< Global maximum and minimum R/R0 ratio across all particles + !> Eulerian projection of particle data (volume fraction, momentum, sources) + type(scalar_field), dimension(:), allocatable :: q_particles + integer :: q_particles_idx !< Size of the q vector field for particle cell (q)uantities + integer, parameter :: alphaf_id = 1 + integer, parameter :: alphaupx_id = 2 !< x particle momentum index + integer, parameter :: alphaupy_id = 3 !< y particle momentum index + integer, parameter :: alphaupz_id = 4 !< z particle momentum index + integer, parameter :: Smx_id = 5 + integer, parameter :: Smy_id = 6 + integer, parameter :: Smz_id = 7 + integer, parameter :: SE_id = 8 + + !> Interpolated Eulerian field gradients at particle locations + type(scalar_field), dimension(:), allocatable :: field_vars !< For cell quantities (field gradients, etc.) 
+ integer, parameter :: dPx_id = 1 !< Spatial pressure gradient in x, y, and z + integer, parameter :: dPy_id = 2 + integer, parameter :: dPz_id = 3 + integer, parameter :: drhox_id = 4 !< Spatial density gradient in x, y, and z + integer, parameter :: drhoy_id = 5 + integer, parameter :: drhoz_id = 6 + integer, parameter :: dufx_id = 7 !< Spatial velocity gradient in x, y, and z + integer, parameter :: dufy_id = 8 + integer, parameter :: dufz_id = 9 + integer, parameter :: dalphafx_id = 10 !< Spatial fluid volume fraction gradient in x, y, and z + integer, parameter :: dalphafy_id = 11 + integer, parameter :: dalphafz_id = 12 + integer, parameter :: dalphap_upx_id = 13 !< Spatial particle momentum gradient in x, y, and z + integer, parameter :: dalphap_upy_id = 14 + integer, parameter :: dalphap_upz_id = 15 + integer, parameter :: nField_vars = 15 + type(scalar_field), dimension(:), allocatable :: weights_x_interp !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_y_interp !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_z_interp !< For precomputing weights + integer :: nWeights_interp + type(scalar_field), dimension(:), allocatable :: weights_x_grad !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_y_grad !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_z_grad !< For precomputing weights + integer :: nWeights_grad + + $:GPU_DECLARE(create='[Rmax_glb, Rmin_glb, q_particles, q_particles_idx, field_vars]') + $:GPU_DECLARE(create='[weights_x_interp, weights_y_interp, weights_z_interp, nWeights_interp]') + $:GPU_DECLARE(create='[weights_x_grad, weights_y_grad, weights_z_grad, nWeights_grad]') + + ! 
Particle Source terms for fluid coupling + real(wp), allocatable, dimension(:,:) :: f_p !< force on each particle + $:GPU_DECLARE(create='[f_p]') + + real(wp), allocatable, dimension(:) :: gSum !< gaussian sum for each particle + $:GPU_DECLARE(create='[gSum]') + + integer, allocatable :: force_recv_ids(:) !< ids of collision forces received from other ranks + real(wp), allocatable :: force_recv_vals(:) !< collision forces received from other ranks + $:GPU_DECLARE(create='[force_recv_ids, force_recv_vals]') + + integer, parameter :: LAG_EVOL_ID = 11 ! File id for lag_bubbles_evol_*.dat + integer, parameter :: LAG_STATS_ID = 12 ! File id for stats_lag_bubbles_*.dat + integer, parameter :: LAG_VOID_ID = 13 ! File id for voidfraction.dat + integer, allocatable, dimension(:) :: keep_bubble + integer, allocatable, dimension(:,:) :: wrap_bubble_loc, wrap_bubble_dir + $:GPU_DECLARE(create='[keep_bubble]') + $:GPU_DECLARE(create='[wrap_bubble_loc, wrap_bubble_dir]') + + integer :: error_flag ! Error flag for collisions + $:GPU_DECLARE(create='[error_flag]') + + integer, parameter :: ncc = 1 !< Number of collisions cells at boundaries + real(wp) :: eps_overlap = 1.e-12 + +contains + + !> Initializes the lagrangian subgrid particle solver + !! @param q_cons_vf Initial conservative variables + impure subroutine s_initialize_particles_EL_module(q_cons_vf, bc_type) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer :: nParticles_glb, i, j, k, nf, l, npts + + ! PRIM TO CONS VARIABLES + real(wp) :: dyn_pres, pi_inf, qv, gamma, pres, T + real(wp) :: rhou, alpharhou, rho_f, alpharho + real(wp), dimension(3) :: fluid_vel + real(wp) :: rhoYks(1:num_species) + integer :: save_count + real(wp) :: qtime + real(wp) :: myR, func_sum + real(wp), dimension(3) :: myPos, myVel, myForce + integer, dimension(3) :: cell + logical :: only_beta + + only_beta = .true. 
+ + next_write_time = 0._wp + + if (cfl_dt) then + save_count = n_start + qtime = n_start*t_save + else + save_count = t_step_start + qtime = t_step_start*dt + end if + + pi_inf = 0._wp + qv = 0._wp + gamma = gammas(1) + + ! Setting number of time-stages for selected time-stepping scheme + lag_num_ts = time_stepper + + ! Allocate space for the Eulerian fields needed to map the effect of the particles + if (lag_params%solver_approach == 1) then + ! One-way coupling + q_particles_idx = 1 ! For tracking volume fraction + else if (lag_params%solver_approach == 2) then + ! Two-way coupling + q_particles_idx = 8 ! For tracking volume fraction(1), x-mom(2), y-mom(3), z-mom(4), and energy(5) sources, and alpha_p u_p (x(6),y(7),z(8)) + else + call s_mpi_abort('Please check the lag_params%solver_approach input') + end if + + nWeights_interp = lag_params%interpolation_order + 1 + nWeights_grad = fd_order + 1 + + pcomm_coords(1)%beg = x_cb(-1) + pcomm_coords(1)%end = x_cb(m) + $:GPU_UPDATE(device='[pcomm_coords(1)]') + if (n > 0) then + pcomm_coords(2)%beg = y_cb(-1) + pcomm_coords(2)%end = y_cb(n) + $:GPU_UPDATE(device='[pcomm_coords(2)]') + if (p > 0) then + pcomm_coords(3)%beg = z_cb(-1) + pcomm_coords(3)%end = z_cb(p) + $:GPU_UPDATE(device='[pcomm_coords(3)]') + end if + end if + + pcomm_coords_ghost(1)%beg = x_cb(-1 + ncc) + pcomm_coords_ghost(1)%end = x_cb(m - ncc) + $:GPU_UPDATE(device='[pcomm_coords_ghost(1)]') + if (n > 0) then + pcomm_coords_ghost(2)%beg = y_cb(-1 + ncc) + pcomm_coords_ghost(2)%end = y_cb(n - ncc) + $:GPU_UPDATE(device='[pcomm_coords_ghost(2)]') + if (p > 0) then + pcomm_coords_ghost(3)%beg = z_cb(-1 + ncc) + pcomm_coords_ghost(3)%end = z_cb(p - ncc) + $:GPU_UPDATE(device='[pcomm_coords_ghost(3)]') + end if + end if + + $:GPU_UPDATE(device='[lag_num_ts, q_particles_idx]') + + @:ALLOCATE(q_particles(1:q_particles_idx)) + do i = 1, q_particles_idx + @:ALLOCATE(q_particles(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + & 
idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(q_particles(i)) + end do + + @:ALLOCATE(field_vars(1:nField_vars)) + do i = 1, nField_vars + @:ALLOCATE(field_vars(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + & idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(field_vars(i)) + end do + + @:ALLOCATE(weights_x_interp(1:nWeights_interp)) + do i = 1, nWeights_interp + @:ALLOCATE(weights_x_interp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_x_interp(i)) + end do + + @:ALLOCATE(weights_y_interp(1:nWeights_interp)) + do i = 1, nWeights_interp + @:ALLOCATE(weights_y_interp(i)%sf(idwbuff(2)%beg:idwbuff(2)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_y_interp(i)) + end do + + @:ALLOCATE(weights_z_interp(1:nWeights_interp)) + do i = 1, nWeights_interp + @:ALLOCATE(weights_z_interp(i)%sf(idwbuff(3)%beg:idwbuff(3)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_z_interp(i)) + end do + + @:ALLOCATE(weights_x_grad(1:nWeights_grad)) + do i = 1, nWeights_grad + @:ALLOCATE(weights_x_grad(i)%sf(idwbuff(1)%beg:idwbuff(1)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_x_grad(i)) + end do + + @:ALLOCATE(weights_y_grad(1:nWeights_grad)) + do i = 1, nWeights_grad + @:ALLOCATE(weights_y_grad(i)%sf(idwbuff(2)%beg:idwbuff(2)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_y_grad(i)) + end do + + @:ALLOCATE(weights_z_grad(1:nWeights_grad)) + do i = 1, nWeights_grad + @:ALLOCATE(weights_z_grad(i)%sf(idwbuff(3)%beg:idwbuff(3)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_z_grad(i)) + end do + + ! 
Allocating space for lagrangian variables + nParticles_glb = lag_params%nParticles_glb + + @:ALLOCATE(lag_part_id(1:nParticles_glb, 1:2)) + @:ALLOCATE(gid_to_local(1:nParticles_glb)) + @:ALLOCATE(particle_R0(1:nParticles_glb)) + @:ALLOCATE(Rmax_stats_part(1:nParticles_glb)) + @:ALLOCATE(Rmin_stats_part(1:nParticles_glb)) + @:ALLOCATE(particle_mass(1:nParticles_glb)) + @:ALLOCATE(p_AM(1:nParticles_glb)) + @:ALLOCATE(p_owner_rank(1:nParticles_glb)) + @:ALLOCATE(particle_rad(1:nParticles_glb, 1:2)) + @:ALLOCATE(particle_pos(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_posPrev(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_vel(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_s(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_draddt(1:nParticles_glb, 1:lag_num_ts)) + @:ALLOCATE(particle_dposdt(1:nParticles_glb, 1:3, 1:lag_num_ts)) + @:ALLOCATE(particle_dveldt(1:nParticles_glb, 1:3, 1:lag_num_ts)) + @:ALLOCATE(f_p(1:nParticles_glb, 1:3)) + @:ALLOCATE(gSum(1:nParticles_glb)) + + @:ALLOCATE(linked_list(1:nParticles_glb)) + + @:ALLOCATE(particle_head(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end)) + + @:ALLOCATE(force_recv_ids(1:lag_params%nParticles_glb)) + @:ALLOCATE(force_recv_vals(1:3*lag_params%nParticles_glb)) + + @:ALLOCATE(keep_bubble(1:nParticles_glb)) + @:ALLOCATE(wrap_bubble_loc(1:nParticles_glb, 1:num_dims), wrap_bubble_dir(1:nParticles_glb, 1:num_dims)) + + if (adap_dt .and. f_is_default(adap_dt_tol)) adap_dt_tol = dflt_adap_dt_tol + + if (num_procs > 1) call s_initialize_solid_particles_mpi(lag_num_ts) + + ! Starting particles + if (lag_params%write_void_evol) call s_open_void_evol + if (lag_params%write_bubbles) call s_open_lag_bubble_evol() + if (lag_params%write_bubbles_stats) call s_open_lag_particle_stats() + + if (lag_params%vel_model > 0) then + moving_lag_particles = .true. 
+ lag_pressure_force = lag_params%pressure_force + lag_gravity_force = lag_params%gravity_force + lag_vel_model = lag_params%vel_model + lag_drag_model = lag_params%drag_model + end if + + $:GPU_UPDATE(device='[moving_lag_particles, lag_pressure_force, lag_gravity_force, lag_vel_model, lag_drag_model]') + + ! Allocate cell list arrays for atomic-free Gaussian smearing + @:ALLOCATE(cell_list_start(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_count(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_idx(1:lag_params%nParticles_glb)) + + call s_read_input_particles(q_cons_vf, bc_type) + + call s_reset_cell_vars() + + $:GPU_PARALLEL_LOOP(private='[k, cell, myR, myPos, myVel, myForce, func_sum]',copyin='[only_beta]') + do k = 1, n_el_particles_loc + cell = fd_number - buff_size + call s_locate_cell(particle_pos(k,1:3,1), cell, particle_s(k,1:3,1)) + + myR = particle_R0(k) + myPos = particle_pos(k,1:3,1) + myVel = particle_vel(k,1:3,1) + myForce = f_p(k,:) + ! Compute the total gaussian contribution for each particle for normalization + call s_compute_gaussian_contribution(myR, myPos, cell, func_sum) + gSum(k) = func_sum + + call s_gaussian_atomic(myR, myVel, myPos, myForce, func_sum, cell, q_particles, only_beta) + end do + $:END_GPU_PARALLEL_LOOP() + + call s_finalize_beta_field(bc_type, only_beta) + + npts = (nWeights_interp - 1)/2 + call s_compute_barycentric_weights(npts) ! For interpolation + + npts = (nWeights_grad - 1)/2 + call s_compute_fornberg_fd_weights(npts) ! For finite differences + + if (lag_params%solver_approach == 2) then + if (save_count == 0) then + !> Correcting initial conditions so they account for particles + $:GPU_PARALLEL_LOOP(private='[i, j, k, dyn_pres, fluid_vel, rho_f, alpharho, rhou, alpharhou]', collapse=3, & + & copyin = '[pi_inf, qv, gamma, rhoYks]') + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + !!!!!!!!! Mass + do l = 1, num_fluids ! 
num_fluid is just 1 right now + rho_f = q_cons_vf(l)%sf(i, j, k) + alpharho = q_particles(alphaf_id)%sf(i, j, k)*rho_f + q_cons_vf(l)%sf(i, j, k) = alpharho + end do + + !!!!!!!!! Momentum + dyn_pres = 0._wp + do l = momxb, momxe + fluid_vel(l - momxb + 1) = q_cons_vf(l)%sf(i, j, k)/rho_f + rhou = q_cons_vf(l)%sf(i, j, k) + alpharhou = q_particles(alphaf_id)%sf(i, j, k)*rhou + q_cons_vf(l)%sf(i, j, k) = alpharhou + dyn_pres = dyn_pres + q_cons_vf(l)%sf(i, j, k)*fluid_vel(l - momxb + 1)/2._wp + end do + + !!!!!!!!!Energy + call s_compute_pressure(q_cons_vf(E_idx)%sf(i, j, k), q_cons_vf(alf_idx)%sf(i, j, k), dyn_pres, & + & pi_inf, gamma, alpharho, qv, rhoYks, pres, T) + + q_cons_vf(E_idx)%sf(i, j, k) = gamma*pres + dyn_pres + pi_inf + qv ! Updating energy in cons + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + end if + + end subroutine s_initialize_particles_EL_module + + !> Initialize particle data from input file or generate initial conditions + !! @param q_cons_vf Conservative variables + impure subroutine s_read_input_particles(q_cons_vf, bc_type) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + real(wp), dimension(8) :: inputParticle + real(wp) :: qtime + integer :: id, particle_id, save_count + integer :: i, ios + logical :: file_exist, indomain + integer, dimension(3) :: cell + character(LEN=path_len + 2*name_len) :: path_D_dir + + ! Initialize number of particles + + particle_id = 0 + id = 0 + + ! Read the input lag_bubble file or restart point + if (cfl_dt) then + save_count = n_start + qtime = n_start*t_save + else + save_count = t_step_start + qtime = t_step_start*dt + end if + + if (save_count == 0) then + if (proc_rank == 0) print *, 'Reading lagrange particles input file.' 
+ call my_inquire(trim(lag_params%input_path), file_exist) + if (file_exist) then + open (94, file=trim(lag_params%input_path), form='formatted', iostat=ios) + do while (ios == 0) + read (94, *, iostat=ios) (inputParticle(i), i=1, 8) + if (ios /= 0) cycle + indomain = particle_in_domain_physical(inputParticle(1:3)) + id = id + 1 + if (id > lag_params%nParticles_glb .and. proc_rank == 0) then + call s_mpi_abort("Current number of particles is larger than nParticles_glb") + end if + if (indomain) then + particle_id = particle_id + 1 + call s_add_particles(inputParticle, q_cons_vf, particle_id) + lag_part_id(particle_id, 1) = id ! global ID + lag_part_id(particle_id, 2) = particle_id ! local ID + n_el_particles_loc = particle_id ! local number of particles + end if + end do + close (94) + else + call s_mpi_abort("Initialize the lagrange particles in " // trim(lag_params%input_path)) + end if + else + if (proc_rank == 0) print *, 'Restarting lagrange particles at save_count: ', save_count + call s_restart_bubbles(particle_id, save_count) + end if + + print *, " Lagrange parrticles running, in proc", proc_rank, "number:", particle_id, "/", id + + if (num_procs > 1) then + call s_mpi_reduce_int_sum(n_el_particles_loc, n_el_particles_glb) + else + n_el_particles_glb = n_el_particles_loc + end if + + if (proc_rank == 0) then + if (n_el_particles_glb == 0) call s_mpi_abort('No particles in the domain. 
Check ' // trim(lag_params%input_path)) + end if + + $:GPU_UPDATE(device='[particles_lagrange, lag_params]') + + $:GPU_UPDATE(device='[lag_part_id, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, p_AM, p_owner_rank, & + & gid_to_local, particle_rad, particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, & + & particle_dposdt, particle_dveldt, n_el_particles_loc]') + + Rmax_glb = min(dflt_real, -dflt_real) + Rmin_glb = max(dflt_real, -dflt_real) + $:GPU_UPDATE(device='[Rmax_glb, Rmin_glb]') + + $:GPU_UPDATE(device='[dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc]') + + ! Populate temporal variables + call s_transfer_data_to_tmp_particles() + + if (save_count == 0) then + ! Create ./D directory + if (proc_rank == 0) then + write (path_D_dir, '(A,I0,A,I0)') trim(case_dir) // '/D' + call my_inquire(trim(path_D_dir), file_exist) + if (.not. file_exist) call s_create_directory(trim(path_D_dir)) + end if + call s_mpi_barrier() + call s_write_restart_lag_particles(save_count) ! Needed for post_processing + if (lag_params%write_void_evol) call s_write_void_evol_particles(qtime) + end if + + if (lag_params%write_bubbles) call s_write_lag_particle_evol(qtime) + + end subroutine s_read_input_particles + + !> The purpose of this procedure is to obtain the information of the particles when starting fresh + !! @param inputPart Particle information + !! @param q_cons_vf Conservative variables + !! 
@param part_id Local id of the particle + impure subroutine s_add_particles(inputPart, q_cons_vf, part_id) + + type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf + real(wp), dimension(8), intent(in) :: inputPart + integer, intent(in) :: part_id + integer :: i + real(wp) :: pliq, volparticle, concvap, totalmass, kparticle, cpparticle + real(wp) :: omegaN_local, PeG, PeT, rhol, pcrit, qv, gamma, pi_inf, dynP + integer, dimension(3) :: cell + real(wp), dimension(2) :: Re + real(wp) :: massflag, heatflag, Re_trans, Im_trans, myR, func_sum + real(wp), dimension(3) :: myPos, myVel + + massflag = 0._wp + heatflag = 0._wp + if (lag_params%massTransfer_model) massflag = 1._wp + if (lag_params%heatTransfer_model) heatflag = 1._wp + + particle_R0(part_id) = inputPart(7) + Rmax_stats_part(part_id) = min(dflt_real, -dflt_real) + Rmin_stats_part(part_id) = max(dflt_real, -dflt_real) + particle_rad(part_id, 1) = inputPart(7) + particle_pos(part_id,1:3,1) = inputPart(1:3) + particle_posPrev(part_id,1:3,1) = particle_pos(part_id,1:3,1) + particle_vel(part_id,1:3,1) = inputPart(4:6) + + ! Initialize Particle Sources + f_p(part_id,1:3) = 0._wp + p_AM(part_id) = 0._wp + p_owner_rank(part_id) = proc_rank + gid_to_local(part_id) = -1 + + if (cyl_coord .and. p == 0) then + particle_pos(part_id, 2, 1) = sqrt(particle_pos(part_id, 2, 1)**2._wp + particle_pos(part_id, 3, 1)**2._wp) + ! Storing azimuthal angle (-Pi to Pi)) into the third coordinate variable + particle_pos(part_id, 3, 1) = atan2(inputPart(3), inputPart(2)) + particle_posPrev(part_id,1:3,1) = particle_pos(part_id,1:3,1) + end if + + cell = fd_number - buff_size + call s_locate_cell(particle_pos(part_id,1:3,1), cell, particle_s(part_id,1:3,1)) + + ! Check if the particle is located in the ghost cell of a symmetric, or wall boundary + if ((any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, & + & BC_NO_SLIP_WALL/)) .and. cell(1) < 0) .or. 
(any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, & + & BC_NO_SLIP_WALL/)) .and. cell(1) > m) .or. (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, & + & BC_NO_SLIP_WALL/)) .and. cell(2) < 0) .or. (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, & + & BC_NO_SLIP_WALL/)) .and. cell(2) > n)) then + call s_mpi_abort("Lagrange particle is in the ghost cells of a symmetric or wall boundary.") + end if + + if (p > 0) then + if ((any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, & + & BC_NO_SLIP_WALL/)) .and. cell(3) < 0) .or. (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, & + & BC_NO_SLIP_WALL/)) .and. cell(3) > p)) then + call s_mpi_abort("Lagrange particle is in the ghost cells of a symmetric or wall boundary.") + end if + end if + + ! Initial particle mass + volparticle = 4._wp/3._wp*pi*particle_R0(part_id)**3 ! volume + particle_mass(part_id) = volparticle*rho0ref_particle ! mass + if (particle_mass(part_id) <= 0._wp) then + call s_mpi_abort("The initial particle mass is negative or zero. Check the particle file.") + end if + + end subroutine s_add_particles + + !> Read particle data from a restart checkpoint + !! @param part_id Local ID of the particle + !! 
@param save_count File identifier + impure subroutine s_restart_bubbles(part_id, save_count) + + integer, intent(inout) :: part_id, save_count + character(LEN=path_len + 2*name_len) :: file_loc + real(wp) :: file_time, file_dt + integer :: file_num_procs, file_tot_part, tot_part + +#ifndef MFC_MPI + @:PROHIBIT(.true., "Lagrangian particle restart requires MPI (--mpi)") +#else + real(wp), dimension(20) :: inputvals + integer, dimension(MPI_STATUS_SIZE) :: status + integer(kind=MPI_OFFSET_KIND) :: disp + integer :: view + integer, dimension(3) :: cell + logical :: indomain, particle_file, file_exist + integer, dimension(2) :: gsizes, lsizes, start_idx_part + integer :: ifile, ierr, tot_data, id + integer :: i + integer, dimension(:), allocatable :: proc_particle_counts + real(wp), dimension(1:1,1:lag_io_vars) :: dummy + dummy = 0._wp + + ! Construct file path + write (file_loc, '(A,I0,A)') 'lag_bubbles_', save_count, '.dat' + file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // trim(file_loc) + + ! Check if file exists + inquire (FILE=trim(file_loc), EXIST=file_exist) + if (.not. file_exist) then + call s_mpi_abort('Restart file ' // trim(file_loc) // ' does not exist!') + end if + + if (.not. 
parallel_io) return + + if (proc_rank == 0) then + call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + + call MPI_FILE_READ(ifile, file_tot_part, 1, MPI_INTEGER, status, ierr) + call MPI_FILE_READ(ifile, file_time, 1, mpi_p, status, ierr) + call MPI_FILE_READ(ifile, file_dt, 1, mpi_p, status, ierr) + call MPI_FILE_READ(ifile, file_num_procs, 1, MPI_INTEGER, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + end if + + call MPI_BCAST(file_tot_part, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(file_time, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(file_dt, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(file_num_procs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + + allocate (proc_particle_counts(file_num_procs)) + + if (proc_rank == 0) then + call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + + ! Skip to processor counts position + disp = int(sizeof(file_tot_part) + 2*sizeof(file_time) + sizeof(file_num_procs), MPI_OFFSET_KIND) + call MPI_FILE_SEEK(ifile, disp, MPI_SEEK_SET, ierr) + call MPI_FILE_READ(ifile, proc_particle_counts, file_num_procs, MPI_INTEGER, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + end if + + call MPI_BCAST(proc_particle_counts, file_num_procs, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + + ! 
Set time variables from file + mytime = file_time + dt = file_dt + + part_id = proc_particle_counts(proc_rank + 1) + + start_idx_part(1) = 0 + do i = 1, proc_rank + start_idx_part(1) = start_idx_part(1) + proc_particle_counts(i) + end do + + start_idx_part(2) = 0 + lsizes(1) = part_id + lsizes(2) = lag_io_vars + + gsizes(1) = file_tot_part + gsizes(2) = lag_io_vars + + if (part_id > 0) then + allocate (MPI_IO_DATA_lag_bubbles(part_id,1:lag_io_vars)) + + call MPI_TYPE_CREATE_SUBARRAY(2, gsizes, lsizes, start_idx_part, MPI_ORDER_FORTRAN, mpi_p, view, ierr) + call MPI_TYPE_COMMIT(view, ierr) + + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + + ! Skip extended header + disp = int(sizeof(file_tot_part) + 2*sizeof(file_time) + sizeof(file_num_procs) & + & + file_num_procs*sizeof(proc_particle_counts(1)), MPI_OFFSET_KIND) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr) + + call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA_lag_bubbles, lag_io_vars*part_id, mpi_p, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + call MPI_TYPE_FREE(view, ierr) + + n_el_particles_loc = part_id + + do i = 1, part_id + lag_part_id(i, 1) = int(MPI_IO_DATA_lag_bubbles(i, 1)) + particle_pos(i,1:3,1) = MPI_IO_DATA_lag_bubbles(i,2:4) + particle_posPrev(i,1:3,1) = MPI_IO_DATA_lag_bubbles(i,5:7) + particle_vel(i,1:3,1) = MPI_IO_DATA_lag_bubbles(i,8:10) + particle_rad(i, 1) = MPI_IO_DATA_lag_bubbles(i, 11) + particle_R0(i) = MPI_IO_DATA_lag_bubbles(i, 13) + Rmax_stats_part(i) = MPI_IO_DATA_lag_bubbles(i, 14) + Rmin_stats_part(i) = MPI_IO_DATA_lag_bubbles(i, 15) + particle_mass(i) = MPI_IO_DATA_lag_bubbles(i, 19) + cell = -buff_size + call s_locate_cell(particle_pos(i,1:3,1), cell, particle_s(i,1:3,1)) + end do + + deallocate (MPI_IO_DATA_lag_bubbles) + else + n_el_particles_loc = 0 + + call MPI_TYPE_CONTIGUOUS(0, mpi_p, view, ierr) + call MPI_TYPE_COMMIT(view, ierr) + + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, 
mpi_info_int, ifile, ierr) + + ! Skip extended header + disp = int(sizeof(file_tot_part) + 2*sizeof(file_time) + sizeof(file_num_procs) & + & + file_num_procs*sizeof(proc_particle_counts(1)), MPI_OFFSET_KIND) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr) + + call MPI_FILE_READ_ALL(ifile, dummy, 0, mpi_p, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + call MPI_TYPE_FREE(view, ierr) + end if + + if (proc_rank == 0) then + write (*, '(A,I0,A,I0)') 'Read ', file_tot_part, ' particles from restart file at t_step = ', save_count + write (*, '(A,E15.7,A,E15.7)') 'Restart time = ', mytime, ', dt = ', dt + end if + + deallocate (proc_particle_counts) +#endif + + end subroutine s_restart_bubbles + + !> Contains the particle dynamics subroutines. + !! @param q_prim_vf Primitive variables + !! @param bc_type Boundary condition type markers for each direction and side + !! @param stage Current stage in the time-stepper algorithm + !! @param vL_x, vL_y, vL_z Left-biased work arrays passed to the gradient routines + !! @param vR_x, vR_y, vR_z Right-biased work arrays passed to the gradient routines + !! @param rhs_vf Time derivative of the conservative variables (read for drho/dt) + subroutine s_compute_particle_EL_dynamics(q_prim_vf, bc_type, stage, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, rhs_vf) + + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + type(scalar_field), dimension(sys_size), intent(in) :: rhs_vf + integer, intent(in) :: stage + real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:), intent(inout) :: vL_x, vL_y, vL_z + real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:), intent(inout) :: vR_x, vR_y, vR_z + integer, dimension(3) :: cell, cellijk + real(wp) :: myMass, myR, myBeta_c, myBeta_t, myR0, myRe, mydrhodt, myVolumeFrac, myGamma, rmass_add, func_sum + real(wp), dimension(3) :: myVel, myPos, force_vec, s_cell + logical :: only_beta + integer :: k, l, i, j + + only_beta = .false. + + if (lag_params%pressure_force .or. 
lag_params%added_mass_model > 0) then + do l = 1, num_dims + if (l == 1) then + call s_gradient_field(vL_x, vR_x, field_vars(dPx_id)%sf, l, E_idx) + else if (l == 2) then + call s_gradient_field(vL_y, vR_y, field_vars(dPy_id)%sf, l, E_idx) + else if (l == 3) then + call s_gradient_field(vL_z, vR_z, field_vars(dPz_id)%sf, l, E_idx) + end if + end do + end if + + if (lag_params%added_mass_model > 0) then + do l = 1, num_dims + if (l == 1) then + call s_gradient_field(vL_x, vR_x, field_vars(drhox_id)%sf, l, 1) + else if (l == 2) then + call s_gradient_field(vL_y, vR_y, field_vars(drhoy_id)%sf, l, 1) + else if (l == 3) then + call s_gradient_field(vL_z, vR_z, field_vars(drhoz_id)%sf, l, 1) + end if + end do + end if + + myGamma = (1._wp/fluid_pp(1)%gamma) + 1._wp + if (viscous) then + myRe = 1._wp/fluid_pp(1)%Re(1) + else + ! TODO: Wire to a fluid parameter for non-air flows + myRe = 1.845e-5_wp + end if + + call nvtxStartRange("LAGRANGE-PARTICLE-DYNAMICS") + + !> Compute Fluid-Particle Forces (drag/pressure/added mass) and convert to particle acceleration + $:GPU_PARALLEL_LOOP(private='[i, k, l, cell, s_cell, myMass, myR, myR0, myPos, myVel, myVolumeFrac, force_vec, rmass_add, & + & func_sum, mydrhodt]', copyin='[stage, myGamma, myRe, only_beta]') + do k = 1, n_el_particles_loc + f_p(k,:) = 0._wp + p_owner_rank(k) = proc_rank + + s_cell = particle_s(k,1:3,2) + cell = int(s_cell(:)) + do i = 1, num_dims + if (s_cell(i) < 0._wp) cell(i) = cell(i) - 1 + end do + + ! 
Current particle state + myMass = particle_mass(k) + myR = particle_rad(k, 2) + myR0 = particle_R0(k) + myPos = particle_pos(k,:,2) + myVel = particle_vel(k,:,2) + myVolumeFrac = 1._wp - q_particles(alphaf_id)%sf(cell(1), cell(2), cell(3)) + mydrhodt = rhs_vf(1)%sf(cell(1), cell(2), cell(3)) + + particle_dposdt(k,:,stage) = 0._wp + particle_dveldt(k,:,stage) = 0._wp + particle_draddt(k, stage) = 0._wp + + call s_get_particle_force(myPos, myR, myVel, myMass, myRe, myGamma, myVolumeFrac, mydrhodt, cell, q_prim_vf, & + & field_vars, weights_x_interp, weights_y_interp, weights_z_interp, force_vec, rmass_add) + + p_AM(k) = rMass_add + f_p(k,:) = f_p(k,:) + force_vec(:) + + if (.not. lag_params%collision_force) then + myMass = particle_mass(k) + p_AM(k) + myVel = particle_vel(k,:,2) + do l = 1, num_dims + particle_dposdt(k, l, stage) = myVel(l) + particle_dveldt(k, l, stage) = f_p(k, l)/myMass + particle_draddt(k, stage) = 0._wp + end do + end if + + if (lag_params%solver_approach == 2) then + func_sum = gSum(k) + call s_gaussian_atomic(myR, myVel, myPos, force_vec, func_sum, cell, q_particles, only_beta) + end if + end do + $:END_GPU_PARALLEL_LOOP() + + if (lag_params%solver_approach == 2) then + call s_finalize_beta_field(bc_type, only_beta) + end if + + call nvtxStartRange("LAGRANGE-PARTICLE-COLLISIONS") + if (lag_params%collision_force) then + !> Compute Particle-Particle collision forces + call s_compute_particle_EL_collisions(stage, bc_type) + + $:GPU_PARALLEL_LOOP(private='[k, l, myMass, myVel]') + do k = 1, n_el_particles_loc + myMass = particle_mass(k) + p_AM(k) + myVel = particle_vel(k,:,2) + do l = 1, num_dims + particle_dposdt(k, l, stage) = myVel(l) + particle_dveldt(k, l, stage) = f_p(k, l)/myMass + particle_draddt(k, stage) = 0._wp + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + call nvtxEndRange + + call nvtxEndRange + + end subroutine s_compute_particle_EL_dynamics + + !> Compute inter-particle collision forces using a soft-sphere DEM model. 
Uses a cell-based linked list for O(N) neighbor + !! search. The contact force model is a spring-dashpot (Hertzian stiffness with viscous damping). Forces on particles owned by + !! other MPI ranks are buffered for later transfer. + subroutine s_compute_particle_EL_collisions(stage, bc_type) + + integer, intent(in) :: stage + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer, dimension(3) :: cell + real(wp), dimension(3) :: s_cell + integer, dimension(3) :: cellaux + integer :: i, k, l, q, ip, jp, kp, ii, jj, kk + logical :: celloutside + real(wp) :: pidtksp2, ksp, nu1, nu2, Rp1, Rp2, E1, E2, Estar, cor, rmag, Rstar, dij, eta_n, kappa_n, mp1, mp2, dt_loc + real(wp), dimension(3) :: xp1, xp2, vp1, vp2, v_rel, rpij, nij, vnij, Fnpp_ij, force_vec + integer :: kpz + integer :: total_recv + integer :: glb_id, count + integer :: n_el_particles_loc_before_ghost + + if (num_procs > 1) then + n_el_particles_loc_before_ghost = n_el_particles_loc + call s_reset_force_buffers() + call s_add_ghost_particles() + end if + + kpz = 0 + if (num_dims == 3) kpz = 1 + + ksp = 10._wp ! Spring stiffness multiplier + nu1 = 0.35_wp ! Poisson's ratio, particle 1 (glass/steel-like) + nu2 = 0.35_wp ! Poisson's ratio, particle 2 + E1 = 1.e9_wp ! Young's modulus [Pa], particle 1 + E2 = 1.e9_wp ! Young's modulus [Pa], particle 2 + cor = 0.7_wp ! 
Coefficient of restitution + + pidtksp2 = (pi**2)/((dt*ksp)**2) + + Estar = 1._wp/(((1._wp - nu1**2)/E1) + ((1._wp - nu2**2)/E2)) + Estar = (4._wp/3._wp)*Estar + + call s_reset_linked_list() + + call nvtxStartRange("LAGRANGE-PARTICLE-COLLISIONS") + error_flag = 0 + $:GPU_UPDATE(device='[error_flag]') + + $:GPU_PARALLEL_LOOP(private='[i, k, cell, ip, jp, kp, Rp1, xp1, mp1, vp1, kk, jj, ii, cellaux, q, Rp2, xp2, mp2, vp2, & + & v_rel, Rstar, rpij, rmag, nij, vnij, dij, kappa_n, eta_n, Fnpp_ij, force_vec, s_cell, celloutside, & + & count]', copyin='[ksp, nu1, nu2, E1, E2, cor, pidtksp2, Estar, kpz]') + do k = 1, n_el_particles_loc + if (.not. particle_in_domain_physical(particle_pos(k,1:3,2))) then + cycle + end if + + s_cell = particle_s(k,1:3,2) + cell = int(s_cell(:)) + do i = 1, num_dims + if (s_cell(i) < 0._wp) cell(i) = cell(i) - 1 + end do + + ip = cell(1) + jp = cell(2) + kp = cell(3) + + Rp1 = particle_rad(k, 2) + xp1 = particle_pos(k,:,2) + mp1 = particle_mass(k) + vp1 = particle_vel(k,:,2) + + do kk = kp - kpz, kp + kpz + do jj = jp - 1, jp + 1 + do ii = ip - 1, ip + 1 + cellaux(1) = ii + cellaux(2) = jj + cellaux(3) = kk + + call s_check_celloutside_wbuff(cellaux, celloutside) + + if (.not. celloutside) then + q = particle_head(ii, jj, kk) + ! 
Traverse linked list in that cell + + count = 0 + do while (q /= -1) + count = count + 1 + if (count > n_el_particles_loc) then + $:GPU_ATOMIC(atomic='write') + error_flag = 1 + exit + end if + + if (lag_part_id(q, 1) > lag_part_id(k, 1)) then + Rp2 = particle_rad(q, 2) + xp2 = particle_pos(q,:,2) + mp2 = particle_mass(q) + vp2 = particle_vel(q,:,2) + v_rel = vp2 - vp1 + + Rstar = (Rp1*Rp2)/(Rp1 + Rp2) + rpij = xp2 - xp1 + rmag = sqrt(rpij(1)**2 + rpij(2)**2 + rpij(3)**2) + rmag = max(rmag, eps_overlap) + nij = rpij/rmag + vnij = dot_product(v_rel, nij)*nij + dij = (Rp1 + Rp2) - rmag + + if (dij > 0._wp) then + kappa_n = min((pidtksp2*mp1), (pidtksp2*mp2), (Estar*sqrt(Rstar)*sqrt(abs(dij)))) + + eta_n = ((-2._wp*sqrt(kappa_n)*log(cor))/sqrt((log(cor))**2 + pi**2)) & + & *(1._wp/sqrt((1._wp/mp1) + (1._wp/mp2))) + + Fnpp_ij = -kappa_n*dij*nij - eta_n*vnij + + f_p(k,:) = f_p(k,:) + Fnpp_ij + + if (p_owner_rank(q) == proc_rank) then + ! f_p(q, :) = f_p(q, :) - Fnpp_ij + + $:GPU_ATOMIC(atomic='update') + f_p(q, 1) = f_p(q, 1) - Fnpp_ij(1) + + $:GPU_ATOMIC(atomic='update') + f_p(q, 2) = f_p(q, 2) - Fnpp_ij(2) + + $:GPU_ATOMIC(atomic='update') + f_p(q, 3) = f_p(q, 3) - Fnpp_ij(3) + else + call s_add_force_to_send_buffer(p_owner_rank(q), lag_part_id(q, 1), -Fnpp_ij) + end if + end if + end if + + q = linked_list(q) + end do + end if + end do + end do + end do + + !> Check each local particle for wall collisions + + call s_compute_wall_collisions(xp1, vp1, Rp1, mp1, Estar, pidtksp2, cor, force_vec) + f_p(k,:) = f_p(k,:) + force_vec + end do + $:END_GPU_PARALLEL_LOOP() + + call nvtxEndRange + + $:GPU_UPDATE(host='[error_flag]') + if (error_flag == 1) then + call s_mpi_abort("Linked list infinite loop detected") + end if + + if (num_procs > 1) then + n_el_particles_loc = n_el_particles_loc_before_ghost + $:GPU_UPDATE(device='[n_el_particles_loc]') + + total_recv = 0 + force_recv_ids = 0 + force_recv_vals = 0._wp + + call s_transfer_collision_forces(total_recv, force_recv_ids, 
force_recv_vals) + + $:GPU_UPDATE(device = '[force_recv_ids, force_recv_vals]') + + $:GPU_PARALLEL_LOOP(private='[i, k]',copyin = '[total_recv]') + do i = 1, total_recv + k = gid_to_local(force_recv_ids(i)) + if (k > 0) then + $:GPU_ATOMIC(atomic='update') + f_p(k, 1) = f_p(k, 1) + force_recv_vals(3*(i - 1) + 1) + + $:GPU_ATOMIC(atomic='update') + f_p(k, 2) = f_p(k, 2) + force_recv_vals(3*(i - 1) + 2) + + $:GPU_ATOMIC(atomic='update') + f_p(k, 3) = f_p(k, 3) + force_recv_vals(3*(i - 1) + 3) + end if + end do + $:END_GPU_PARALLEL_LOOP() + end if + + end subroutine s_compute_particle_EL_collisions + + !> This subroutine checks for particles at solid walls to compute a collision force + subroutine s_compute_wall_collisions(pos, vel, rad, mass, Es, pidtksp, core, wcol_force) + + $:GPU_ROUTINE(function_name='s_compute_wall_collisions',parallelism='[seq]', cray_inline=True) + + real(wp), dimension(3), intent(in) :: pos, vel + real(wp), intent(in) :: rad, mass, Es, pidtksp, core + real(wp), dimension(3), intent(inout) :: wcol_force + real(wp) :: dij + + wcol_force = 0._wp + + ! 
Check for particles at solid boundaries + if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then + dij = rad - (pos(1) - x_cb(-1)) + + if (dij > 0._wp) then + call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 1, 1._wp, wcol_force) + end if + end if + + if (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then + dij = rad - (x_cb(m) - pos(1)) + + if (dij > 0._wp) then + call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 1, -1._wp, wcol_force) + end if + end if + + if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then + dij = rad - (pos(2) - y_cb(-1)) + + if (dij > 0._wp) then + call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 2, 1._wp, wcol_force) + end if + end if + + if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then + dij = rad - (y_cb(n) - pos(2)) + + if (dij > 0._wp) then + call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 2, -1._wp, wcol_force) + end if + end if + + if (p > 0) then + if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then + dij = rad - (pos(3) - z_cb(-1)) + + if (dij > 0._wp) then + call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 3, 1._wp, wcol_force) + end if + end if + + if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then + dij = rad - (z_cb(p) - pos(3)) + + if (dij > 0._wp) then + call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 3, -1._wp, wcol_force) + end if + end if + end if + + end subroutine s_compute_wall_collisions + + !> This subroutine computes the collision force with a solid wall + subroutine s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, dir, normal, wcol_force) + + 
$:GPU_ROUTINE(function_name='s_compute_wall_collision_force',parallelism='[seq]', cray_inline=True) + + real(wp), dimension(3), intent(in) :: vel + real(wp), intent(in) :: dij, rad, mass, Es, pidtksp, core, normal + integer, intent(in) :: dir + real(wp), dimension(3), intent(inout) :: wcol_force + real(wp), dimension(3) :: nij, v_rel, vnij + real(wp) :: kappa_n, eta_n + + ! Normal points away from wall (into domain) + nij = 0._wp + nij(dir) = normal + + ! Relative velocity (wall has zero velocity) + v_rel = vel + vnij = dot_product(v_rel, nij)*nij + + ! Wall has infinite mass so use mp1 only + kappa_n = min((pidtksp*mass), (Es*sqrt(rad)*sqrt(abs(dij)))) + + eta_n = ((-2._wp*sqrt(kappa_n)*log(core))/sqrt((log(core))**2 + pi**2))*(1._wp/sqrt(1._wp/mass)) + + wcol_force = wcol_force + (kappa_n*dij*nij - eta_n*vnij) + + end subroutine s_compute_wall_collision_force + + !> This subroutine adds temporary ghost particles for collision purposes + subroutine s_add_ghost_particles() + + integer :: k, i, q + integer :: patch_id, newBubs + integer, dimension(3) :: cell + logical :: inc_ghost + + inc_ghost = .true. + + call nvtxStartRange("LAG-GHOSTADD") + call nvtxStartRange("LAG-GHOSTADD-DEV2HOST") + $:GPU_UPDATE(host='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, lag_part_id, & + & particle_rad, particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, particle_dposdt, & + & particle_dveldt, n_el_particles_loc, wrap_bubble_dir, wrap_bubble_loc]') + call nvtxEndRange + + ! 
Handle MPI transfer of particles going to another processor's local domain + if (num_procs > 1) then + call nvtxStartRange("LAG-GHOSTADD-TRANSFER-LIST") + call s_add_particles_to_transfer_list(n_el_particles_loc, particle_pos(:,:,2), particle_posPrev(:,:,2), inc_ghost) + call nvtxEndRange + + call nvtxStartRange("LAG-GHOSTADD-SENDRECV") + call s_mpi_sendrecv_solid_particles(p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, & + & lag_part_id, particle_rad, particle_pos, particle_posPrev, particle_vel, & + & particle_s, particle_draddt, particle_dposdt, particle_dveldt, lag_num_ts, & + & n_el_particles_loc, 2) + call nvtxEndRange + end if + + call nvtxStartRange("LAG-GHOSTADD-HOST2DEV") + $:GPU_UPDATE(device='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, lag_part_id, & + & particle_rad, particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, particle_dposdt, & + & particle_dveldt, n_el_particles_loc]') + call nvtxEndRange + + call nvtxEndRange ! LAG-GHOSTADD + + $:GPU_PARALLEL_LOOP(private='[k, cell]') + do k = 1, n_el_particles_loc + cell = fd_number - buff_size + call s_locate_cell(particle_pos(k,1:3,2), cell, particle_s(k,1:3,2)) + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_add_ghost_particles + + !> The purpose of this subroutine is to check if the current cell is outside the computational domain or not (including ghost + !! cells). + !! @param cellaux Tested cell to smear the particle effect in. + !! @param celloutside If true, then cellaux is outside the computational domain. + subroutine s_check_celloutside_wbuff(cellaux, celloutside) + + $:GPU_ROUTINE(function_name='s_check_celloutside_wbuff',parallelism='[seq]', cray_inline=True) + + integer, dimension(3), intent(inout) :: cellaux + logical, intent(out) :: celloutside + + celloutside = .false. + + if (num_dims == 2) then + if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size)) then + celloutside = .true. 
+ end if + + if ((cellaux(1) > m + buff_size) .or. (cellaux(2) > n + buff_size)) then + celloutside = .true. + end if + else + if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size) .or. (cellaux(3) < -buff_size)) then + celloutside = .true. + end if + + if ((cellaux(1) > m + buff_size) .or. (cellaux(2) > n + buff_size) .or. (cellaux(3) > p + buff_size)) then + celloutside = .true. + end if + end if + + end subroutine s_check_celloutside_wbuff + + !> Compute particle source terms for two-way Euler-Lagrange coupling (Maeda & Colonius, 2018) + !! @param q_cons_vf Conservative variables + !! @param q_prim_vf Primitive variables + !! @param rhs_vf Time derivative of the conservative variables + subroutine s_compute_particles_EL_source(q_cons_vf, q_prim_vf, rhs_vf, stage) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf + integer, intent(in) :: stage + integer :: i, j, k, l, nf + real(wp) :: dalphapdt, alpha_f, udot_gradalpha + + ! Spatial derivative of the fluid volume fraction and eulerian particle momentum fields. 
+ + do l = 1, num_dims + call s_gradient_dir_fornberg(q_particles(alphaf_id)%sf, field_vars(dalphafx_id + l - 1)%sf, l) + call s_gradient_dir_fornberg(q_particles(alphaupx_id + l - 1)%sf, field_vars(dalphap_upx_id + l - 1)%sf, l) + end do + + !> Apply particle sources to the Eulerian RHS + $:GPU_PARALLEL_LOOP(private='[i, j, k, alpha_f, dalphapdt, udot_gradalpha]', collapse=3) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + if (q_particles(alphaf_id)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + alpha_f = q_particles(alphaf_id)%sf(i, j, k) + + dalphapdt = 0._wp + udot_gradalpha = 0._wp + do l = 1, num_dims + dalphapdt = dalphapdt + field_vars(dalphap_upx_id + l - 1)%sf(i, j, k) + udot_gradalpha = udot_gradalpha + q_prim_vf(momxb + l - 1)%sf(i, j, & + & k)*field_vars(dalphafx_id + l - 1)%sf(i, j, k) + end do + dalphapdt = -dalphapdt + ! Add any contribution to dalphapdt from particles growing or shrinking + + !> Step 1: Source terms for volume fraction corrections + ! cons_var/alpha_f * (dalpha_p/dt - u dot grad(alpha_f)) + do l = 1, E_idx + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + (q_cons_vf(l)%sf(i, j, & + & k)/alpha_f)*(dalphapdt - udot_gradalpha) + end do + + ! momentum term -1/alpha_f * (p*grad(alpha_f) - Tau^v dot grad(alpha_f)) !Viscous term not implemented + do l = 1, num_dims + rhs_vf(momxb + l - 1)%sf(i, j, k) = rhs_vf(momxb + l - 1)%sf(i, j, & + & k) - ((1._wp/alpha_f)*(q_prim_vf(E_idx)%sf(i, j, k)*field_vars(dalphafx_id + l - 1)%sf(i, j, & + & k))) + end do + + ! energy term -1/alpha_f * (p*u dot grad(alpha_f) - (Tau^v dot u) dot grad(alpha_f)) !Viscous term not + ! 
implemented + rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - ((1._wp/alpha_f)*(q_prim_vf(E_idx)%sf(i, j, & + & k)*udot_gradalpha)) + + !> Step 2: Add the drag/pressure/added mass forces to the fluid + rhs_vf(momxb)%sf(i, j, k) = rhs_vf(momxb)%sf(i, j, k) + q_particles(Smx_id)%sf(i, j, k)*(1._wp/alpha_f) + rhs_vf(momxb + 1)%sf(i, j, k) = rhs_vf(momxb + 1)%sf(i, j, k) + q_particles(Smy_id)%sf(i, j, & + & k)*(1._wp/alpha_f) + + ! Energy source + rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) + (q_particles(Smx_id)%sf(i, j, & + & k)*q_prim_vf(momxb)%sf(i, j, k) + q_particles(Smy_id)%sf(i, j, k)*q_prim_vf(momxb + 1)%sf(i, j, & + & k) + q_particles(SE_id)%sf(i, j, k))*(1._wp/alpha_f) + + if (num_dims == 3) then + rhs_vf(momxb + 2)%sf(i, j, k) = rhs_vf(momxb + 2)%sf(i, j, k) + q_particles(Smz_id)%sf(i, j, & + & k)*(1._wp/alpha_f) + ! Energy source + rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) + (q_particles(Smz_id)%sf(i, j, & + & k)*q_prim_vf(momxb + 2)%sf(i, j, k))*(1._wp/alpha_f) + end if + end if + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_compute_particles_EL_source + + !> Reset and rebuild the cell-based linked list for particle-to-cell mapping + subroutine s_reset_linked_list() + + integer :: j, k, l + + $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + particle_head(j, k, l) = -1 + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + linked_list(k) = -1 + end do + $:END_GPU_PARALLEL_LOOP() + + call s_build_linked_list() + + end subroutine s_reset_linked_list + + !> Zero all Eulerian field variables and particle projection arrays + subroutine s_reset_cell_vars() + + integer :: i, j, k, l + + $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4) + do i = 1, max(nField_vars, q_particles_idx) ! 
outermost is largest of the i-like dims + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + ! Zero field_vars if i <= nField_vars + if (i <= nField_vars) field_vars(i)%sf(j, k, l) = 0._wp + ! Zero q_particles if i <= q_particles_idx + if (i <= q_particles_idx) then + q_particles(i)%sf(j, k, l) = 0._wp + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_reset_cell_vars + + !> Finalize the particle volume fraction field after Gaussian smearing. Applies boundary conditions to the smeared field, then + !! converts accumulated particle volume alpha_p to fluid volume fraction alpha_f = 1 - alpha_p. + subroutine s_finalize_beta_field(bc_type, onlyBeta) + + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer :: j, k, l + logical, intent(in) :: onlyBeta + + call nvtxStartRange("PARTICLES-LAGRANGE-BETA-COMM") + if (onlyBeta) then + call s_populate_beta_buffers(q_particles, bc_type, 1) + else + call s_populate_beta_buffers(q_particles, bc_type, q_particles_idx) + end if + call nvtxEndRange + + ! Store 1-q_particles(1) + $:GPU_PARALLEL_LOOP(private='[j, k, l]', collapse=3) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + q_particles(alphaf_id)%sf(j, k, l) = 1._wp - q_particles(alphaf_id)%sf(j, k, l) + ! Limiting void fraction given max value + q_particles(alphaf_id)%sf(j, k, l) = max(q_particles(alphaf_id)%sf(j, k, l), 1._wp - lag_params%valmaxvoid) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_finalize_beta_field + + !> Build a cell-based linked list for particle-to-cell mapping. particle_head(i,j,k) points to the first particle in cell + !! (i,j,k). linked_list(k) points to the next particle in the same cell (-1 = end). 
+ subroutine s_build_linked_list() + + integer :: k, glb_id, i + integer, dimension(3) :: cell + real(wp), dimension(3) :: s_cell + logical :: celloutside + + $:GPU_PARALLEL_LOOP(private='[i, k, cell, s_cell, glb_id, celloutside]') + do k = 1, n_el_particles_loc + glb_id = lag_part_id(k, 1) + gid_to_local(glb_id) = k + + s_cell = particle_s(k,1:3,2) + cell = int(s_cell(:)) + do i = 1, num_dims + if (s_cell(i) < 0._wp) cell(i) = cell(i) - 1 + end do + + call s_check_celloutside_wbuff(cell, celloutside) + + if (.not. celloutside) then + !!!!! Particle linked list building + $:GPU_ATOMIC(atomic='capture') + linked_list(k) = particle_head(cell(1), cell(2), cell(3)) + particle_head(cell(1), cell(2), cell(3)) = k + $:END_GPU_ATOMIC_CAPTURE() + end if + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_build_linked_list + + !> This subroutine updates the Lagrange variables using the tvd RK time steppers. The time derivative of the particle variables + !! must be stored at every stage to avoid precision errors. + !! @param stage Current tvd RK stage + impure subroutine s_update_lagrange_particles_tdv_rk(q_prim_vf, bc_type, stage) + + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + type(integer_field), dimension(1:num_dims,1:2), intent(in) :: bc_type + integer, intent(in) :: stage + integer :: k + + if (time_stepper == 1) then ! 1st order TVD RK + + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + ! 
u{1} = u{n} + dt * RHS{n} + particle_rad(k, 1) = particle_rad(k, 1) + dt*particle_draddt(k, 1) + if (moving_lag_particles) then + particle_posPrev(k,1:3,1) = particle_pos(k,1:3,1) + particle_pos(k,1:3,1) = particle_pos(k,1:3,1) + dt*particle_dposdt(k,1:3,1) + particle_vel(k,1:3,1) = particle_vel(k,1:3,1) + dt*particle_dveldt(k,1:3,1) + end if + end do + $:END_GPU_PARALLEL_LOOP() + + call s_transfer_data_to_tmp_particles() + if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type) + if (lag_params%write_void_evol) call s_write_void_evol_particles(mytime) + if (lag_params%write_bubbles_stats) call s_calculate_lag_particle_stats() + if (lag_params%write_bubbles) then + ! $:GPU_UPDATE(host='[gas_p,gas_mv,particle_rad,intfc_vel]') + $:GPU_UPDATE(host='[particle_rad]') + call s_write_lag_particle_evol(mytime) + end if + else if (time_stepper == 2) then ! 2nd order TVD RK + if (stage == 1) then + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + ! u{1} = u{n} + dt * RHS{n} + particle_rad(k, 2) = particle_rad(k, 1) + dt*particle_draddt(k, 1) + if (moving_lag_particles) then + particle_posPrev(k,1:3,2) = particle_pos(k,1:3,1) + particle_pos(k,1:3,2) = particle_pos(k,1:3,1) + dt*particle_dposdt(k,1:3,1) + particle_vel(k,1:3,2) = particle_vel(k,1:3,1) + dt*particle_dveldt(k,1:3,1) + end if + end do + $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type) + else if (stage == 2) then + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + ! 
u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1}) + particle_rad(k, 1) = particle_rad(k, 1) + dt*(particle_draddt(k, 1) + particle_draddt(k, 2))/2._wp + if (moving_lag_particles) then + particle_posPrev(k,1:3,1) = particle_pos(k,1:3,2) + particle_pos(k,1:3,1) = particle_pos(k,1:3,1) + dt*(particle_dposdt(k,1:3,1) + particle_dposdt(k,1:3, & + & 2))/2._wp + particle_vel(k,1:3,1) = particle_vel(k,1:3,1) + dt*(particle_dveldt(k,1:3,1) + particle_dveldt(k,1:3, & + & 2))/2._wp + end if + end do + $:END_GPU_PARALLEL_LOOP() + + call s_transfer_data_to_tmp_particles() + if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type) + if (lag_params%write_void_evol) call s_write_void_evol_particles(mytime) + if (lag_params%write_bubbles_stats) call s_calculate_lag_particle_stats() + if (lag_params%write_bubbles) then + $:GPU_UPDATE(host='[particle_rad]') + call s_write_lag_particle_evol(mytime) + end if + end if + else if (time_stepper == 3) then ! 3rd order TVD RK + if (stage == 1) then + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + ! u{1} = u{n} + dt * RHS{n} + particle_rad(k, 2) = particle_rad(k, 1) + dt*particle_draddt(k, 1) + if (moving_lag_particles) then + particle_posPrev(k,1:3,2) = particle_pos(k,1:3,1) + particle_pos(k,1:3,2) = particle_pos(k,1:3,1) + dt*particle_dposdt(k,1:3,1) + particle_vel(k,1:3,2) = particle_vel(k,1:3,1) + dt*particle_dveldt(k,1:3,1) + end if + end do + $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type) + else if (stage == 2) then + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + ! 
u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}] + particle_rad(k, 2) = particle_rad(k, 1) + dt*(particle_draddt(k, 1) + particle_draddt(k, 2))/4._wp + if (moving_lag_particles) then + particle_posPrev(k,1:3,2) = particle_pos(k,1:3,2) + particle_pos(k,1:3,2) = particle_pos(k,1:3,1) + dt*(particle_dposdt(k,1:3,1) + particle_dposdt(k,1:3, & + & 2))/4._wp + particle_vel(k,1:3,2) = particle_vel(k,1:3,1) + dt*(particle_dveldt(k,1:3,1) + particle_dveldt(k,1:3, & + & 2))/4._wp + end if + end do + $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type) + else if (stage == 3) then + $:GPU_PARALLEL_LOOP(private='[k]') + do k = 1, n_el_particles_loc + ! u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}] + particle_rad(k, 1) = particle_rad(k, 1) + (2._wp/3._wp)*dt*(particle_draddt(k, 1)/4._wp + particle_draddt(k, & + & 2)/4._wp + particle_draddt(k, 3)) + if (moving_lag_particles) then + particle_posPrev(k,1:3,1) = particle_pos(k,1:3,2) + particle_pos(k,1:3,1) = particle_pos(k,1:3,1) + (2._wp/3._wp)*dt*(particle_dposdt(k,1:3, & + & 1)/4._wp + particle_dposdt(k,1:3,2)/4._wp + particle_dposdt(k,1:3,3)) + particle_vel(k,1:3,1) = particle_vel(k,1:3,1) + (2._wp/3._wp)*dt*(particle_dveldt(k,1:3, & + & 1)/4._wp + particle_dveldt(k,1:3,2)/4._wp + particle_dveldt(k,1:3,3)) + end if + end do + $:END_GPU_PARALLEL_LOOP() + + call s_transfer_data_to_tmp_particles() + if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type) + if (lag_params%write_void_evol) call s_write_void_evol_particles(mytime) + if (lag_params%write_bubbles_stats) call s_calculate_lag_particle_stats() + if (lag_params%write_bubbles .and. 
                    mytime >= next_write_time) then
                    $:GPU_UPDATE(host='[particle_mass, particle_rad]')
                    call s_write_lag_particle_evol(mytime)
                    next_write_time = next_write_time + t_save
                end if
            end if
        end if

    end subroutine s_update_lagrange_particles_tdv_rk

    !> Enforce boundary conditions on Lagrangian particles. Phases: (1) GPU->host transfer, (2) MPI particle exchange with
    !! neighbors, (3) host->GPU transfer, (4) per-particle BC (periodic wrap / reflect / remove), (5) compaction to remove deleted
    !! particles, (6) re-smear onto Eulerian grid.
    !! @param q_prim_vf Primitive variables; the liquid volume fraction (advxb) is read to drop particles that left the liquid
    !! @param nstage Current RK stage; on the final stage (nstage == lag_num_ts) wall-corrected positions are copied to slot 1
    !! @param bc_type Boundary-condition type table, forwarded to s_finalize_beta_field
    impure subroutine s_enforce_EL_particles_boundary_conditions(q_prim_vf, nstage, bc_type)

        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
        type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
        integer, intent(in) :: nstage
        real(wp) :: offset
        integer :: k, i, q
        integer :: patch_id, newBubs, new_idx
        integer, dimension(3) :: cell
        logical :: inc_ghost
        real(wp) :: myR, func_sum
        real(wp), dimension(3) :: myPos, myVel, myForce
        logical :: only_beta

        inc_ghost = .false.
        only_beta = .true.

        ! Phase (1): bring the full particle state to the host for the MPI exchange
        call nvtxStartRange("LAG-BC")
        call nvtxStartRange("LAG-BC-DEV2HOST")
        $:GPU_UPDATE(host='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, lag_part_id, &
            & particle_rad, particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, particle_dposdt, &
            & particle_dveldt, keep_bubble, n_el_particles_loc, wrap_bubble_dir, wrap_bubble_loc]')
        call nvtxEndRange

        ! Phase (2): handle MPI transfer of particles going to another processor's local domain
        if (num_procs > 1) then
            call nvtxStartRange("LAG-BC-TRANSFER-LIST")
            call s_add_particles_to_transfer_list(n_el_particles_loc, particle_pos(:, :, 2), particle_posPrev(:, :, 2), inc_ghost)
            call nvtxEndRange

            call nvtxStartRange("LAG-BC-SENDRECV")
            call s_mpi_sendrecv_solid_particles(p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, &
                & lag_part_id, particle_rad, particle_pos, particle_posPrev, particle_vel, &
                & particle_s, particle_draddt, particle_dposdt, particle_dveldt, lag_num_ts, &
                & n_el_particles_loc, 2)
            call nvtxEndRange
        end if

        ! Phase (3): push the (possibly re-distributed) particle state back to the device
        call nvtxStartRange("LAG-BC-HOST2DEV")
        $:GPU_UPDATE(device='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, lag_part_id, &
            & particle_rad, particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, particle_dposdt, &
            & particle_dveldt, n_el_particles_loc]')
        call nvtxEndRange

        ! Phase (4): per-particle boundary treatment; works on slot 2 (temporary state)
        $:GPU_PARALLEL_LOOP(private='[k, cell]',copyin='[nstage]')
        do k = 1, n_el_particles_loc
            keep_bubble(k) = 1
            wrap_bubble_loc(k, :) = 0
            wrap_bubble_dir(k, :) = 0

            ! Relocate particles at solid boundaries and delete particles that leave buffer regions.
            ! x-direction: wall-type BCs clamp to eps_overlap*radius off the face; periodic BCs
            ! flag the particle for wrapping; otherwise particles past the face are removed.
            if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. particle_pos(k, 1, &
                & 2) < x_cb(-1) + eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 1, 2) = x_cb(-1) + eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 1, 1) = particle_pos(k, 1, 2)
                end if
            else if (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. particle_pos(k, &
                & 1, 2) > x_cb(m) - eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 1, 2) = x_cb(m) - eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 1, 1) = particle_pos(k, 1, 2)
                end if
            else if (bc_x%beg == BC_PERIODIC .and. particle_pos(k, 1, 2) < pcomm_coords(1)%beg .and. particle_posPrev(k, 1, &
                & 2) >= pcomm_coords(1)%beg) then
                wrap_bubble_dir(k, 1) = 1
                wrap_bubble_loc(k, 1) = -1
            else if (bc_x%end == BC_PERIODIC .and. particle_pos(k, 1, 2) > pcomm_coords(1)%end .and. particle_posPrev(k, 1, &
                & 2) <= pcomm_coords(1)%end) then
                wrap_bubble_dir(k, 1) = 1
                wrap_bubble_loc(k, 1) = 1
            else if (particle_pos(k, 1, 2) >= x_cb(m)) then
                keep_bubble(k) = 0
            else if (particle_pos(k, 1, 2) < x_cb(-1)) then
                keep_bubble(k) = 0
            end if

            ! y-direction: same treatment as x
            if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. particle_pos(k, 2, &
                & 2) < y_cb(-1) + eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 2, 2) = y_cb(-1) + eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 2, 1) = particle_pos(k, 2, 2)
                end if
            else if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. particle_pos(k, &
                & 2, 2) > y_cb(n) - eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 2, 2) = y_cb(n) - eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 2, 1) = particle_pos(k, 2, 2)
                end if
            else if (bc_y%beg == BC_PERIODIC .and. particle_pos(k, 2, 2) < pcomm_coords(2)%beg .and. particle_posPrev(k, 2, &
                & 2) >= pcomm_coords(2)%beg) then
                wrap_bubble_dir(k, 2) = 1
                wrap_bubble_loc(k, 2) = -1
            else if (bc_y%end == BC_PERIODIC .and. particle_pos(k, 2, 2) > pcomm_coords(2)%end .and. particle_posPrev(k, 2, &
                & 2) <= pcomm_coords(2)%end) then
                wrap_bubble_dir(k, 2) = 1
                wrap_bubble_loc(k, 2) = 1
            else if (particle_pos(k, 2, 2) >= y_cb(n)) then
                keep_bubble(k) = 0
            else if (particle_pos(k, 2, 2) < y_cb(-1)) then
                keep_bubble(k) = 0
            end if

            ! z-direction: only in 3D
            if (p > 0) then
                if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. particle_pos(k, 3, &
                    & 2) < z_cb(-1) + eps_overlap*particle_rad(k, 2)) then
                    particle_pos(k, 3, 2) = z_cb(-1) + eps_overlap*particle_rad(k, 2)
                    if (nstage == lag_num_ts) then
                        particle_pos(k, 3, 1) = particle_pos(k, 3, 2)
                    end if
                else if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, &
                    & BC_NO_SLIP_WALL/)) .and. particle_pos(k, 3, 2) > z_cb(p) - eps_overlap*particle_rad(k, 2)) then
                    particle_pos(k, 3, 2) = z_cb(p) - eps_overlap*particle_rad(k, 2)
                    if (nstage == lag_num_ts) then
                        particle_pos(k, 3, 1) = particle_pos(k, 3, 2)
                    end if
                else if (bc_z%beg == BC_PERIODIC .and. particle_pos(k, 3, 2) < pcomm_coords(3)%beg .and. particle_posPrev(k, 3, &
                    & 2) >= pcomm_coords(3)%beg) then
                    wrap_bubble_dir(k, 3) = 1
                    wrap_bubble_loc(k, 3) = -1
                else if (bc_z%end == BC_PERIODIC .and. particle_pos(k, 3, 2) > pcomm_coords(3)%end .and. particle_posPrev(k, 3, &
                    & 2) <= pcomm_coords(3)%end) then
                    wrap_bubble_dir(k, 3) = 1
                    wrap_bubble_loc(k, 3) = 1
                else if (particle_pos(k, 3, 2) >= z_cb(p)) then
                    keep_bubble(k) = 0
                else if (particle_pos(k, 3, 2) < z_cb(-1)) then
                    keep_bubble(k) = 0
                end if
            end if

            if (keep_bubble(k) == 1) then
                ! Remove bubbles that are no longer in a liquid
                cell = fd_number - buff_size
                call s_locate_cell(particle_pos(k, 1:3, 2), cell, particle_s(k, 1:3, 2))

                if (q_prim_vf(advxb)%sf(cell(1), cell(2), cell(3)) < (1._wp - lag_params%valmaxvoid)) then
                    keep_bubble(k) = 0
                end if
            end if
        end do
        $:END_GPU_PARALLEL_LOOP()

        ! Phase (5): compaction and same-rank periodic wrapping are done on the host
        if (n_el_particles_loc > 0) then
            call nvtxStartRange("LAG-BC")
            call nvtxStartRange("LAG-BC-DEV2HOST")
            $:GPU_UPDATE(host='[particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, lag_part_id, particle_rad, &
                & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, particle_dposdt, &
                & particle_dveldt, keep_bubble, n_el_particles_loc, wrap_bubble_dir, wrap_bubble_loc]')
            call nvtxEndRange

            ! Stable compaction: surviving particles are moved down over deleted slots
            newBubs = 0
            do k = 1, n_el_particles_loc
                if (keep_bubble(k) == 1) then
                    newBubs = newBubs + 1
                    if (newBubs /= k) then
                        call s_copy_lag_particle(newBubs, k)
                        wrap_bubble_dir(newBubs, :) = wrap_bubble_dir(k, :)
                        wrap_bubble_loc(newBubs, :) = wrap_bubble_loc(k, :)
                    end if
                end if
            end do

            n_el_particles_loc = newBubs

            ! Handle periodic wrapping of bubbles on same processor
            ! NOTE(review): wrapped duplicates are written to indices beyond n_el_particles_loc,
            ! but the count itself is not increased here — confirm these are ghost images that
            ! are accounted for elsewhere.
            newBubs = 0
            do k = 1, n_el_particles_loc
                if (any(wrap_bubble_dir(k, :) == 1)) then
                    newBubs = newBubs + 1
                    new_idx = n_el_particles_loc + newBubs
                    call s_copy_lag_particle(new_idx, k)
                    do i = 1, num_dims
                        if (wrap_bubble_dir(k, i) == 1) then
                            ! Shift by the global domain extent in the wrapped direction
                            offset = glb_bounds(i)%end - glb_bounds(i)%beg
                            if (wrap_bubble_loc(k, i) == 1) then
                                do q = 1, 2
                                    particle_pos(new_idx, i, q) = particle_pos(new_idx, i, q) - offset
                                    particle_posPrev(new_idx, i, q) = particle_posPrev(new_idx, i, q) - offset
                                end do
                            else if (wrap_bubble_loc(k, i) == -1) then
                                do q = 1, 2
                                    particle_pos(new_idx, i, q) = particle_pos(new_idx, i, q) + offset
                                    particle_posPrev(new_idx, i, q) = particle_posPrev(new_idx, i, q) + offset
                                end do
                            end if
                        end if
                    end do
                end if
            end do
            call nvtxStartRange("LAG-BC-HOST2DEV")
            $:GPU_UPDATE(device='[particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, lag_part_id, particle_rad, &
                & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, particle_dposdt, &
                & particle_dveldt, n_el_particles_loc]')
            call nvtxEndRange
        end if

        ! Phase (6): re-smear the surviving particles onto the Eulerian grid
        call s_reset_cell_vars()

        $:GPU_PARALLEL_LOOP(private='[cell, myR, myPos, myVel, myForce, func_sum]',copyin='[only_beta]')
        do k = 1, n_el_particles_loc
            myR = particle_rad(k, 2)
            myPos = particle_pos(k, 1:3, 2)
            myVel = particle_vel(k, 1:3, 2)
            myForce = f_p(k, :)

            cell = fd_number - buff_size
            call s_locate_cell(particle_pos(k, 1:3, 2), cell, particle_s(k, 1:3, 2))

            ! Compute the total gaussian contribution for each particle for normalization
            call s_compute_gaussian_contribution(myR, myPos, cell, func_sum)
            gSum(k) = func_sum

            call s_gaussian_atomic(myR, myVel, myPos, myForce, func_sum, cell, q_particles, only_beta)
        end do
        $:END_GPU_PARALLEL_LOOP()

        ! Update void fraction and communicate buffers
        call s_finalize_beta_field(bc_type, only_beta)

        call nvtxEndRange ! LAG-BC

    end subroutine s_enforce_EL_particles_boundary_conditions

    !> This subroutine returns the computational coordinate of the cell for the given position.
    !! The search walks cell faces from the supplied starting guess, so `cell` on entry is a hint.
    !! @param pos Input coordinates
    !! @param cell Computational coordinate of the cell (in: search start, out: containing cell)
    !! @param scoord Calculated particle coordinates (continuous computational-space coordinate)
    subroutine s_locate_cell(pos, cell, scoord)

        $:GPU_ROUTINE(function_name='s_locate_cell',parallelism='[seq]', cray_inline=True)

        real(wp), dimension(3), intent(in) :: pos
        real(wp), dimension(3), intent(out) :: scoord
        integer, dimension(3), intent(inout) :: cell
        integer :: i

        ! Walk faces in x until x_cb(cell(1)-1) <= pos(1) < x_cb(cell(1)), clipped to the buffer
        do while (pos(1) < x_cb(cell(1) - 1) .and. cell(1) > -buff_size)
            cell(1) = cell(1) - 1
        end do

        do while (pos(1) >= x_cb(cell(1)) .and. cell(1) < m + buff_size)
            cell(1) = cell(1) + 1
        end do

        do while (pos(2) < y_cb(cell(2) - 1) .and. cell(2) > -buff_size)
            cell(2) = cell(2) - 1
        end do

        do while (pos(2) >= y_cb(cell(2)) .and. cell(2) < n + buff_size)
            cell(2) = cell(2) + 1
        end do

        if (p > 0) then
            do while (pos(3) < z_cb(cell(3) - 1) .and. cell(3) > -buff_size)
                cell(3) = cell(3) - 1
            end do
            do while (pos(3) >= z_cb(cell(3)) .and. cell(3) < p + buff_size)
                cell(3) = cell(3) + 1
            end do
        end if

        ! The numbering of the cell of which left boundary is the domain boundary is 0. If the
        ! computational coordinate of the pos is s, the real coordinate of s is (the coordinate of
        ! the left boundary of the Floor(s)-th cell) + (s - int(s))*(cell-width). In other words,
        ! the coordinate of the center of the cell is x_cc(cell).

        ! coordinates in computational space
        scoord(1) = cell(1) + (pos(1) - x_cb(cell(1) - 1))/dx(cell(1))
        scoord(2) = cell(2) + (pos(2) - y_cb(cell(2) - 1))/dy(cell(2))
        scoord(3) = 0._wp
        if (p > 0) scoord(3) = cell(3) + (pos(3) - z_cb(cell(3) - 1))/dz(cell(3))
        ! int() truncates toward zero, so shift down for negative scoord to get floor()
        cell(:) = int(scoord(:))
        do i = 1, num_dims
            if (scoord(i) < 0._wp) cell(i) = cell(i) - 1
        end do

    end subroutine s_locate_cell

    !> This subroutine transfers the stage-1 particle state into the temporary (slot-2) variables.
    impure subroutine s_transfer_data_to_tmp_particles()

        integer :: k

        $:GPU_PARALLEL_LOOP(private='[k]')
        do k = 1, n_el_particles_loc
            particle_rad(k, 2) = particle_rad(k, 1)
            particle_pos(k, 1:3, 2) = particle_pos(k, 1:3, 1)
            particle_posPrev(k, 1:3, 2) = particle_posPrev(k, 1:3, 1)
            particle_vel(k, 1:3, 2) = particle_vel(k, 1:3, 1)
            particle_s(k, 1:3, 2) = particle_s(k, 1:3, 1)
        end do
        $:END_GPU_PARALLEL_LOOP()

    end subroutine s_transfer_data_to_tmp_particles

    !> The purpose of this procedure is to determine if the global coordinates of the bubbles are present in the current MPI
    !! processor (including ghost cells).
    !! @param pos_part Spatial coordinates of the bubble
    function particle_in_domain(pos_part)

        logical :: particle_in_domain
        real(wp), dimension(3), intent(in) :: pos_part

        ! 2D (planar): a virtual z-axis of width lag_params%charwidth stands in for the third dimension
        if (p == 0 .and. cyl_coord .neqv. .true.) then
            ! Defining a virtual z-axis that has the same dimensions as y-axis defined in the input file
            particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. (pos_part(1) >= x_cb(fd_number &
                & - buff_size - 1)) .and. (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. (pos_part(2) &
                & >= y_cb(fd_number - buff_size - 1)) .and. (pos_part(3) < lag_params%charwidth/2._wp) &
                & .and. (pos_part(3) > -lag_params%charwidth/2._wp))
        else
            ! cyl_coord: radial coordinate is compared by magnitude
            particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. (pos_part(1) >= x_cb(fd_number &
                & - buff_size - 1)) .and. (abs(pos_part(2)) < y_cb(n + buff_size - fd_number)) &
                & .and. (abs(pos_part(2)) >= max(y_cb(fd_number - buff_size - 1), 0._wp)))
        end if

        ! 3D: overrides the 2D result above
        if (p > 0) then
            particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. (pos_part(1) >= x_cb(fd_number &
                & - buff_size - 1)) .and. (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. (pos_part(2) &
                & >= y_cb(fd_number - buff_size - 1)) .and. (pos_part(3) < z_cb(p + buff_size - fd_number)) &
                & .and. (pos_part(3) >= z_cb(fd_number - buff_size - 1)))
        end if

        ! For symmetric and wall boundary condition: clip to the physical faces
        if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
            particle_in_domain = (particle_in_domain .and. (pos_part(1) >= x_cb(-1)))
        end if
        if (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
            particle_in_domain = (particle_in_domain .and. (pos_part(1) < x_cb(m)))
        end if
        if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. (.not. cyl_coord)) then
            particle_in_domain = (particle_in_domain .and. (pos_part(2) >= y_cb(-1)))
        end if
        if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. (.not. cyl_coord)) then
            particle_in_domain = (particle_in_domain .and. (pos_part(2) < y_cb(n)))
        end if
        if (p > 0) then
            if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
                particle_in_domain = (particle_in_domain .and. (pos_part(3) >= z_cb(-1)))
            end if
            if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
                particle_in_domain = (particle_in_domain .and. (pos_part(3) < z_cb(p)))
            end if
        end if

    end function particle_in_domain

    !> Determine if a particle is located within the physical domain (excluding ghost cells)
    !! @param pos_part Spatial coordinates of the particle
    function particle_in_domain_physical(pos_part)

        $:GPU_ROUTINE(parallelism='[seq]')

        logical :: particle_in_domain_physical
        real(wp), dimension(3), intent(in) :: pos_part

        ! In-bounds test against the physical cell faces: x_cb(-1)..x_cb(m), y_cb(-1)..y_cb(n)
        particle_in_domain_physical = ((pos_part(1) < x_cb(m)) .and. (pos_part(1) >= x_cb(-1)) .and. (pos_part(2) < y_cb(n)) &
            & .and. (pos_part(2) >= y_cb(-1)))

        ! 3D adds the z faces z_cb(-1)..z_cb(p)
        if (p > 0) then
            particle_in_domain_physical = (particle_in_domain_physical .and. (pos_part(3) < z_cb(p)) .and. (pos_part(3) &
                & >= z_cb(-1)))
        end if

    end function particle_in_domain_physical

    !> The purpose of this procedure is to calculate the gradient from reconstructed states along the x, y and z
    !! directions as (right state - left state)/cell width.
    !! @param vL_field left edge reconstructed values
    !! @param vR_field right edge reconstructed values
    !! @param dq Output gradient of q (note: index order of dq follows dir — see below)
    !! @param dir Gradient spatial direction (1=x, 2=y, 3=z)
    !! @param field_var variable index for reconstructed states
    subroutine s_gradient_field(vL_field, vR_field, dq, dir, field_var)

        real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(out) :: dq
        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(in) :: vL_field
        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(in) :: vR_field
        integer, intent(in) :: dir, field_var
        integer :: i, j, k
        real(wp) :: mydx

        if (dir == 1) then
            $:GPU_PARALLEL_LOOP(private='[i, j, k, mydx]', collapse=3,copyin='[dir, field_var]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(1)%beg, idwbuff(1)%end
                        mydx = dx(i)
                        dq(i, j, k) = (vR_field(i, j, k, field_var) - vL_field(i, j, k, field_var))/mydx
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        else if (dir == 2) then
            ! Here i sweeps the y-extent; dq is written as dq(j, i, k), i.e. in (x, y, z) order
            $:GPU_PARALLEL_LOOP(private='[i, j, k, mydx]', collapse=3,copyin='[dir, field_var]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(1)%beg, idwbuff(1)%end
                    do i = idwbuff(2)%beg, idwbuff(2)%end
                        mydx = dy(i)
                        dq(j, i, k) = (vR_field(i, j, k, field_var) - vL_field(i, j, k, field_var))/mydx
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        else if (dir == 3) then
            ! Here i sweeps the z-extent; dq is written as dq(k, j, i), i.e. in (x, y, z) order
            $:GPU_PARALLEL_LOOP(private='[i, j, k, mydx]', collapse=3,copyin='[dir, field_var]')
            do k = idwbuff(1)%beg, idwbuff(1)%end
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(3)%beg, idwbuff(3)%end
                        mydx = dz(i)
                        dq(k, j, i) = (vR_field(i, j, k, field_var) - vL_field(i, j, k, field_var))/mydx
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        end if

    end subroutine s_gradient_field

    !> The purpose of this procedure is to calculate the gradient of a scalar field along the x, y and z
    !! directions using Fornberg's method (precomputed weights, stencil of nWeights_grad points).
    !! The two outermost buffer layers in the sweep direction are not filled.
    !! @param q Input scalar field
    !! @param dq Output gradient of q
    !! @param dir Gradient spatial direction (1=x, 2=y, 3=z)
    subroutine s_gradient_dir_fornberg(q, dq, dir)

        real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(in) :: q
        real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(out) :: dq
        integer, intent(in) :: dir
        integer :: i, j, k, a, npts, s_idx

        npts = (nWeights_grad - 1)/2

        if (dir == 1) then
            $:GPU_PARALLEL_LOOP(private='[i, j, k, s_idx, a]', collapse=3,copyin='[npts]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(1)%beg + 2, idwbuff(1)%end - 2
                        dq(i, j, k) = 0._wp
                        do a = -npts, npts
                            s_idx = a + npts + 1
                            dq(i, j, k) = dq(i, j, k) + weights_x_grad(s_idx)%sf(i, 1, 1)*q(i + a, j, k)
                        end do
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        else if (dir == 2) then
            $:GPU_PARALLEL_LOOP(private='[i, j, k, s_idx, a]', collapse=3,copyin='[npts]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(2)%beg + 2, idwbuff(2)%end - 2
                    do i = idwbuff(1)%beg, idwbuff(1)%end
                        dq(i, j, k) = 0._wp
                        do a = -npts, npts
                            s_idx = a + npts + 1
                            dq(i, j, k) = dq(i, j, k) + weights_y_grad(s_idx)%sf(j, 1, 1)*q(i, j + a, k)
                        end do
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        else if (dir == 3) then
            $:GPU_PARALLEL_LOOP(private='[i, j, k, s_idx, a]', collapse=3,copyin='[npts]')
            do k = idwbuff(3)%beg + 2, idwbuff(3)%end - 2
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(1)%beg, idwbuff(1)%end
                        dq(i, j, k) = 0._wp
                        do a = -npts, npts
                            s_idx = a + npts + 1
                            dq(i, j, k) = dq(i, j, k) + weights_z_grad(s_idx)%sf(k, 1, 1)*q(i, j, k + a)
                        end do
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        end if

    end subroutine s_gradient_dir_fornberg

    !> The purpose of this procedure is to compute the Fornberg finite difference weights for derivatives (only done once at start
    !! time)
    !! @param npts Stencil half-width; the full stencil has nWeights_grad = 2*npts + 1 points
    impure subroutine s_compute_fornberg_fd_weights(npts)

        integer, intent(in) :: npts
        integer :: i, j, k, a, m_order
        integer :: s_idx
        real(wp) :: x0, y0, z0
        real(wp) :: x_stencil(nWeights_grad)
        real(wp) :: c(nWeights_grad, 0:1)

        m_order = 1 ! first derivative

        $:GPU_PARALLEL_LOOP(private='[i, a, x_stencil, c, s_idx, x0]', copyin='[npts, m_order]')
        do i = idwbuff(1)%beg + npts, idwbuff(1)%end - npts
            do a = -npts, npts
                s_idx = a + npts + 1
                x_stencil(s_idx) = x_cc(i + a)
            end do
            x0 = x_cc(i)

            call s_fornberg_weights(x0, x_stencil, nWeights_grad, m_order, c)

            do a = -npts, npts
                s_idx = a + npts + 1
                weights_x_grad(s_idx)%sf(i, 1, 1) = c(s_idx, 1)
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

        $:GPU_PARALLEL_LOOP(private='[j, a, x_stencil, c, s_idx, y0]', copyin='[npts, m_order]')
        do j = idwbuff(2)%beg + npts, idwbuff(2)%end - npts
            do a = -npts, npts
                s_idx = a + npts + 1
                x_stencil(s_idx) = y_cc(j + a)
            end do
            y0 = y_cc(j)

            call s_fornberg_weights(y0, x_stencil, nWeights_grad, m_order, c)

            do a = -npts, npts
                s_idx = a + npts + 1
                weights_y_grad(s_idx)%sf(j, 1, 1) = c(s_idx, 1)
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

        if (num_dims == 3) then
            $:GPU_PARALLEL_LOOP(private='[k, a, x_stencil, c, s_idx, z0]', copyin='[npts, m_order]')
copyin='[npts, m_order]') + do k = idwbuff(3)%beg + npts, idwbuff(3)%end - npts + do a = -npts, npts + s_idx = a + npts + 1 + x_stencil(s_idx) = z_cc(k + a) + end do + z0 = z_cc(k) + + call s_fornberg_weights(z0, x_stencil, nWeights_grad, m_order, c) + + do a = -npts, npts + s_idx = a + npts + 1 + weights_z_grad(s_idx)%sf(k, 1, 1) = c(s_idx, 1) + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + end subroutine s_compute_fornberg_fd_weights + + !> The purpose of this procedure is to compute the Fornberg finite difference weights on a local stencil + subroutine s_fornberg_weights(x0, stencil, npts, m_order, coeffs) + + $:GPU_ROUTINE(parallelism='[seq]') + + integer, intent(in) :: npts ! number of stencil points + integer, intent(in) :: m_order ! highest derivative order + real(wp), intent(in) :: x0 ! evaluation point + real(wp), intent(in) :: stencil(npts) ! stencil coordinates + real(wp), intent(out) :: coeffs(npts,0:m_order) + integer :: i, j, k, mn + real(wp) :: c1, c2, c3, c4, c5 + + coeffs = 0.0_wp + c1 = 1.0_wp + c4 = stencil(1) - x0 + coeffs(1, 0) = 1.0_wp + + do i = 2, npts + mn = min(i - 1, m_order) + c2 = 1.0_wp + c5 = c4 + c4 = stencil(i) - x0 + + do j = 1, i - 1 + c3 = stencil(i) - stencil(j) + c2 = c2*c3 + + if (j == i - 1) then + do k = mn, 1, -1 + coeffs(i, k) = c1*(k*coeffs(i - 1, k - 1) - c5*coeffs(i - 1, k))/c2 + end do + coeffs(i, 0) = -c1*c5*coeffs(i - 1, 0)/c2 + end if + + do k = mn, 1, -1 + coeffs(j, k) = (c4*coeffs(j, k) - k*coeffs(j, k - 1))/c3 + end do + coeffs(j, 0) = c4*coeffs(j, 0)/c3 + end do + + c1 = c2 + end do + + end subroutine s_fornberg_weights + + !> The purpose of this procedure is to compute the barycentric weights for interpolation (only done once at start time) + impure subroutine s_compute_barycentric_weights(npts) + + integer, intent(in) :: npts + integer :: i, j, k, l, a, b + real(wp) :: prod_x, prod_y, prod_z, dx_loc, dy_loc, dz_loc + + $:GPU_PARALLEL_LOOP(private='[i, a, b, prod_x, dx_loc]', copyin = '[npts]') + do 
i = idwbuff(1)%beg + npts, idwbuff(1)%end - npts + do a = -npts, npts + prod_x = 1._wp + do b = -npts, npts + if (a /= b) then + dx_loc = x_cc(i + a) - x_cc(i + b) + prod_x = prod_x*dx_loc + end if + end do + weights_x_interp(a + npts + 1)%sf(i, 1, 1) = 1._wp/prod_x + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(private='[j, a, b, prod_y, dy_loc]', copyin = '[npts]') + do j = idwbuff(2)%beg + npts, idwbuff(2)%end - npts + do a = -npts, npts + prod_y = 1._wp + do b = -npts, npts + if (a /= b) then + dy_loc = y_cc(j + a) - y_cc(j + b) + prod_y = prod_y*dy_loc + end if + end do + weights_y_interp(a + npts + 1)%sf(j, 1, 1) = 1._wp/prod_y + end do + end do + $:END_GPU_PARALLEL_LOOP() + + if (num_dims == 3) then + $:GPU_PARALLEL_LOOP(private='[k, a, b, prod_z, dz_loc]', copyin = '[npts]') + do k = idwbuff(3)%beg + npts, idwbuff(3)%end - npts + do a = -npts, npts + prod_z = 1._wp + do b = -npts, npts + if (a /= b) then + dz_loc = z_cc(k + a) - z_cc(k + b) + prod_z = prod_z*dz_loc + end if + end do + weights_z_interp(a + npts + 1)%sf(k, 1, 1) = 1._wp/prod_z + end do + end do + end if + + end subroutine s_compute_barycentric_weights + + impure subroutine s_open_lag_bubble_evol + + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + character(LEN=25) :: FMT + + write (file_loc, '(A,I0,A)') 'lag_bubble_evol_', proc_rank, '.dat' + file_loc = trim(case_dir) // '/D/' // trim(file_loc) + call my_inquire(trim(file_loc), file_exist) + + if (precision == 1) then + FMT = "(A16,A14,8A16)" + else + FMT = "(A24,A14,8A24)" + end if + + if (.not. 
            open (LAG_EVOL_ID, FILE=trim(file_loc), form='formatted', position='rewind')
            ! 12 header labels: time, id, position(3), velocity(3), force(3), radius
            write (LAG_EVOL_ID, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz', 'Fp_x', 'Fp_y', 'Fp_z', 'radius'
        else
            open (LAG_EVOL_ID, FILE=trim(file_loc), form='formatted', position='append')
        end if

    end subroutine s_open_lag_bubble_evol

    !> Write particle evolution data at each output time step. One row per local particle:
    !! time, id, position(3), velocity(3), force(3), radius — all taken from stage slot 1.
    !! @param qtime Current time
    impure subroutine s_write_lag_particle_evol(qtime)

        real(wp), intent(in) :: qtime
        integer :: k, ios
        character(LEN=25) :: FMT
        character(LEN=path_len + 2*name_len) :: file_loc, path
        logical :: file_exist

        ! 12 fields per row: one leading real (time), one integer (id), then 10 reals
        if (precision == 1) then
            FMT = "(F16.8,I14,10F16.8)"
        else
            FMT = "(F24.16,I14,10F24.16)"
        end if

        ! Cycle through list
        do k = 1, n_el_particles_loc
            write (LAG_EVOL_ID, FMT) qtime, lag_part_id(k, 1), particle_pos(k, 1, 1), particle_pos(k, 2, 1), particle_pos(k, 3, &
                & 1), particle_vel(k, 1, 1), particle_vel(k, 2, 1), particle_vel(k, 3, 1), f_p(k, 1), f_p(k, 2), f_p(k, 3), &
                & particle_rad(k, 1)
        end do

    end subroutine s_write_lag_particle_evol

    !> Close the per-rank particle evolution file.
    impure subroutine s_close_lag_particle_evol

        close (LAG_EVOL_ID)

    end subroutine s_close_lag_particle_evol

    !> Open (or append to) the global void-fraction history file; rank 0 only.
    subroutine s_open_void_evol

        character(LEN=path_len + 2*name_len) :: file_loc
        logical :: file_exist

        if (proc_rank == 0) then
            write (file_loc, '(A)') 'voidfraction.dat'
            file_loc = trim(case_dir) // '/D/' // trim(file_loc)
            call my_inquire(trim(file_loc), file_exist)
            if (.not. file_exist) then
                open (LAG_VOID_ID, FILE=trim(file_loc), form='formatted', position='rewind')
                ! write (12, *) 'currentTime, averageVoidFraction, ', &
                !     'maximumVoidFraction, totalParticlesVolume'
                ! write (12, *) 'The averageVoidFraction value does ', &
                !     'not reflect the real void fraction in the cloud since the ', &
                !     'cells which do not have bubbles are not accounted'
which do not have bubbles are not accounted' + else + open (LAG_VOID_ID, FILE=trim(file_loc), form='formatted', position='append') + end if + end if + + end subroutine s_open_void_evol + + !> Subroutine that writes some useful statistics related to the volume fraction of the particles (void fraction) in the + !! computatioational domain on each time step. + !! @param q_time Current time + impure subroutine s_write_void_evol_particles(qtime) + + real(wp), intent(in) :: qtime + real(wp) :: volcell, voltot + real(wp) :: lag_void_max, lag_void_avg, lag_vol + real(wp) :: void_max_glb, void_avg_glb, vol_glb + integer :: i, j, k + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + + lag_void_max = 0._wp + lag_void_avg = 0._wp + lag_vol = 0._wp + $:GPU_PARALLEL_LOOP(private='[volcell]', collapse=3, reduction='[[lag_vol, lag_void_avg], [lag_void_max]]', & + & reductionOp='[+, MAX]', copy='[lag_vol, lag_void_avg, lag_void_max]') + do k = 0, p + do j = 0, n + do i = 0, m + lag_void_max = max(lag_void_max, 1._wp - q_particles(alphaf_id)%sf(i, j, k)) + call s_get_char_vol(i, j, k, volcell) + if ((1._wp - q_particles(alphaf_id)%sf(i, j, k)) > 5.0e-11_wp) then + lag_void_avg = lag_void_avg + (1._wp - q_particles(alphaf_id)%sf(i, j, k))*volcell + lag_vol = lag_vol + volcell + end if + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + +#ifdef MFC_MPI + if (num_procs > 1) then + call s_mpi_allreduce_max(lag_void_max, void_max_glb) + lag_void_max = void_max_glb + call s_mpi_allreduce_sum(lag_vol, vol_glb) + lag_vol = vol_glb + call s_mpi_allreduce_sum(lag_void_avg, void_avg_glb) + lag_void_avg = void_avg_glb + end if +#endif + voltot = lag_void_avg + ! This voidavg value does not reflect the real void fraction in the cloud since the cell which does not have bubbles are not + ! 
accounted + if (lag_vol > 0._wp) lag_void_avg = lag_void_avg/lag_vol + + if (proc_rank == 0) then + write (LAG_VOID_ID, '(6X,4e24.8)') qtime, lag_void_avg, lag_void_max, voltot + end if + + end subroutine s_write_void_evol_particles + + subroutine s_close_void_evol + + if (proc_rank == 0) close (LAG_VOID_ID) + + end subroutine s_close_void_evol + + !> Subroutine that writes the restarting files for the particles in the lagrangian solver. + !! @param t_step Current time step + impure subroutine s_write_restart_lag_particles(t_step) + + ! Generic string used to store the address of a particular file + integer, intent(in) :: t_step + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + integer :: part_id, tot_part + integer :: i, k + +#ifdef MFC_MPI + ! For Parallel I/O + integer :: ifile, ierr + integer, dimension(MPI_STATUS_SIZE) :: status + integer(KIND=MPI_OFFSET_KIND) :: disp + integer :: view + integer, dimension(2) :: gsizes, lsizes, start_idx_part + integer, dimension(num_procs) :: part_order, part_ord_mpi + integer, dimension(num_procs) :: proc_particle_counts + real(wp), dimension(1:1,1:lag_io_vars) :: dummy + dummy = 0._wp + + part_id = 0 + if (n_el_particles_loc /= 0) then + do k = 1, n_el_particles_loc + if (particle_in_domain_physical(particle_pos(k,1:3,1))) then + part_id = part_id + 1 + end if + end do + end if + + if (.not. parallel_io) return + + lsizes(1) = part_id + lsizes(2) = lag_io_vars + + ! Total number of particles + call MPI_ALLREDUCE(part_id, tot_part, 1, MPI_integer, MPI_SUM, MPI_COMM_WORLD, ierr) + + call MPI_ALLGATHER(part_id, 1, MPI_INTEGER, proc_particle_counts, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + + ! 
Calculate starting index for this processor's particles + call MPI_EXSCAN(lsizes(1), start_idx_part(1), 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) + if (proc_rank == 0) start_idx_part(1) = 0 + start_idx_part(2) = 0 + + gsizes(1) = tot_part + gsizes(2) = lag_io_vars + + write (file_loc, '(A,I0,A)') 'lag_bubbles_', t_step, '.dat' + file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // trim(file_loc) + + ! Clean up existing file + if (proc_rank == 0) then + inquire (FILE=trim(file_loc), EXIST=file_exist) + if (file_exist) then + call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr) + end if + end if + + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + + if (proc_rank == 0) then + call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) + + ! Write header using MPI I/O for consistency + call MPI_FILE_WRITE(ifile, tot_part, 1, MPI_INTEGER, status, ierr) + call MPI_FILE_WRITE(ifile, mytime, 1, mpi_p, status, ierr) + call MPI_FILE_WRITE(ifile, dt, 1, mpi_p, status, ierr) + call MPI_FILE_WRITE(ifile, num_procs, 1, MPI_INTEGER, status, ierr) + call MPI_FILE_WRITE(ifile, proc_particle_counts, num_procs, MPI_INTEGER, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + end if + + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + + if (part_id > 0) then + allocate (MPI_IO_DATA_lag_bubbles(max(1, part_id),1:lag_io_vars)) + + i = 1 + do k = 1, n_el_particles_loc + if (particle_in_domain_physical(particle_pos(k,1:3,1))) then + MPI_IO_DATA_lag_bubbles(i, 1) = real(lag_part_id(k, 1)) + MPI_IO_DATA_lag_bubbles(i,2:4) = particle_pos(k,1:3,1) + MPI_IO_DATA_lag_bubbles(i,5:7) = particle_posPrev(k,1:3,1) + MPI_IO_DATA_lag_bubbles(i,8:10) = particle_vel(k,1:3,1) + MPI_IO_DATA_lag_bubbles(i, 11) = particle_rad(k, 1) + MPI_IO_DATA_lag_bubbles(i, 13) = particle_R0(k) + MPI_IO_DATA_lag_bubbles(i, 14) = Rmax_stats_part(k) + MPI_IO_DATA_lag_bubbles(i, 15) = Rmin_stats_part(k) + MPI_IO_DATA_lag_bubbles(i, 19) = particle_mass(k) + i = i + 1 + end 
if + end do + + call MPI_TYPE_CREATE_SUBARRAY(2, gsizes, lsizes, start_idx_part, MPI_ORDER_FORTRAN, mpi_p, view, ierr) + call MPI_TYPE_COMMIT(view, ierr) + + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) + + ! Skip header (written by rank 0) + disp = int(sizeof(tot_part) + 2*sizeof(mytime) + sizeof(num_procs) + num_procs*sizeof(proc_particle_counts(1)), & + & MPI_OFFSET_KIND) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr) + + call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA_lag_bubbles, lag_io_vars*part_id, mpi_p, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + + deallocate (MPI_IO_DATA_lag_bubbles) + else + call MPI_TYPE_CONTIGUOUS(0, mpi_p, view, ierr) + call MPI_TYPE_COMMIT(view, ierr) + + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) + + ! Skip header (written by rank 0) + disp = int(sizeof(tot_part) + 2*sizeof(mytime) + sizeof(num_procs) + num_procs*sizeof(proc_particle_counts(1)), & + & MPI_OFFSET_KIND) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr) + + call MPI_FILE_WRITE_ALL(ifile, dummy, 0, mpi_p, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + end if +#endif + + end subroutine s_write_restart_lag_particles + + !> Calculate global maximum and minimum R/R0 ratio across all particles. 
subroutine s_calculate_lag_particle_stats()

    !! Update running radius-ratio (R/R0) statistics for every local particle:
    !! the rank-wide extrema Rmax_glb/Rmin_glb (GPU max/min reduction) and the
    !! per-particle lifetime extrema Rmax_stats_part/Rmin_stats_part.

    integer :: k

    $:GPU_PARALLEL_LOOP(private='[k]', reduction='[[Rmax_glb], [Rmin_glb]]', reductionOp='[MAX, MIN]', &
        & copy='[Rmax_glb, Rmin_glb]')
    do k = 1, n_el_particles_loc
        Rmax_glb = max(Rmax_glb, particle_rad(k, 1)/particle_R0(k))
        Rmin_glb = min(Rmin_glb, particle_rad(k, 1)/particle_R0(k))
        ! Per-particle extrema are independent element-wise writes; only the
        ! two globals participate in the reduction.
        Rmax_stats_part(k) = max(Rmax_stats_part(k), particle_rad(k, 1)/particle_R0(k))
        Rmin_stats_part(k) = min(Rmin_stats_part(k), particle_rad(k, 1)/particle_R0(k))
    end do
    $:END_GPU_PARALLEL_LOOP()

end subroutine s_calculate_lag_particle_stats

!> Open (or reopen in append mode) this rank's bubble-statistics file
!! D/stats_lag_bubbles_<proc_rank>.dat, writing the column header only when
!! the file does not already exist.
impure subroutine s_open_lag_particle_stats()

    character(LEN=path_len + 2*name_len) :: file_loc
    character(LEN=20) :: FMT
    logical :: file_exist

    write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat'
    file_loc = trim(case_dir) // '/D/' // trim(file_loc)
    call my_inquire(trim(file_loc), file_exist)

    ! NOTE(review): FMT is assigned here but not used in this routine;
    ! s_write_lag_particle_stats builds its own format string. Confirm whether
    ! this assignment can be removed.
    if (precision == 1) then
        FMT = "(A10,A14,5A16)"
    else
        FMT = "(A10,A14,5A24)"
    end if

    if (.not. file_exist) then
        open (LAG_STATS_ID, FILE=trim(file_loc), form='formatted', position='rewind')
        write (LAG_STATS_ID, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb'
    else
        open (LAG_STATS_ID, FILE=trim(file_loc), form='formatted', position='append')
    end if

end subroutine s_open_lag_particle_stats

!> Write particle radius statistics to file.
+ impure subroutine s_write_lag_particle_stats() + + integer :: k + character(LEN=path_len + 2*name_len) :: file_loc + character(LEN=20) :: FMT + + $:GPU_UPDATE(host='[Rmax_glb, Rmin_glb]') + + if (precision == 1) then + FMT = "(I10,I14,5F16.8)" + else + FMT = "(I10,I14,5F24.16)" + end if + + do k = 1, n_el_particles_loc + write (LAG_STATS_ID, FMT) proc_rank, lag_part_id(k, 1), particle_pos(k, 1, 1), particle_pos(k, 2, 1), particle_pos(k, & + & 3, 1), Rmax_stats_part(k), Rmin_stats_part(k) + end do + + end subroutine s_write_lag_particle_stats + + subroutine s_close_lag_particle_stats + + close (LAG_STATS_ID) + + end subroutine s_close_lag_particle_stats + + !> The purpose of this subroutine is to remove one specific particle if dt is too small. + !! @param part_id Particle id + impure subroutine s_copy_lag_particle(dest, src) + + integer, intent(in) :: src, dest + + particle_R0(dest) = particle_R0(src) + Rmax_stats_part(dest) = Rmax_stats_part(src) + Rmin_stats_part(dest) = Rmin_stats_part(src) + particle_mass(dest) = particle_mass(src) + lag_part_id(dest, 1) = lag_part_id(src, 1) + particle_rad(dest,1:2) = particle_rad(src,1:2) + particle_vel(dest,1:3,1:2) = particle_vel(src,1:3,1:2) + particle_s(dest,1:3,1:2) = particle_s(src,1:3,1:2) + particle_pos(dest,1:3,1:2) = particle_pos(src,1:3,1:2) + particle_posPrev(dest,1:3,1:2) = particle_posPrev(src,1:3,1:2) + particle_draddt(dest,1:lag_num_ts) = particle_draddt(src,1:lag_num_ts) + f_p(dest,1:3) = f_p(src,1:3) + particle_dposdt(dest,1:3,1:lag_num_ts) = particle_dposdt(src,1:3,1:lag_num_ts) + particle_dveldt(dest,1:3,1:lag_num_ts) = particle_dveldt(src,1:3,1:lag_num_ts) + + end subroutine s_copy_lag_particle + + !> The purpose of this subroutine is to deallocate variables + impure subroutine s_finalize_particle_lagrangian_solver() + + integer :: i + + if (lag_params%write_void_evol) call s_close_void_evol + if (lag_params%write_bubbles) call s_close_lag_particle_evol() + if (lag_params%write_bubbles_stats) call 
s_close_lag_particle_stats() + + do i = 1, q_particles_idx + @:DEALLOCATE(q_particles(i)%sf) + end do + @:DEALLOCATE(q_particles) + + do i = 1, nField_vars + @:DEALLOCATE(field_vars(i)%sf) + end do + @:DEALLOCATE(field_vars) + + do i = 1, nWeights_interp + @:DEALLOCATE(weights_x_interp(i)%sf) + end do + @:DEALLOCATE(weights_x_interp) + + do i = 1, nWeights_interp + @:DEALLOCATE(weights_y_interp(i)%sf) + end do + @:DEALLOCATE(weights_y_interp) + + do i = 1, nWeights_interp + @:DEALLOCATE(weights_z_interp(i)%sf) + end do + @:DEALLOCATE(weights_z_interp) + + do i = 1, nWeights_grad + @:DEALLOCATE(weights_x_grad(i)%sf) + end do + @:DEALLOCATE(weights_x_grad) + + do i = 1, nWeights_grad + @:DEALLOCATE(weights_y_grad(i)%sf) + end do + @:DEALLOCATE(weights_y_grad) + + do i = 1, nWeights_grad + @:DEALLOCATE(weights_z_grad(i)%sf) + end do + @:DEALLOCATE(weights_z_grad) + + ! Deallocating space + @:DEALLOCATE(lag_part_id) + @:DEALLOCATE(gid_to_local) + @:DEALLOCATE(particle_R0) + @:DEALLOCATE(Rmax_stats_part) + @:DEALLOCATE(Rmin_stats_part) + @:DEALLOCATE(particle_mass) + @:DEALLOCATE(p_AM) + @:DEALLOCATE(p_owner_rank) + @:DEALLOCATE(particle_rad) + @:DEALLOCATE(particle_pos) + @:DEALLOCATE(particle_posPrev) + @:DEALLOCATE(particle_vel) + @:DEALLOCATE(particle_s) + @:DEALLOCATE(particle_draddt) + @:DEALLOCATE(particle_dposdt) + @:DEALLOCATE(particle_dveldt) + @:DEALLOCATE(f_p) + @:DEALLOCATE(gSum) + + @:DEALLOCATE(force_recv_ids) + @:DEALLOCATE(force_recv_vals) + + @:DEALLOCATE(keep_bubble) + @:DEALLOCATE(wrap_bubble_loc, wrap_bubble_dir) + + @:DEALLOCATE(linked_list) + @:DEALLOCATE(particle_head) + + ! 
Deallocate cell list arrays + @:DEALLOCATE(cell_list_start) + @:DEALLOCATE(cell_list_count) + @:DEALLOCATE(cell_list_idx) + + end subroutine s_finalize_particle_lagrangian_solver + +end module m_particles_EL diff --git a/src/simulation/m_particles_EL_kernels.fpp b/src/simulation/m_particles_EL_kernels.fpp new file mode 100644 index 0000000000..08d2c22c8c --- /dev/null +++ b/src/simulation/m_particles_EL_kernels.fpp @@ -0,0 +1,904 @@ +!> +!! @file m_particles_EL_kernels.fpp +!! @brief Contains module m_particles_EL_kernels + +#:include 'macros.fpp' + +!> @brief This module contains kernel functions used to map the effect of the lagrangian particles in the Eulerian framework. +module m_particles_EL_kernels + + use m_mpi_proxy !< Message passing interface (MPI) module proxy + use ieee_arithmetic !< For checking NaN + + implicit none + + ! Cell list for particle-to-cell mapping (rebuilt each RK stage before smearing) + integer, allocatable, dimension(:,:,:) :: cell_list_start ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:,:,:) :: cell_list_count ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:) :: cell_list_idx ! (1:nParticles_glb) sorted particle indices + $:GPU_DECLARE(create='[cell_list_start, cell_list_count, cell_list_idx]') + +contains + + ! 
!> Compute a particle's total discrete Gaussian-kernel contribution over its
!! smearing stencil so that the smeared quantities can be properly normalized
!! (the per-cell weights later divide by this sum in s_gaussian_atomic).
!! @param rad Particle radius
!! @param pos Particle position (x, y, z)
!! @param cell Computational cell containing the particle
!! @param func_s Sum of kernel value times cell volume over the stencil
subroutine s_compute_gaussian_contribution(rad, pos, cell, func_s)

    $:GPU_ROUTINE(function_name='s_compute_gaussian_contribution',parallelism='[seq]', cray_inline=True)

    real(wp), intent(in) :: rad
    real(wp), intent(in), dimension(3) :: pos
    integer, intent(in), dimension(3) :: cell
    real(wp), intent(out) :: func_s
    real(wp) :: volpart, stddsv, Vol_loc, func
    real(wp), dimension(3) :: nodecoord, center
    integer :: ip, jp, kp, di, dj, dk, di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end, mapCells_loc
    integer, dimension(3) :: cellijk

    ! Stencil half-width in cells (1 => 3x3 neighborhood in 2D, 3x3x3 in 3D).
    ! Must match the stencil used when the contribution is applied.
    mapCells_loc = 1

    ! Spherical particle volume.
    volpart = (4._wp/3._wp)*pi*rad**3

    call s_compute_stddsv(cell, volpart, stddsv)

    ip = cell(1)
    jp = cell(2)
    kp = cell(3)

    di_beg = ip - mapCells_loc
    di_end = ip + mapCells_loc
    dj_beg = jp - mapCells_loc
    dj_end = jp + mapCells_loc
    ! In 2D the k-extent collapses to the particle's own plane.
    dk_beg = kp
    dk_end = kp

    if (num_dims == 3) then
        dk_beg = kp - mapCells_loc
        dk_end = kp + mapCells_loc
    end if

    ! Riemann sum of the kernel over the stencil cells.
    func_s = 0._wp
    do dk = dk_beg, dk_end
        do dj = dj_beg, dj_end
            do di = di_beg, di_end
                nodecoord(1) = x_cc(di)
                nodecoord(2) = y_cc(dj)
                nodecoord(3) = 0._wp
                if (p > 0) nodecoord(3) = z_cc(dk)

                cellijk(1) = di
                cellijk(2) = dj
                cellijk(3) = dk

                center(1:2) = pos(1:2)
                center(3) = 0._wp
                if (p > 0) center(3) = pos(3)

                Vol_loc = dx(cellijk(1))*dy(cellijk(2))
                if (num_dims == 3) Vol_loc = dx(cellijk(1))*dy(cellijk(2))*dz(cellijk(3))

                call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func)

                func_s = func_s + func*Vol_loc
            end do
        end do
    end do

end subroutine s_compute_gaussian_contribution

!> The purpose of this subroutine is to compute the gaussian smearing of particle volume fraction and source terms with atomic
!!
cell updates (each Eulerian cell write is a GPU atomic, so particles may be smeared concurrently).
!! @param rad        Particle radius
!! @param vel        Particle velocity
!! @param pos        Particle position
!! @param force_p    Force on the particle (applied to the fluid with reversed sign)
!! @param gauSum     Kernel normalization from s_compute_gaussian_contribution
!! @param cell       Computational cell containing the particle
!! @param updatedvar Eulerian fields receiving the smeared contributions
!! @param onlyBeta   If true, smear only the particle volume fraction
subroutine s_gaussian_atomic(rad, vel, pos, force_p, gauSum, cell, updatedvar, onlyBeta)

    $:GPU_ROUTINE(function_name='s_gaussian_atomic',parallelism='[seq]', cray_inline=True)

    real(wp), intent(in) :: rad, gauSum
    real(wp), intent(in), dimension(3) :: pos, vel, force_p
    integer, intent(in), dimension(3) :: cell
    type(scalar_field), dimension(:), intent(inout) :: updatedvar
    real(wp) :: volpart, stddsv, Vol_loc, func, weight
    real(wp) :: fp_x, fp_y, fp_z, vp_x, vp_y, vp_z
    real(wp) :: addFun_alphap, addFun_alphap_vp_x, addFun_alphap_vp_y, addFun_alphap_vp_z
    real(wp) :: addFun2_x, addFun2_y, addFun2_z, addFun_E
    real(wp), dimension(3) :: nodecoord, center
    integer :: ip, jp, kp, di, dj, dk, di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end, mapCells_loc
    integer, dimension(3) :: cellijk
    logical, intent(in) :: onlyBeta

    ! Stencil half-width in cells; must match s_compute_gaussian_contribution
    ! so the precomputed gauSum normalizes the same set of cells.
    mapCells_loc = 1

    volpart = (4._wp/3._wp)*pi*rad**3

    call s_compute_stddsv(cell, volpart, stddsv)

    ip = cell(1)
    jp = cell(2)
    kp = cell(3)

    di_beg = ip - mapCells_loc
    di_end = ip + mapCells_loc
    dj_beg = jp - mapCells_loc
    dj_end = jp + mapCells_loc
    ! In 2D the k-extent collapses to the particle's own plane.
    dk_beg = kp
    dk_end = kp

    if (num_dims == 3) then
        dk_beg = kp - mapCells_loc
        dk_end = kp + mapCells_loc
    end if

    ! Fluid source force is the reaction to the particle force.
    fp_x = -force_p(1)
    fp_y = -force_p(2)
    fp_z = -force_p(3)

    vp_x = vel(1)
    vp_y = vel(2)
    vp_z = vel(3)

    center(1:2) = pos(1:2)
    center(3) = 0._wp
    if (p > 0) center(3) = pos(3)

    do dk = dk_beg, dk_end
        do dj = dj_beg, dj_end
            do di = di_beg, di_end
                nodecoord(1) = x_cc(di)
                nodecoord(2) = y_cc(dj)
                nodecoord(3) = 0._wp
                if (p > 0) nodecoord(3) = z_cc(dk)

                cellijk(1) = di
                cellijk(2) = dj
                cellijk(3) = dk

                Vol_loc = dx(cellijk(1))*dy(cellijk(2))
                if (num_dims == 3) Vol_loc = Vol_loc*dz(cellijk(3))

                call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func)

                ! NOTE(review): gauSum is loop-invariant, so this guard could
                ! be hoisted above the loops. As written, behavior is the same:
                ! it fires on the first iteration before any atomic update.
                if (gauSum <= 0._wp) return
                weight = func/gauSum

                ! Smeared particle volume fraction contribution.
                addFun_alphap = weight*volpart
                $:GPU_ATOMIC(atomic='update')
                updatedvar(1)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(1)%sf(cellijk(1), cellijk(2), &
                    & cellijk(3)) + real(addFun_alphap, kind=stp)

                if (lag_params%solver_approach == 2 .and. .not. onlyBeta) then
                    ! Update particle momentum field(x)
                    addFun_alphap_vp_x = weight*volpart*vp_x
                    $:GPU_ATOMIC(atomic='update')
                    updatedvar(2)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(2)%sf(cellijk(1), cellijk(2), &
                        & cellijk(3)) + real(addFun_alphap_vp_x, kind=stp)

                    ! Update particle momentum field(y)
                    addFun_alphap_vp_y = weight*volpart*vp_y
                    $:GPU_ATOMIC(atomic='update')
                    updatedvar(3)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(3)%sf(cellijk(1), cellijk(2), &
                        & cellijk(3)) + real(addFun_alphap_vp_y, kind=stp)

                    if (num_dims == 3) then
                        ! Update particle momentum field(z)
                        addFun_alphap_vp_z = weight*volpart*vp_z
                        $:GPU_ATOMIC(atomic='update')
                        updatedvar(4)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(4)%sf(cellijk(1), cellijk(2), &
                            & cellijk(3)) + real(addFun_alphap_vp_z, kind=stp)
                    end if

                    ! Update x-momentum source term
                    addFun2_x = weight*fp_x
                    $:GPU_ATOMIC(atomic='update')
                    updatedvar(5)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(5)%sf(cellijk(1), cellijk(2), &
                        & cellijk(3)) + real(addFun2_x, kind=stp)

                    ! Update y-momentum source term
                    addFun2_y = weight*fp_y
                    $:GPU_ATOMIC(atomic='update')
                    updatedvar(6)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(6)%sf(cellijk(1), cellijk(2), &
                        & cellijk(3)) + real(addFun2_y, kind=stp)

                    if (num_dims == 3) then
                        ! Update z-momentum source term
                        addFun2_z = weight*fp_z
                        $:GPU_ATOMIC(atomic='update')
                        updatedvar(7)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(7)%sf(cellijk(1), cellijk(2), &
                            & cellijk(3)) + real(addFun2_z, kind=stp)
                    end if

                    ! Update energy source term.
                    ! NOTE(review): addFun_E is always zero here, so this
                    ! atomic add is a no-op placeholder for a future energy
                    ! coupling term — confirm it is intentional.
                    addFun_E = 0._wp
                    $:GPU_ATOMIC(atomic='update')
                    updatedvar(8)%sf(cellijk(1), cellijk(2), cellijk(3)) = updatedvar(8)%sf(cellijk(1), cellijk(2), &
                        & cellijk(3)) + real(addFun_E, kind=stp)
                end if
            end do
        end do
    end do

end subroutine s_gaussian_atomic

!> The purpose of this subroutine is to apply the gaussian kernel function for each particle (Maeda and Colonius, 2018).
!! @param center       Particle (kernel) center
!! @param cellaux      Cell whose node receives the kernel value
!! @param nodecoord    Coordinates of the receiving node
!! @param stddsv       Standard deviation of the Gaussian kernel
!! @param strength_idx Kernel strength exponent offset (0 everywhere in this module)
!! @param func         Resulting kernel value at the node
subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func)

    $:GPU_ROUTINE(function_name='s_applygaussian',parallelism='[seq]', cray_inline=True)

    real(wp), dimension(3), intent(in) :: center
    integer, dimension(3), intent(in) :: cellaux
    real(wp), dimension(3), intent(in) :: nodecoord
    real(wp), intent(in) :: stddsv
    real(wp), intent(in) :: strength_idx
    real(wp), intent(out) :: func
    integer :: i
    real(wp) :: distance
    real(wp) :: theta, dtheta, L2, dzp, Lz2, zc
    real(wp) :: Nr, Nr_count

    distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + (center(3) - nodecoord(3))**2._wp)

    if (num_dims == 3) then
        !> 3D gaussian function
        func = exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp
    else
        if (cyl_coord) then
            !> 2D cylindrical function:
            ! We smear particles in the azimuthal direction for given r
            theta = 0._wp
            ! Number of azimuthal samples sized so each subtends ~one cell width.
            Nr = ceiling(2._wp*pi*nodecoord(2)/(y_cb(cellaux(2)) - y_cb(cellaux(2) - 1)))
            dtheta = 2._wp*pi/Nr
            ! Law of cosines: squared in-plane distance at azimuthal angle theta.
            L2 = center(2)**2._wp + nodecoord(2)**2._wp - 2._wp*center(2)*nodecoord(2)*cos(theta)
            distance = sqrt((center(1) - nodecoord(1))**2._wp + L2)
            ! Factor 2._wp is for symmetry (upper half of the 2D field (+r) is considered)
            func = dtheta/2._wp/pi*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp
            Nr_count = 0._wp
            do while (Nr_count < Nr - 1._wp)
                Nr_count = Nr_count + 1._wp
                theta = Nr_count*dtheta
                ! trigonometric relation
                L2 = center(2)**2._wp + nodecoord(2)**2._wp - 2._wp*center(2)*nodecoord(2)*cos(theta)
                distance = sqrt((center(1) - nodecoord(1))**2._wp + L2)
                ! nodecoord(2)*dtheta is the azimuthal width of the cell
                ! NOTE(review): the theta=0 term above uses exponent 3 while
                ! these terms use 3*(strength_idx + 1). The two agree for
                ! strength_idx = 0, the only value passed in this module —
                ! confirm intent before calling with nonzero strength_idx.
                func = func + dtheta/2._wp/pi*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv) &
                    & **(3._wp*(strength_idx + 1._wp))
            end do
        else
            !> 2D cartesian function: Equation (48) from Maeda and Colonius 2018
            ! We smear particles considering a virtual depth (lag_params%charwidth) with lag_params%charNz cells
            dzp = (lag_params%charwidth/(lag_params%charNz + 1._wp))

            func = 0._wp
            do i = 0, lag_params%charNz
                zc = (-lag_params%charwidth/2._wp + dzp*(0.5_wp + i)) ! Center of virtual cell i in z-direction
                Lz2 = (center(3) - zc)**2._wp
                distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + Lz2)
                func = func + dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp
            end do
        end if
    end if

end subroutine s_applygaussian

!> Calculates the standard deviation of the particle being smeared in the Eulerian framework.
!! @param cell Cell where the particle is located
!! @param volpart Volume of the particle
!!
@param stddsv Standard deviation of the smearing kernel
subroutine s_compute_stddsv(cell, volpart, stddsv)

    $:GPU_ROUTINE(function_name='s_compute_stddsv',parallelism='[seq]', cray_inline=True)

    integer, dimension(3), intent(in) :: cell
    real(wp), intent(in) :: volpart
    real(wp), intent(out) :: stddsv
    real(wp) :: chardist, charvol
    real(wp) :: rad

    !> Compute characteristic distance (geometric mean of the local cell widths)
    chardist = sqrt(dx(cell(1))*dy(cell(2)))
    if (p > 0) chardist = (dx(cell(1))*dy(cell(2))*dz(cell(3)))**(1._wp/3._wp)

    !> Compute characteristic volume
    if (p > 0) then
        charvol = dx(cell(1))*dy(cell(2))*dz(cell(3))
    else
        if (cyl_coord) then
            ! Axisymmetric: revolve the 2D cell about the axis.
            charvol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi
        else
            ! 2D Cartesian: use the virtual out-of-plane depth.
            charvol = dx(cell(1))*dy(cell(2))*lag_params%charwidth
        end if
    end if
    ! NOTE(review): charvol is computed but never used below — presumably kept
    ! for a volume-based kernel width; confirm or remove upstream.

    ! Equivalent spherical radius of the particle volume.
    rad = (3._wp*volpart/(4._wp*pi))**(1._wp/3._wp)
    ! Kernel width scales with the larger of cell size and particle radius.
    stddsv = lag_params%epsilonb*max(chardist, rad)

end subroutine s_compute_stddsv

!> The purpose of this subroutine is to check if the current cell is outside the computational domain or not (including ghost
!! cells).
!! @param cellaux Tested cell to smear the particle effect in.
!! @param celloutside If true, then cellaux is outside the computational domain.
subroutine s_check_celloutside(cellaux, celloutside)

    $:GPU_ROUTINE(function_name='s_check_celloutside',parallelism='[seq]', cray_inline=True)

    integer, dimension(3), intent(inout) :: cellaux
    logical, intent(out) :: celloutside

    celloutside = .false.

    ! Ghost cells (width buff_size) on either side of [0, m]x[0, n](x[0, p])
    ! still count as inside.
    if (num_dims == 2) then
        if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size)) then
            celloutside = .true.
        end if

        if ((cellaux(1) > m + buff_size) .or. (cellaux(2) > n + buff_size)) then
            celloutside = .true.
        end if
    else
        if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size) .or. (cellaux(3) < -buff_size)) then
            celloutside = .true.
        end if

        if ((cellaux(1) > m + buff_size) .or. (cellaux(2) > n + buff_size) .or. (cellaux(3) > p + buff_size)) then
            celloutside = .true.
        end if
    end if

end subroutine s_check_celloutside

!> This subroutine transforms the computational coordinates of the particle from real type into integer.
!! @param s_cell Computational coordinates of the particle, real type
!! @param get_cell Computational coordinates of the particle, integer type
subroutine s_get_cell(s_cell, get_cell)

    $:GPU_ROUTINE(function_name='s_get_cell',parallelism='[seq]', cray_inline=True)

    real(wp), dimension(3), intent(in) :: s_cell
    integer, dimension(3), intent(out) :: get_cell
    integer :: i

    ! int() truncates toward zero; decrement for negative coordinates so the
    ! result is a true floor in every direction.
    get_cell(:) = int(s_cell(:))
    do i = 1, num_dims
        if (s_cell(i) < 0._wp) get_cell(i) = get_cell(i) - 1
    end do

end subroutine s_get_cell

!> The purpose of this procedure is to calculate the characteristic cell volume
!! @param cellx Computational x-coordinate of the cell
!! @param celly Computational y-coordinate of the cell
!! @param cellz Computational z-coordinate of the cell
!! @param Charvol Characteristic volume
subroutine s_get_char_vol(cellx, celly, cellz, Charvol)

    $:GPU_ROUTINE(function_name='s_get_char_vol',parallelism='[seq]', cray_inline=True)

    integer, intent(in) :: cellx, celly, cellz
    real(wp), intent(out) :: Charvol

    if (p > 0) then
        Charvol = dx(cellx)*dy(celly)*dz(cellz)
    else
        if (cyl_coord) then
            ! Axisymmetric: revolve the 2D cell about the axis.
            Charvol = dx(cellx)*dy(celly)*y_cc(celly)*2._wp*pi
        else
            ! 2D Cartesian: use the virtual out-of-plane depth.
            Charvol = dx(cellx)*dy(celly)*lag_params%charwidth
        end if
    end if

end subroutine s_get_char_vol

!! This function calculates the force on a particle based on the pressure gradient, velocity, and drag model.
!! @param pos Position of the particle
!! @param rad Radius of the particle
!! @param vel_p Velocity of the particle
!! @param mass_p Particle mass
!! @param Re Fluid dynamic viscosity (used as gas_mu below — TODO confirm naming)
!! @param rho Density of the fluid
!! @param vol_frac Particle Volume Fraction
!! @param cell Computational coordinates of the particle
!! @param q_prim_vf Eulerian field with primitive variables
!!
@return a Acceleration of the particle in direction i + subroutine s_get_particle_force(pos, rad, vel_p, mass_p, Re, gamm, vol_frac, drhodt, cell, q_prim_vf, fieldvars, wx, wy, wz, & + & force, rmass_add) + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: rad, mass_p, Re, gamm, vol_frac, drhodt + real(wp), dimension(3), intent(in) :: pos + integer, dimension(3), intent(in) :: cell + real(wp), dimension(3), intent(in) :: vel_p + type(scalar_field), dimension(:), intent(in) :: fieldvars + type(scalar_field), dimension(:), intent(in) :: wx, wy, wz + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + real(wp), dimension(3), intent(out) :: force + real(wp), intent(out) :: rmass_add + real(wp) :: a, vol, rho_fluid, pressure_fluid + real(wp), dimension(3) :: v_rel, dp + real(wp), dimension(fd_order) :: xi, eta, L + real(wp) :: particle_diam, gas_mu, vmag, cson + real(wp) :: slip_velocity_x, slip_velocity_y, slip_velocity_z, beta + real(wp), dimension(3) :: fluid_vel + integer :: dir + + ! Added pass params + real(wp) :: mach, Cam, flux_f, flux_b, div_u, SDrho, vpgradrho + real(wp), dimension(3) :: rhoDuDt, grad_rho, fam + integer, dimension(3) :: p1 + + force = 0._wp + dp = 0._wp + grad_rho = 0._wp + fam = 0._wp + fluid_vel = 0._wp + v_rel = 0._wp + rhoDuDt = 0._wp + SDrho = 0._wp + ! div_u = 0._wp + + !!Interpolation - either even ordered barycentric or 0th order + if (lag_params%interpolation_order > 1) then + rho_fluid = f_interp_barycentric(pos, cell, q_prim_vf, 1, wx, wy, wz) + pressure_fluid = f_interp_barycentric(pos, cell, q_prim_vf, E_idx, wx, wy, wz) + do dir = 1, num_dims + if (lag_params%pressure_force .or. lag_params%added_mass_model > 0) then + dp(dir) = f_interp_barycentric(pos, cell, fieldvars, dir, wx, wy, wz) + end if + if (lag_params%added_mass_model > 0) then + grad_rho(dir) = f_interp_barycentric(pos, cell, fieldvars, 3 + dir, wx, wy, wz) + ! 
div_u = div_u + f_interp_barycentric(pos, cell, fieldvars, 6 + dir, wx, wy, wz) + end if + fluid_vel(dir) = f_interp_barycentric(pos, cell, q_prim_vf, momxb + dir - 1, wx, wy, wz) + end do + else + rho_fluid = q_prim_vf(1)%sf(cell(1), cell(2), cell(3)) + pressure_fluid = q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3)) + do dir = 1, num_dims + if (lag_params%pressure_force .or. lag_params%added_mass_model > 0) then + dp(dir) = fieldvars(dir)%sf(cell(1), cell(2), cell(3)) + end if + if (lag_params%added_mass_model > 0) then + grad_rho(dir) = fieldvars(3 + dir)%sf(cell(1), cell(2), cell(3)) + ! div_u = div_u + fieldvars(6 + dir)%sf(cell(1), cell(2), cell(3)) + end if + fluid_vel(dir) = q_prim_vf(momxb + dir - 1)%sf(cell(1), cell(2), cell(3)) + end do + end if + + v_rel = vel_p - fluid_vel + + if (lag_params%qs_drag_model > 0 .or. lag_params%added_mass_model > 0) then + ! Quasi-steady Drag Force Parameters + slip_velocity_x = fluid_vel(1) - vel_p(1) + slip_velocity_y = fluid_vel(2) - vel_p(2) + if (num_dims == 3) then + slip_velocity_z = fluid_vel(3) - vel_p(3) + vmag = sqrt(slip_velocity_x*slip_velocity_x + slip_velocity_y*slip_velocity_y + slip_velocity_z*slip_velocity_z) + else if (num_dims == 2) then + vmag = sqrt(slip_velocity_x*slip_velocity_x + slip_velocity_y*slip_velocity_y) + end if + particle_diam = rad*2._wp + if (rho_fluid > 0._wp) then + cson = sqrt((gamm*pressure_fluid)/rho_fluid) + else + cson = 1._wp + end if + gas_mu = Re + end if + + if (lag_params%added_mass_model > 0) then + rhoDuDt = -dp + vpgradrho = dot_product(vel_p, grad_rho) + SDrho = drhodt + fluid_vel(1)*grad_rho(1) + fluid_vel(2)*grad_rho(2) + fluid_vel(3)*grad_rho(3) + mach = vmag/cson + end if + + ! 
Step 1: Force component quasi-steady + if (lag_params%qs_drag_model == 1) then + beta = QS_Parmar(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac) + force = force - beta*v_rel + else if (lag_params%qs_drag_model == 2) then + beta = QS_Osnes(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac) + force = force - beta*v_rel + else if (lag_params%qs_drag_model == 3) then + beta = QS_ModifiedParmar(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac) + force = force - beta*v_rel + else if (lag_params%qs_drag_model == 4) then + beta = QS_Gidaspow(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac) + force = force - beta*v_rel + else + ! No Quasi-Steady drag + end if + + ! Step 1.1: Stokes drag + if (lag_params%stokes_drag == 1) then ! Free slip Stokes drag + force = force - 4._wp*pi*gas_mu*rad*v_rel + else if (lag_params%stokes_drag == 2) then ! No slip Stokes drag + force = force - 6._wp*pi*gas_mu*rad*v_rel + else + ! No stokes drag + end if + + ! Step 2: Pressure Gradient Force + if (lag_params%pressure_force) then + vol = (4._wp/3._wp)*pi*(rad**3._wp) + force = force - vol*dp + end if + + ! Step 3: Gravitational Force + if (lag_params%gravity_force) then + force = force + (mass_p)*accel_bf + end if + + ! Step 4: Added Mass Force + if (lag_params%added_mass_model == 1) then + vol = (4._wp/3._wp)*pi*(rad**3._wp) + if (mach > 0.6_wp) then + Cam = 1._wp + 1.8_wp*(0.6_wp**2) + 7.6_wp*(0.6_wp**4) + else + Cam = 1._wp + 1.8_wp*mach**2 + 7.6_wp*mach**4 + end if + + Cam = 0.5_wp*Cam*(1._wp + 0.68_wp*vol_frac**2) + rmass_add = rho_fluid*vol*Cam ! (1._wp-vol_frac)*rho_fluid*vol*Cam + + fam = Cam*vol*(vel_p*SDrho + rhoDuDt + fluid_vel*(vpgradrho)) + + do dir = 1, num_dims + if (.not. ieee_is_finite(fam(dir))) then + fam(dir) = 0._wp + rmass_add = 0._wp + end if + end do + force = force + fam + else + rmass_add = 0._wp + end if + + do dir = 1, num_dims + if (.not. 
ieee_is_finite(force(dir))) then + force(dir) = 0._wp + end if + end do + + end subroutine s_get_particle_force + + !> Interpolate an Eulerian field to a particle position using barycentric Lagrange interpolation with precomputed weights. Falls + !! back to nearest-cell value if the interpolant is non-finite. + !! @param pos Particle position + !! @param cell Grid cell containing the particle + !! @param field_vf Eulerian field to interpolate + !! @param field_index Component index in field_vf + function f_interp_barycentric(pos, cell, field_vf, field_index, wx, wy, wz) result(val) + + $:GPU_ROUTINE(parallelism='[seq]') + + real(wp), dimension(3), intent(in) :: pos + integer, dimension(3), intent(in) :: cell + type(scalar_field), dimension(:), intent(in) :: field_vf + type(scalar_field), dimension(:), intent(in) :: wx, wy, wz + integer, intent(in) :: field_index + integer :: i, j, k, ix, jy, kz, npts, npts_z, N, a, b + integer :: ix_count, jy_count, kz_count + real(wp) :: weight, numerator, denominator, xBar, eps + real(wp) :: val, local_min, local_max, prod_x, prod_y, prod_z + + i = cell(1) + j = cell(2) + k = cell(3) + + N = lag_params%interpolation_order + npts = N/2 + npts_z = npts + if (num_dims == 2) npts_z = 0 + eps = 1.e-12_wp + numerator = 0._wp + denominator = 0._wp + + ! if (abs(pos(1) - x_cc(i)) <= eps .and. & abs(pos(2) - y_cc(j)) <= eps .and. & abs(pos(3) - z_cc(k)) <= eps) then val = + ! 
field_vf(field_index)%sf(i, j, k) return end if + + ix_count = 0 + do ix = i - npts, i + npts + ix_count = ix_count + 1 + jy_count = 0 + do jy = j - npts, j + npts + jy_count = jy_count + 1 + kz_count = 0 + do kz = k - npts_z, k + npts_z + kz_count = kz_count + 1 + if (num_dims == 3) then + xBar = (pos(1) - x_cc(ix))*(pos(2) - y_cc(jy))*(pos(3) - z_cc(kz)) + weight = wx(ix_count)%sf(i, 1, 1)*wy(jy_count)%sf(j, 1, 1)*wz(kz_count)%sf(k, 1, 1) + else + xBar = (pos(1) - x_cc(ix))*(pos(2) - y_cc(jy)) + weight = wx(ix_count)%sf(i, 1, 1)*wy(jy_count)%sf(j, 1, 1) + end if + weight = weight/xBar + numerator = numerator + weight*field_vf(field_index)%sf(ix, jy, kz) + denominator = denominator + weight + end do + end do + end do + + val = numerator/denominator + + if (.not. ieee_is_finite(val)) then + val = field_vf(field_index)%sf(i, j, k) + else if (abs(val) <= eps) then + val = 0._wp + end if + + end function f_interp_barycentric + + ! Quasi-steady force (Re_p and Ma_p corrections): Improved Drag Correlation for Spheres and Application to Shock-Tube + ! Experiments - Parmar et al. (2010) - AIAA Journal + ! + ! Quasi-steady force (phi corrections): The Added Mass, Basset, and Viscous Drag Coefficients in Nondilute Bubbly Liquids + ! Undergoing Small-Amplitude Oscillatory Motion - Sangani et al. (1991) - Phys. Fluids A + function QS_Parmar(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta) + + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction + real(wp) :: rcd1, rmacr, rcd_mcr, rcd_std, rmach_rat, rcd_M1 + real(wp) :: rcd_M2, C1, C2, C3, f1M, f2M, f3M, lrep, factor, cd, phi_corr + real(wp) :: beta + real(wp) :: rmachp, mp, phi, rep, re + + rmachp = vmag/cson + mp = max(rmachp, 0.01_wp) + phi = max(volume_fraction, 0.0001_wp) + rep = vmag*dp*rho/mu_fluid + re = max(rep, 0.1_wp) + + if (re < 1.e-14_wp) then + rcd1 = 1.0_wp + else + rmacr = 0.6_wp ! 
Critical rmachp no + rcd_mcr = (1._wp + 0.15_wp*re**(0.684_wp)) + (re/24.0_wp)*(0.513_wp/(1._wp + 483._wp/re**(0.669_wp))) + if (mp <= rmacr) then + rcd_std = (1._wp + 0.15_wp*re**(0.687_wp)) + (re/24.0_wp)*(0.42_wp/(1._wp + 42500._wp/re**(1.16_wp))) + rmach_rat = mp/rmacr + rcd1 = rcd_std + (rcd_mcr - rcd_std)*rmach_rat + else if (mp <= 1.0_wp) then + rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp) + C1 = 6.48_wp + C2 = 9.28_wp + C3 = 12.21_wp + f1M = -1.884_wp + 8.422_wp*mp - 13.70_wp*mp**2 + 8.162_wp*mp**3 + f2M = -2.228_wp + 10.35_wp*mp - 16.96_wp*mp**2 + 9.840_wp*mp**3 + f3M = 4.362_wp - 16.91_wp*mp + 19.84_wp*mp**2 - 6.296_wp*mp**3 + lrep = log(re) + factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) & + & + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2)) + rcd1 = rcd_mcr + (rcd_M1 - rcd_mcr)*factor + else if (mp < 1.75_wp) then + rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp) + rcd_M2 = (1.0_wp + 0.107_wp*re**0.867_wp) + (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp) + C1 = 6.48_wp + C2 = 8.93_wp + C3 = 12.21_wp + f1M = -2.963_wp + 4.392_wp*mp - 1.169_wp*mp**2 - 0.027_wp*mp**3 - 0.233_wp*exp((1.0_wp - mp)/0.011_wp) + f2M = -6.617_wp + 12.11_wp*mp - 6.501_wp*mp**2 + 1.182_wp*mp**3 - 0.174_wp*exp((1.0_wp - mp)/0.010_wp) + f3M = -5.866_wp + 11.57_wp*mp - 6.665_wp*mp**2 + 1.312_wp*mp**3 - 0.350_wp*exp((1.0_wp - mp)/0.012_wp) + lrep = log(re) + factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) & + & + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2)) + rcd1 = rcd_M1 + (rcd_M2 - rcd_M1)*factor + else + rcd1 = (1.0_wp + 0.107_wp*re**0.867_wp) + (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp) + end if ! mp + end if ! re + + ! 
Sangani's volume fraction correction for dilute random arrays Capping volume fraction at 0.5 + phi_corr = (1.0_wp + 5.94_wp*min(phi, 0.5_wp)) + + cd = (24.0_wp/re)*rcd1*phi_corr + + beta = rcd1*3.0_wp*pi*mu_fluid*dp + + beta = beta*phi_corr + + end function QS_Parmar + + ! Quasi-steady force (Re_p and Ma_p corrections): Improved Drag Correlation for Spheres and Application to Shock-Tube + ! Experiments - Parmar et al. (2010) - AIAA Journal + ! + ! Quasi-steady force (phi corrections): Sangani et al. (1991) volume fraction correction overshoots the drag coefficient. + ! + ! We adopt instead Osnes et al. (2023) volume fraction correction based on Tenneti et al. with one extra term. + ! + ! At Mach=0, the drag coefficient from this subroutine matches very well with the one calculated using the Osnes subroutine, for + ! various Reynolds numbers and volume fractions. + function QS_ModifiedParmar(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta) + + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction + real(wp) :: rcd1, rmacr, rcd_mcr, rcd_std, rmach_rat, rcd_M1 + real(wp) :: rcd_M2, C1, C2, C3, f1M, f2M, f3M, lrep, factor, cd, phi_corr + real(wp) :: b1, b2, b3 + real(wp) :: beta + real(wp) :: rmachp, mp, phi, rep, re + + rmachp = vmag/cson + mp = max(rmachp, 0.01_wp) + phi = max(volume_fraction, 0.0001_wp) + rep = vmag*dp*rho/mu_fluid + re = max(rep, 0.1_wp) + + if (re < 1e-14_wp) then + rcd1 = 1.0_wp + else + rmacr = 0.6_wp ! Critical rmachp no. 
+ rcd_mcr = (1._wp + 0.15_wp*re**(0.684_wp)) + (re/24.0_wp)*(0.513_wp/(1._wp + 483._wp/re**(0.669_wp))) + if (mp <= rmacr) then + rcd_std = (1._wp + 0.15_wp*re**(0.687_wp)) + (re/24.0_wp)*(0.42_wp/(1._wp + 42500._wp/re**(1.16_wp))) + rmach_rat = mp/rmacr + rcd1 = rcd_std + (rcd_mcr - rcd_std)*rmach_rat + else if (mp <= 1.0_wp) then + rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp) + C1 = 6.48_wp + C2 = 9.28_wp + C3 = 12.21_wp + f1M = -1.884_wp + 8.422_wp*mp - 13.70_wp*mp**2 + 8.162_wp*mp**3 + f2M = -2.228_wp + 10.35_wp*mp - 16.96_wp*mp**2 + 9.840_wp*mp**3 + f3M = 4.362_wp - 16.91_wp*mp + 19.84_wp*mp**2 - 6.296_wp*mp**3 + lrep = log(re) + factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) & + & + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2)) + rcd1 = rcd_mcr + (rcd_M1 - rcd_mcr)*factor + else if (mp < 1.75_wp) then + rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp) + rcd_M2 = (1.0_wp + 0.107_wp*re**0.867_wp) + (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp) + C1 = 6.48_wp + C2 = 8.93_wp + C3 = 12.21_wp + f1M = -2.963_wp + 4.392_wp*mp - 1.169_wp*mp**2 - 0.027_wp*mp**3 - 0.233_wp*exp((1.0_wp - mp)/0.011_wp) + f2M = -6.617_wp + 12.11_wp*mp - 6.501_wp*mp**2 + 1.182_wp*mp**3 - 0.174_wp*exp((1.0_wp - mp)/0.010_wp) + f3M = -5.866_wp + 11.57_wp*mp - 6.665_wp*mp**2 + 1.312_wp*mp**3 - 0.350_wp*exp((1.0_wp - mp)/0.012_wp) + lrep = log(re) + factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) & + & + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2)) + rcd1 = rcd_M1 + (rcd_M2 - rcd_M1)*factor + else + rcd1 = (1.0_wp + 0.107_wp*re**0.867_wp) + (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp) + end if ! mp + end if ! re + + ! 
Osnes's volume fraction correction + b1 = 5.81_wp*phi/((1.0_wp - phi)**2) + 0.48_wp*(phi**(1._wp/3._wp))/((1.0_wp - phi)**3) + + b2 = ((1.0_wp - phi)**2)*(phi**3)*re*(0.95_wp + 0.61_wp*(phi**3)/((1.0_wp - phi)**2)) + + b3 = min(sqrt(20.0_wp*mp), & + & 1.0_wp)*(5.65_wp*phi - 22.0_wp*(phi**2) + 23.4_wp*(phi**3))*(1._wp + tanh((mp - (0.65_wp - 0.24_wp*phi)) & + & /0.35_wp)) + + cd = (24.0_wp/re)*rcd1 + + cd = cd/(1.0_wp - phi) + b3 + (24.0_wp/re)*(1.0_wp - phi)*(b1 + b2) + + beta = 3.0_wp*pi*mu_fluid*dp*(re/24.0_wp)*cd + + end function QS_ModifiedParmar + + ! QS Force calculated as a function of Re, Ma and phi + ! + ! Use Osnes etal (2023) correlations A.N. Osnes, M. Vartdal, M. Khalloufi, J. Capecelatro, and S. Balachandar. Comprehensive + ! quasi-steady force correlations for compressible flow through random particle suspensions. International Journal of Multiphase + ! Flow, Vol. 165, 104485, (2023). doi: https://doi.org/10.1016/j.imultiphaseflow.2023.104485. + ! + ! E. Loth, J.T. Daspit, M. Jeong, T. Nagata, and T. Nonomura. Supersonic and hypersonic drag coefficients for a sphere. AIAA + ! Journal, Vol. 59(8), pp. 3261-3274, (2021). doi: https://doi.org/10.2514/1.J060153. + ! + ! NOTE: Re<45 Rarefied formula of Loth et al has been redefined by Balachandar to avoid singularity as Ma -> 0. + function QS_Osnes(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta) + + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction + real(wp) :: rmachp, mp, phi, rep, re + real(wp) :: Knp, fKn, CD1, s, JM, CD2, cd_loth, CM, GM, HM, b1, b2, b3, cd, sgby2, JMt + real(wp) :: beta + + rmachp = vmag/cson + mp = max(rmachp, 0.01_wp) + phi = max(volume_fraction, 0.0001_wp) + rep = vmag*dp*rho/mu_fluid + re = max(rep, 0.1_wp) + + ! Loth's correlation + if (re <= 45.0_wp) then + ! 
Rarefied-dominated regime + Knp = sqrt(0.5_wp*pi*gamma)*mp/re + if (Knp > 0.01_wp) then + fKn = 1.0_wp/(1.0_wp + Knp*(2.514_wp + 0.8_wp*exp(-0.55_wp/Knp))) + else + fKn = 1.0_wp/(1.0_wp + Knp*(2.514_wp + 0.8_wp*exp(-0.55_wp/0.01_wp))) + end if + CD1 = (24.0_wp/re)*(1.0_wp + 0.15_wp*re**(0.687_wp))*fKn + s = mp*sqrt(0.5_wp*gamma) + sgby2 = sqrt(0.5_wp*gamma) + if (mp <= 1._wp) then + ! JMt = 2.26_wp*(mp**4) - 0.1_wp*(mp**3) + 0.14_wp*mp + JMt = 2.26_wp*(mp**4) + 0.14_wp*mp + else + JMt = 1.6_wp*(mp**4) + 0.25_wp*(mp**3) + 0.11_wp*(mp**2) + 0.44_wp*mp + end if + ! + ! Reformulated version of Loth et al. to avoid singularity at mp = 0 + ! + CD2 = (1.0_wp + 2.0_wp*(s**2))*exp(-s**2)*mp/((sgby2**3)*sqrt(pi)) + (4.0_wp*(s**4) + 4.0_wp*(s**2) - 1.0_wp)*erf(s) & + & /(2.0_wp*(sgby2**4)) + (2.0_wp*(mp**3)/(3.0_wp*sgby2))*sqrt(pi) + + CD2 = CD2/(1.0_wp + (((CD2/JMt) - 1.0_wp)*sqrt(re/45.0_wp))) + cd_loth = CD1/(1.0_wp + (mp**4)) + CD2/(1.0_wp + (mp**4)) + else + ! Compression-dominated regime TLJ: coefficients tweaked to get continuous values on the two branches at the critical + ! points + if (mp < 1.5_wp) then + CM = 1.65_wp + 0.65_wp*tanh(4._wp*mp - 3.4_wp) + else + ! CM = 2.18_wp - 0.13_wp*tanh(0.9_wp*mp - 2.7_wp) + CM = 2.18_wp - 0.12913149918318745_wp*tanh(0.9_wp*mp - 2.7_wp) + end if + if (mp < 0.8_wp) then + GM = 166.0_wp*(mp**3) + 3.29_wp*(mp**2) - 10.9_wp*mp + 20._wp + else + ! GM = 5.0_wp + 40._wp*(mp**(-3)) + GM = 5.0_wp + 47.809331200000017_wp*(mp**(-3)) + end if + if (mp < 1._wp) then + HM = 0.0239_wp*(mp**3) + 0.212_wp*(mp**2) - 0.074_wp*mp + 1._wp + else + ! 
HM = 0.93_wp + 1.0_wp / (3.5_wp + (mp**5)) + HM = 0.93967777777777772_wp + 1.0_wp/(3.5_wp + (mp**5)) + end if + + cd_loth = (24.0_wp/re)*(1._wp + 0.15_wp*(re**(0.687_wp)))*HM + 0.42_wp*CM/(1._wp + 42500._wp/re**(1.16_wp*CM) & + & + GM/sqrt(re)) + end if + + b1 = 5.81_wp*phi/((1.0_wp - phi)**2) + 0.48_wp*(phi**(1._wp/3._wp))/((1.0_wp - phi)**3) + + b2 = ((1.0_wp - phi)**2)*(phi**3)*re*(0.95_wp + 0.61_wp*(phi**3)/((1.0_wp - phi)**2)) + + b3 = min(sqrt(20.0_wp*mp), & + & 1.0_wp)*(5.65_wp*phi - 22.0_wp*(phi**2) + 23.4_wp*(phi**3))*(1._wp + tanh((mp - (0.65_wp - 0.24_wp*phi)) & + & /0.35_wp)) + + cd = cd_loth/(1.0_wp - phi) + b3 + (24.0_wp/re)*(1.0_wp - phi)*(b1 + b2) + + beta = 3.0_wp*pi*mu_fluid*dp*(re/24.0_wp)*cd + + end function QS_Osnes + + ! Subroutine for Quasi-Steady Drag Model of Gidaspow + ! + ! D. Gidaspow, Multiphase Flow and Fluidization (Academic Press, 1994) + ! + ! Note: Model is provided per cell volume. We convert that to per particle using the particle volume fraction and volume + function QS_Gidaspow(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta) + + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction + real(wp) :: cd, phifRep, phif + real(wp) :: phi, rep, re + real(wp) :: beta + + rep = vmag*dp*rho/mu_fluid + phi = max(volume_fraction, 0.0001_wp) + phif = max(1._wp - volume_fraction, 0.0001_wp) + re = max(rep, 0.1_wp) + + phifRep = phif*re + + if (phifRep < 1000.0_wp) then + cd = 24.0_wp/phifRep*(1.0_wp + 0.15_wp*(phifRep)**0.687_wp) + else + cd = 0.44_wp + end if + + if (phif < 0.8_wp) then + beta = 150.0_wp*((phi**2)*mu_fluid)/(phif*dp**2) + 1.75_wp*(rho*phi*vmag/dp) + else + beta = 0.75_wp*cd*phi*rho*vmag/(dp*phif**1.65_wp) + end if + + beta = beta*(pi*dp**3)/(6.0_wp*phi) + + end function QS_Gidaspow + +end module m_particles_EL_kernels diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index 1f347a0289..c343d33cf6 100644 --- 
a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -19,6 +19,7 @@ module m_rhs use m_cbc use m_bubbles_EE use m_bubbles_EL + use m_particles_EL use m_qbmm use m_hypoelastic use m_hyperelastic @@ -38,9 +39,13 @@ module m_rhs private; public :: s_initialize_rhs_module, s_compute_rhs, s_finalize_rhs_module + !! This variable contains the WENO-reconstructed values of the cell-average conservative variables, which are located in + !! q_cons_vf, at cell-interior Gaussian quadrature points (QP). type(vector_field) :: q_cons_qp !< WENO-reconstructed cell-average conservative variables at quadrature points $:GPU_DECLARE(create='[q_cons_qp]') + !! The primitive variables at cell-interior Gaussian quadrature points. These are calculated from the conservative variables and + !! gradient magnitude (GM) of the volume fractions, q_cons_qp and gm_alpha_qp, respectively. type(vector_field) :: q_prim_qp !< Primitive variables at cell-interior quadrature points $:GPU_DECLARE(create='[q_prim_qp]') @@ -67,7 +72,10 @@ module m_rhs type(scalar_field), allocatable, dimension(:) :: tau_Re_vf $:GPU_DECLARE(create='[tau_Re_vf]') + !> The gradient magnitude of the volume fractions at cell-interior Gaussian quadrature points. gm_alpha_qp is calculated from + !! individual first-order spatial derivatives located in dq_prim_ds_qp. type(vector_field) :: gm_alpha_qp !< Volume fraction gradient magnitudes at cell-interior quadrature points + $:GPU_DECLARE(create='[gm_alpha_qp]') !> @name The left and right WENO-reconstructed cell-boundary values of the cell- average gradient magnitude of volume fractions, @@ -127,7 +135,8 @@ module m_rhs contains - !> Initialize the RHS module + !> The computation of parameters, the allocation of memory, the association of pointers and/or the execution of any other + !! procedures that are necessary to setup the module. 
impure subroutine s_initialize_rhs_module integer :: i, j, k, l, id !< Generic loop iterators @@ -197,6 +206,7 @@ contains $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(psi_idx)%sf]') end if + ! Allocation/Association of flux_n, flux_src_n, and flux_gsrc_n if (.not. igr) then @:ALLOCATE(flux_n(1:num_dims)) @:ALLOCATE(flux_src_n(1:num_dims)) @@ -209,42 +219,42 @@ contains if (i == 1) then do l = 1, sys_size - @:ALLOCATE(flux_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) - @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do if (viscous .or. surface_tension) then do l = mom_idx%beg, E_idx - @:ALLOCATE(flux_src_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_src_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do end if - @:ALLOCATE(flux_src_n(i)%vf(adv_idx%beg)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_src_n(i)%vf(adv_idx%beg)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) if (riemann_solver == 1 .or. 
riemann_solver == 4) then do l = adv_idx%beg + 1, adv_idx%end - @:ALLOCATE(flux_src_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_src_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do end if if (chemistry) then do l = chemxb, chemxe - @:ALLOCATE(flux_src_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_src_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do if (chem_params%diffusion .and. .not. viscous) then - @:ALLOCATE(flux_src_n(i)%vf(E_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_src_n(i)%vf(E_idx)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end if end if else do l = 1, sys_size - @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(flux_gsrc_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do end if @@ -268,9 +278,11 @@ contains end do end if end do + ! END: Allocation/Association of flux_n, flux_src_n, and flux_gsrc_n end if if ((.not. igr) .or. dummy) then + ! Allocation of dq_prim_ds_qp @:ALLOCATE(dq_prim_dx_qp(1:1)) @:ALLOCATE(dq_prim_dy_qp(1:1)) @:ALLOCATE(dq_prim_dz_qp(1:1)) @@ -278,6 +290,7 @@ contains @:ALLOCATE(qL_prim(1:num_dims)) @:ALLOCATE(qR_prim(1:num_dims)) + ! 
Allocation/Association of dqK_prim_ds_n @:ALLOCATE(dqL_prim_dx_n(1:num_dims)) @:ALLOCATE(dqL_prim_dy_n(1:num_dims)) @:ALLOCATE(dqL_prim_dz_n(1:num_dims)) @@ -364,7 +377,7 @@ contains @:ALLOCATE(dq_prim_dz_qp(1)%vf(1:sys_size)) do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dq_prim_dx_qp(1)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dq_prim_dx_qp(1)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do @@ -372,7 +385,7 @@ contains if (n > 0) then do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dq_prim_dy_qp(1)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dq_prim_dy_qp(1)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do @@ -380,7 +393,7 @@ contains if (p > 0) then do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dq_prim_dz_qp(1)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dq_prim_dz_qp(1)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do @:ACC_SETUP_VFs(dq_prim_dz_qp(1)) @@ -398,26 +411,26 @@ contains do i = 1, num_dims do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dqL_prim_dx_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dqL_prim_dx_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) - @:ALLOCATE(dqR_prim_dx_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dqR_prim_dx_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do if (n > 0) then do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dqL_prim_dy_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dqL_prim_dy_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, 
idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) - @:ALLOCATE(dqR_prim_dy_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dqR_prim_dy_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do end if if (p > 0) then do l = mom_idx%beg, mom_idx%end - @:ALLOCATE(dqL_prim_dz_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dqL_prim_dz_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) - @:ALLOCATE(dqR_prim_dz_n(i)%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(dqR_prim_dz_n(i)%vf(l)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) end do end if @@ -455,7 +468,7 @@ contains @:ALLOCATE(dqR_rsz_vf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end, mom_idx%beg:mom_idx%end)) end if - end if ! end allocation for weno_Re_flux + end if else @:ALLOCATE(dq_prim_dx_qp(1)%vf(1:sys_size)) @:ALLOCATE(dq_prim_dy_qp(1)%vf(1:sys_size)) @@ -473,7 +486,7 @@ contains end if end if end do - end if ! end allocation of viscous variables + end if $:GPU_PARALLEL_LOOP(private='[i, j, k, l, id]', collapse=4) do id = 1, num_dims @@ -488,7 +501,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - end if ! end allocation for .not. 
igr + end if if (qbmm) then @:ALLOCATE(mom_sp(1:nmomsp), mom_3d(0:2, 0:2, nb)) @@ -496,7 +509,7 @@ contains do i = 0, 2 do j = 0, 2 do k = 1, nb - @:ALLOCATE(mom_3d(i, j, k)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(mom_3d(i, j, k)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) @:ACC_SETUP_SFs(mom_3d(i, j, k)) end do @@ -504,7 +517,7 @@ contains end do do i = 1, nmomsp - @:ALLOCATE(mom_sp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & + @:ALLOCATE(mom_sp(i)%sf( idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, & & idwbuff(3)%beg:idwbuff(3)%end)) @:ACC_SETUP_SFs(mom_sp(i)) end do @@ -513,6 +526,9 @@ contains if (mpp_lim .and. bubbles_euler) then @:ALLOCATE(alf_sum%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end)) end if + ! END: Allocation/Association of qK_cons_n and qK_prim_n + + ! Allocation of gm_alphaK_n if (.not. igr) then @:ALLOCATE(gm_alphaL_n(1:num_dims)) @:ALLOCATE(gm_alphaR_n(1:num_dims)) @@ -531,10 +547,9 @@ contains end subroutine s_initialize_rhs_module - !> Compute the right-hand side of the semi-discrete governing equations for a single time stage + !> @brief Computes the right-hand side of the semi-discrete governing equations for a single time stage. impure subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_in, rhs_pb, mv_in, rhs_mv, t_step, & - - & time_avg, stage) + & time_avg, stage) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf type(scalar_field), intent(inout) :: q_T_sf @@ -555,8 +570,6 @@ contains integer :: id integer(kind=8) :: i, j, k, l, q !< Generic loop iterators - ! RHS: halo exchange -> reconstruct -> Riemann solve -> flux difference -> source terms - call nvtxStartRange("COMPUTE-RHS") call cpu_time(t_start) @@ -641,7 +654,7 @@ contains call nvtxEndRange end if - ! 
Loop over coordinate directions for dimensional splitting + ! Dimensional Splitting Loop do id = 1, num_dims if (igr .or. dummy) then if (id == 1) then @@ -740,7 +753,7 @@ contains end if end if - call nvtxEndRange ! WENO + call nvtxEndRange ! Configuring Coordinate Direction Indexes if (id == 1) then @@ -751,6 +764,9 @@ contains irx%beg = 0; iry%beg = 0; irz%beg = -1 end if irx%end = m; iry%end = n; irz%end = p + ! $:GPU_UPDATE(host='[qL_rsx_vf,qR_rsx_vf]') print *, "L", qL_rsx_vf(100:300, 0, 0, 1) print *, "R", + ! qR_rsx_vf(100:300, 0, 0, 1) + ! Computing Riemann Solver Flux and Source Flux call nvtxStartRange("RHS-RIEMANN-SOLVER") call s_riemann_solver(qR_rsx_vf, qR_rsy_vf, qR_rsz_vf, dqR_prim_dx_n(id)%vf, dqR_prim_dy_n(id)%vf, & @@ -759,6 +775,9 @@ contains & q_prim_qp%vf, flux_n(id)%vf, flux_src_n(id)%vf, flux_gsrc_n(id)%vf, id, irx, iry, irz) call nvtxEndRange + !$:GPU_UPDATE(host='[flux_n(1)%vf(1)%sf]') + ! print *, "FLUX", flux_n(1)%vf(1)%sf(100:300, 0, 0) + ! Additional physics and source terms RHS addition for advection source call nvtxStartRange("RHS-ADVECTION-SRC") call s_compute_advection_source_term(id, rhs_vf, q_cons_qp, q_prim_qp, flux_src_n(id)) @@ -776,7 +795,7 @@ contains call nvtxEndRange end if - ! Viscous stress contribution to RHS + ! RHS additions for viscosity if (viscous .or. surface_tension .or. chem_params%diffusion) then call nvtxStartRange("RHS-ADD-PHYSICS") call s_compute_additional_physics_rhs(id, q_prim_qp%vf, rhs_vf, flux_src_n(id)%vf, dq_prim_dx_qp(1)%vf, & @@ -784,7 +803,7 @@ contains call nvtxEndRange end if - ! Bubble dynamics source terms + ! RHS additions for sub-grid bubbles_euler if (bubbles_euler) then call nvtxStartRange("RHS-BUBBLES-COMPUTE") call s_compute_bubbles_EE_rhs(id, q_prim_qp%vf, divu) @@ -848,14 +867,32 @@ contains end if if (bubbles_lagrange) then - ! 
RHS additions for sub-grid bubbles_lagrange - call nvtxStartRange("RHS-EL-BUBBLES-SRC") - call s_compute_bubbles_EL_source(q_cons_qp%vf(1:sys_size), q_prim_qp%vf(1:sys_size), rhs_vf) - call nvtxEndRange ! Compute bubble dynamics if (.not. adap_dt) then call nvtxStartRange("RHS-EL-BUBBLES-DYN") - call s_compute_bubble_EL_dynamics(q_prim_qp%vf(1:sys_size), stage) + call s_compute_bubble_EL_dynamics(q_prim_qp%vf(1:sys_size), bc_type, stage) + call nvtxEndRange + end if + + ! RHS additions for sub-grid bubbles_lagrange + if (lag_params%solver_approach == 2) then + call nvtxStartRange("RHS-EL-BUBBLES-SRC") + call s_compute_bubbles_EL_source(q_cons_qp%vf(1:sys_size), q_prim_qp%vf(1:sys_size), rhs_vf) + call nvtxEndRange + end if + end if + + if (particles_lagrange) then + ! Compute particle dynamics, forces, dvdt + call nvtxStartRange("RHS-EL-PARTICLES-DYN") + call s_compute_particle_EL_dynamics(q_prim_qp%vf(1:sys_size), bc_type, stage, qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, & + & qR_rsx_vf, qR_rsy_vf, qR_rsz_vf, rhs_vf) + call nvtxEndRange + + ! RHS additions for sub-grid particles_lagrange + if (lag_params%solver_approach == 2) then + call nvtxStartRange("RHS-EL-PARTICLES-SRC") + call s_compute_particles_EL_source(q_cons_qp%vf(1:sys_size), q_prim_qp%vf(1:sys_size), rhs_vf, stage) call nvtxEndRange end if end if @@ -870,7 +907,7 @@ contains ! END: Additional physics and source terms - if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then + if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange .or. particles_lagrange) then if (.not. igr .or. dummy) then $:GPU_PARALLEL_LOOP(private='[i, j, k, l]', collapse=4) do i = 1, sys_size @@ -898,7 +935,7 @@ contains end subroutine s_compute_rhs - !> Accumulate advection source contributions from a given coordinate direction into the RHS + !> @brief Accumulates advection source contributions from a given coordinate direction into the RHS. 
subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf) integer, intent(in) :: idir @@ -1010,7 +1047,7 @@ contains $:END_GPU_PARALLEL_LOOP() if (model_eqns == 3) then - $:GPU_PARALLEL_LOOP(collapse=4, private='[i_fluid_loop, k, l, q, inv_ds, advected_qty_val, pressure_val, & + $:GPU_PARALLEL_LOOP(collapse=4,private='[i_fluid_loop, k, l, q, inv_ds, advected_qty_val, pressure_val, & & flux_face1, flux_face2]') do l = 0, p do k = 0, n @@ -1059,7 +1096,7 @@ contains call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz) end if - if (grid_geometry == 3) then ! Cylindrical Coordinates + if (grid_geometry == 3) then $:GPU_PARALLEL_LOOP(collapse=4,private='[j, k, l, q, inv_ds, velocity_val, flux_face1, flux_face2]') do j = 1, sys_size do k = 0, p @@ -1088,7 +1125,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - else ! Cartesian Coordinates + else $:GPU_PARALLEL_LOOP(collapse=4,private='[j, k, l, q, inv_ds, flux_face1, flux_face2]') do j = 1, sys_size do k = 0, p @@ -1106,7 +1143,7 @@ contains end if if (model_eqns == 3) then - $:GPU_PARALLEL_LOOP(collapse=4, private='[i_fluid_loop, k, l, q, inv_ds, advected_qty_val, pressure_val, & + $:GPU_PARALLEL_LOOP(collapse=4,private='[i_fluid_loop, k, l, q, inv_ds, advected_qty_val, pressure_val, & & flux_face1, flux_face2]') do k = 0, p do q = 0, n @@ -1131,9 +1168,8 @@ contains contains - !> Add the advection source flux-difference terms for a single coordinate direction to the RHS + !> @brief Adds the advection source flux-difference terms for a single coordinate direction to the RHS. subroutine s_add_directional_advection_source_terms(current_idir, rhs_vf_arg, q_cons_vf_arg, q_prim_vf_arg, & - & flux_src_n_vf_arg, Kterm_arg) integer, intent(in) :: current_idir type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf_arg @@ -1168,7 +1204,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - else ! 
Other Riemann solvers + else if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then $:GPU_PARALLEL_LOOP(collapse=3, private='[k_idx, l_idx, q_idx, local_inv_ds, local_q_cons_val, & @@ -1176,7 +1212,7 @@ contains do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m local_inv_ds = 1._wp/dx(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) - local_k_term_val = Kterm_arg(k_idx, l_idx, q_idx) ! Access is safe due to outer alt_soundspeed check + local_k_term_val = Kterm_arg(k_idx, l_idx, q_idx) ! Access is safe local_term_coeff = local_q_cons_val - local_k_term_val local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(k_idx - 1, l_idx, q_idx) @@ -1199,7 +1235,7 @@ contains end do; end do; end do $:END_GPU_PARALLEL_LOOP() end if - else ! NOT alt_soundspeed + else $:GPU_PARALLEL_LOOP(collapse=4,private='[j_adv, k_idx, l_idx, q_idx, local_inv_ds, local_term_coeff, & & local_flux1, local_flux2]') do j_adv = advxb, advxe @@ -1215,8 +1251,8 @@ contains $:END_GPU_PARALLEL_LOOP() end if end if - case (2) - ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx) + case (2) & + & ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx) use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4) if (use_standard_riemann) then $:GPU_PARALLEL_LOOP(collapse=4,private='[j_adv, k_idx, l_idx, q_idx, local_inv_ds, local_term_coeff, & @@ -1236,7 +1272,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - else ! Other Riemann solvers + else if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then $:GPU_PARALLEL_LOOP(collapse=3, private='[k_idx, l_idx, q_idx, local_inv_ds, local_q_cons_val, & @@ -1275,7 +1311,7 @@ contains end do; end do; end do $:END_GPU_PARALLEL_LOOP() end if - else ! 
NOT alt_soundspeed + else $:GPU_PARALLEL_LOOP(collapse=4,private='[j_adv, k_idx, l_idx, q_idx, local_inv_ds, local_term_coeff, & & local_flux1, local_flux2]') do j_adv = advxb, advxe @@ -1291,8 +1327,8 @@ contains $:END_GPU_PARALLEL_LOOP() end if end if - case (3) - ! z-direction: loops l_idx (x), q_idx (y), k_idx (z); sf(l_idx, q_idx, k_idx); dz(k_idx); Kterm(l_idx,q_idx,k_idx) + case (3) & + & ! z-direction: loops l_idx (x), q_idx (y), k_idx (z); sf(l_idx, q_idx, k_idx); dz(k_idx); Kterm(l_idx,q_idx,k_idx) if (grid_geometry == 3) then use_standard_riemann = (riemann_solver == 1) else @@ -1317,7 +1353,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - else ! Other Riemann solvers + else if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then $:GPU_PARALLEL_LOOP(collapse=3, private='[k_idx, l_idx, q_idx, local_inv_ds, local_q_cons_val, & @@ -1348,7 +1384,7 @@ contains end do; end do; end do $:END_GPU_PARALLEL_LOOP() end if - else ! NOT alt_soundspeed + else $:GPU_PARALLEL_LOOP(collapse=4, private='[j_adv, k_idx, l_idx, q_idx, local_inv_ds, local_term_coeff, & & local_flux1, local_flux2]') do j_adv = advxb, advxe @@ -1370,7 +1406,7 @@ contains end subroutine s_compute_advection_source_term - !> Add viscous, surface-tension, and species-diffusion source flux contributions to the RHS for a given direction + !> @brief Adds viscous, surface-tension, and species-diffusion source flux contributions to the RHS for a given direction. subroutine s_compute_additional_physics_rhs(idir, q_prim_vf, rhs_vf, flux_src_n_in, dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf) integer, intent(in) :: idir @@ -1620,14 +1656,23 @@ contains end subroutine s_compute_additional_physics_rhs - !> Reconstruct left and right cell-boundary values from cell-averaged variables + !> The purpose of this subroutine is to WENO-reconstruct the left and the right cell-boundary values, including values at the + !! Gaussian quadrature points, from the cell-averaged variables. + !! 
@param v_vf Cell-average variables + !! @param vL_x Left reconstructed cell-boundary values in x + !! @param vL_y Left reconstructed cell-boundary values in y + !! @param vL_z Left reconstructed cell-boundary values in z + !! @param vR_x Right reconstructed cell-boundary values in x + !! @param vR_y Right reconstructed cell-boundary values in y + !! @param vR_z Right reconstructed cell-boundary values in z + !! @param norm_dir Splitting coordinate direction subroutine s_reconstruct_cell_boundary_values(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, norm_dir) type(scalar_field), dimension(iv%beg:iv%end), intent(in) :: v_vf real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:), intent(inout) :: vL_x, vL_y, vL_z real(wp), dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:), intent(inout) :: vR_x, vR_y, vR_z integer, intent(in) :: norm_dir - integer :: recon_dir !< Coordinate direction of the reconstruction + integer :: recon_dir !< Coordinate direction of the WENO reconstruction integer :: i, j, k, l #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')] @@ -1666,7 +1711,7 @@ contains end subroutine s_reconstruct_cell_boundary_values - !> Perform first-order (piecewise constant) reconstruction of left and right cell-boundary values + !> @brief Performs first-order (piecewise constant) reconstruction of left and right cell-boundary values. subroutine s_reconstruct_cell_boundary_values_first_order(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, norm_dir) type(scalar_field), dimension(iv%beg:iv%end), intent(in) :: v_vf diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp index e20ec982fe..b9a5b10cf8 100644 --- a/src/simulation/m_sim_helpers.fpp +++ b/src/simulation/m_sim_helpers.fpp @@ -19,6 +19,9 @@ module m_sim_helpers contains !> Computes the modified dtheta for Fourier filtering in azimuthal direction + !! @param k y coordinate index + !! @param l z coordinate index + !! 
@return fltr_dtheta Modified dtheta value for cylindrical coordinates function f_compute_filtered_dtheta(k, l) result(fltr_dtheta) $:GPU_ROUTINE(parallelism='[seq]') @@ -42,6 +45,12 @@ contains end function f_compute_filtered_dtheta !> Computes inviscid CFL terms for multi-dimensional cases (2D/3D only) + !! @param vel directional velocities + !! @param c mixture speed of sound + !! @param j x coordinate index + !! @param k y coordinate index + !! @param l z coordinate index + !! @return cfl_terms computed CFL terms for 2D/3D cases function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms) $:GPU_ROUTINE(parallelism='[seq]') @@ -70,6 +79,20 @@ contains end function f_compute_multidim_cfl_terms !> Computes enthalpy + !! @param q_prim_vf cell centered primitive variables + !! @param pres mixture pressure + !! @param rho mixture density + !! @param gamma mixture gamma + !! @param pi_inf mixture pi_inf + !! @param Re mixture Reynolds number + !! @param H mixture enthalpy + !! @param alpha component alphas + !! @param vel directional velocities + !! @param vel_sum squared sum of velocity components + !! @param qv Fluid reference energy + !! @param j x index + !! @param k y index + !! @param l z index subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l) $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', cray_inline=True) @@ -138,80 +161,88 @@ contains end subroutine s_compute_enthalpy !> Computes stability criterion for a specified dt - subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf) + !! @param vel directional velocities + !! @param c mixture speed of sound + !! @param rho Density + !! @param Re_l mixture Reynolds number + !! @param j x index + !! @param k y index + !! @param l z index + !! @param icfl cell-centered inviscid cfl number + !! @param vcfl (optional) cell-centered viscous CFL number + !! 
@param Rc (optional) cell centered Rc + subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl, vcfl, Rc) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), intent(in), dimension(num_vels) :: vel - real(wp), intent(in) :: c, rho - real(wp), dimension(0:m,0:n,0:p), intent(inout) :: icfl_sf - real(wp), dimension(0:m,0:n,0:p), intent(inout), optional :: vcfl_sf, Rc_sf - real(wp), dimension(2), intent(in) :: Re_l - integer, intent(in) :: j, k, l - real(wp) :: fltr_dtheta + real(wp), intent(in), dimension(num_vels) :: vel + real(wp), intent(in) :: c, rho + real(wp), intent(inout) :: icfl + real(wp), intent(inout), optional :: vcfl, Rc + real(wp), dimension(2), intent(in) :: Re_l + integer, intent(in) :: j, k, l + real(wp) :: fltr_dtheta ! Inviscid CFL calculation if (p > 0 .or. n > 0) then - ! 2D/3D - icfl_sf(j, k, l) = dt/f_compute_multidim_cfl_terms(vel, c, j, k, l) + icfl = dt/f_compute_multidim_cfl_terms(vel, c, j, k, l) else - ! 1D - icfl_sf(j, k, l) = (dt/dx(j))*(abs(vel(1)) + c) + icfl = (dt/dx(j))*(abs(vel(1)) + c) end if ! Viscous calculations if (viscous) then if (p > 0) then #:if not MFC_CASE_OPTIMIZATION or num_dims > 2 - ! 
3D if (grid_geometry == 3) then fltr_dtheta = f_compute_filtered_dtheta(k, l) - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho)/min(dx(j), dy(k), fltr_dtheta)**2._wp - Rc_sf(j, k, l) = min(dx(j)*(abs(vel(1)) + c), dy(k)*(abs(vel(2)) + c), & - & fltr_dtheta*(abs(vel(3)) + c))/maxval(1._wp/Re_l) + vcfl = maxval(dt/Re_l/rho)/min(dx(j), dy(k), fltr_dtheta)**2._wp + Rc = min(dx(j)*(abs(vel(1)) + c), dy(k)*(abs(vel(2)) + c), fltr_dtheta*(abs(vel(3)) + c))/maxval(1._wp/Re_l) else - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho)/min(dx(j), dy(k), dz(l))**2._wp - Rc_sf(j, k, l) = min(dx(j)*(abs(vel(1)) + c), dy(k)*(abs(vel(2)) + c), & - & dz(l)*(abs(vel(3)) + c))/maxval(1._wp/Re_l) + vcfl = maxval(dt/Re_l/rho)/min(dx(j), dy(k), dz(l))**2._wp + Rc = min(dx(j)*(abs(vel(1)) + c), dy(k)*(abs(vel(2)) + c), dz(l)*(abs(vel(3)) + c))/maxval(1._wp/Re_l) end if #:endif else if (n > 0) then - ! 2D - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho)/min(dx(j), dy(k))**2._wp - Rc_sf(j, k, l) = min(dx(j)*(abs(vel(1)) + c), dy(k)*(abs(vel(2)) + c))/maxval(1._wp/Re_l) + vcfl = maxval(dt/Re_l/rho)/min(dx(j), dy(k))**2._wp + Rc = min(dx(j)*(abs(vel(1)) + c), dy(k)*(abs(vel(2)) + c))/maxval(1._wp/Re_l) else - ! 1D - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho)/dx(j)**2._wp - Rc_sf(j, k, l) = dx(j)*(abs(vel(1)) + c)/maxval(1._wp/Re_l) + vcfl = maxval(dt/Re_l/rho)/dx(j)**2._wp + Rc = dx(j)*(abs(vel(1)) + c)/maxval(1._wp/Re_l) end if end if end subroutine s_compute_stability_from_dt !> Computes dt for a specified CFL number + !! @param vel directional velocities + !! @param c Speed of sound + !! @param max_dt cell centered maximum dt + !! @param rho cell centered density + !! @param Re_l cell centered Reynolds number + !! @param j x coordinate + !! @param k y coordinate + !! 
@param l z coordinate subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), dimension(num_vels), intent(in) :: vel - real(wp), intent(in) :: c, rho - real(wp), dimension(0:m,0:n,0:p), intent(inout) :: max_dt - real(wp), dimension(2), intent(in) :: Re_l - integer, intent(in) :: j, k, l - real(wp) :: icfl_dt, vcfl_dt - real(wp) :: fltr_dtheta + real(wp), dimension(num_vels), intent(in) :: vel + real(wp), intent(in) :: c, rho + real(wp), intent(inout) :: max_dt + real(wp), dimension(2), intent(in) :: Re_l + integer, intent(in) :: j, k, l + real(wp) :: icfl_dt, vcfl_dt + real(wp) :: fltr_dtheta ! Inviscid CFL calculation if (p > 0 .or. n > 0) then - ! 2D/3D cases icfl_dt = cfl_target*f_compute_multidim_cfl_terms(vel, c, j, k, l) else - ! 1D case icfl_dt = cfl_target*(dx(j)/(abs(vel(1)) + c)) end if ! Viscous calculations if (viscous) then if (p > 0) then - ! 3D if (grid_geometry == 3) then fltr_dtheta = f_compute_filtered_dtheta(k, l) vcfl_dt = cfl_target*(min(dx(j), dy(k), fltr_dtheta)**2._wp)/maxval(1/(rho*Re_l)) @@ -219,18 +250,16 @@ contains vcfl_dt = cfl_target*(min(dx(j), dy(k), dz(l))**2._wp)/maxval(1/(rho*Re_l)) end if else if (n > 0) then - ! 2D vcfl_dt = cfl_target*(min(dx(j), dy(k))**2._wp)/maxval((1/Re_l)/rho) else - ! 
1D vcfl_dt = cfl_target*(dx(j)**2._wp)/maxval(1/(rho*Re_l)) end if end if - if (any(Re_size > 0)) then - max_dt(j, k, l) = min(icfl_dt, vcfl_dt) + if (viscous) then + max_dt = min(icfl_dt, vcfl_dt) else - max_dt(j, k, l) = icfl_dt + max_dt = icfl_dt end if end subroutine s_compute_dt_from_cfl diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index c3f38ef500..c19eb94846 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -31,6 +31,7 @@ module m_start_up use m_viscous use m_bubbles_EE use m_bubbles_EL + use m_particles_EL !< Lagrange particle dynamics routines use ieee_arithmetic use m_helper_basic use m_helper @@ -59,6 +60,7 @@ module m_start_up contains !> Read data files. Dispatch subroutine that replaces procedure pointer. + !! @param q_cons_vf Conservative variables impure subroutine s_read_data_files(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -71,24 +73,26 @@ contains end subroutine s_read_data_files - !> Verify the input file exists and read it + !> The purpose of this procedure is to first verify that an input file has been made available by the user. Provided that this + !! is so, the input file is then read in. impure subroutine s_read_input_file + ! Relative path to the input file provided by the user character(LEN=name_len), parameter :: file_path = './simulation.inp' logical :: file_exist !< Logical used to check the existence of the input file integer :: iostatus - ! Integer to check iostat of file read + !! Integer to check iostat of file read character(len=1000) :: line + ! 
Namelist of the global parameters which may be specified by user + namelist /user_inputs/ case_dir, run_time_info, m, n, p, dt, & t_step_start, t_step_stop, t_step_save, t_step_print, & model_eqns, mpp_lim, time_stepper, weno_eps, & rdma_mpi, teno_CT, mp_weno, weno_avg, & riemann_solver, low_Mach, wave_speeds, avg_state, & bc_x, bc_y, bc_z, & - x_a, y_a, z_a, x_b, y_b, z_b, & - x_domain, y_domain, z_domain, & hypoelasticity, & ib, num_ibs, patch_ib, & ib_state_wrt, & @@ -110,8 +114,11 @@ contains & g_y, g_z, n_start, t_save, t_stop, cfl_adap_dt, cfl_const_dt, cfl_target, surface_tension, bubbles_lagrange, & & lag_params, hyperelasticity, R0ref, num_bc_patches, Bx0, cont_damage, tau_star, cont_damage_s, alpha_bar, & & hyper_cleaning, hyper_cleaning_speed, hyper_cleaning_tau, alf_factor, num_igr_iters, num_igr_warm_start_iters, & - & int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt + & int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt, & + & particles_lagrange, particle_pp + ! Checking that an input file has been provided by the user. If it has, then the input file is read in, otherwise, + ! simulation exits. inquire (FILE=trim(file_path), EXIST=file_exist) if (file_exist) then @@ -131,6 +138,7 @@ contains bodyForces = .true. end if + ! Store m,n,p into global m,n,p m_glb = m n_glb = n p_glb = p @@ -139,20 +147,30 @@ contains if (cfl_adap_dt .or. cfl_const_dt) cfl_dt = .true. - if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == -17) .or. num_bc_patches > 0) then + if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == BC_DIRICHLET) .or. num_bc_patches > 0) then bc_io = .true. end if + + if (bc_x%beg == BC_PERIODIC .and. bc_x%end == BC_PERIODIC) periodic_bc(1) = .true. + if (bc_y%beg == BC_PERIODIC .and. bc_y%end == BC_PERIODIC) periodic_bc(2) = .true. + if (bc_z%beg == BC_PERIODIC .and. 
bc_z%end == BC_PERIODIC) periodic_bc(3) = .true. else call s_mpi_abort(trim(file_path) // ' is missing. Exiting.') end if end subroutine s_read_input_file - !> Validate that all user-provided inputs form a consistent simulation configuration + !> The goal of this procedure is to verify that each of the user provided inputs is valid and that their combination constitutes + !! a meaningful configuration for the simulation. impure subroutine s_check_input_file + ! Relative path to the current directory file in the case directory character(LEN=path_len) :: file_path - logical :: file_exist + + ! Logical used to check the existence of the current directory file + logical :: file_exist + + ! Logistics file_path = trim(case_dir) // '/.' @@ -167,15 +185,21 @@ contains end subroutine s_check_input_file - !> Read serial initial condition and grid data files and compute cell-width distributions + !> @brief Reads serial initial condition and grid data files and computes cell-width distributions. + !! @param q_cons_vf Cell-averaged conservative variables impure subroutine s_read_serial_data_files(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf character(LEN=path_len + 2*name_len) :: t_step_dir !< Relative path to the starting time-step directory character(LEN=path_len + 3*name_len) :: file_path !< Relative path to the grid and conservative variables data files logical :: file_exist + ! Logical used to check the existence of the data files + integer :: i, r + ! Confirming that the directory from which the initial condition and the grid data files are to be read in exists and + ! exiting otherwise + if (cfl_dt) then write (t_step_dir, '(A,I0,A,I0)') trim(case_dir) // '/p_all/p', proc_rank, '/', n_start else @@ -195,6 +219,7 @@ contains call s_assign_default_bc_type(bc_type) end if + ! 
Cell-boundary Locations in x-direction file_path = trim(t_step_dir) // '/x_cb.dat' inquire (FILE=trim(file_path), EXIST=file_exist) @@ -218,6 +243,7 @@ contains end do end if + ! Cell-boundary Locations in y-direction if (n > 0) then file_path = trim(t_step_dir) // '/y_cb.dat' @@ -234,6 +260,7 @@ contains y_cc(0:n) = y_cb(-1:n - 1) + dy(0:n)/2._wp end if + ! Cell-boundary Locations in z-direction if (p > 0) then file_path = trim(t_step_dir) // '/z_cb.dat' @@ -293,7 +320,8 @@ contains end subroutine s_read_serial_data_files - !> Read parallel initial condition and grid data files via MPI I/O + !> @brief Reads parallel initial condition and grid data files via MPI I/O. + !! @param q_cons_vf Conservative variables impure subroutine s_read_parallel_data_files(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -321,6 +349,7 @@ contains allocate (y_cb_glb(-1:n_glb)) allocate (z_cb_glb(-1:p_glb)) + ! Read in cell boundary locations in x-direction file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // 'x_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -343,8 +372,11 @@ contains call s_mpi_abort('File ' // trim(file_loc) // ' is missing. Exiting.') end if + ! Assigning local cell boundary locations x_cb(-1:m) = x_cb_glb((start_idx(1) - 1):(start_idx(1) + m)) + ! Computing the cell width distribution dx(0:m) = x_cb(0:m) - x_cb(-1:m - 1) + ! Computing the cell center locations x_cc(0:m) = x_cb(-1:m - 1) + dx(0:m)/2._wp if (ib) then @@ -358,6 +390,7 @@ contains end if if (n > 0) then + ! Read in cell boundary locations in y-direction file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // 'y_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -370,11 +403,15 @@ contains call s_mpi_abort('File ' // trim(file_loc) // ' is missing. Exiting.') end if + ! Assigning local cell boundary locations y_cb(-1:n) = y_cb_glb((start_idx(2) - 1):(start_idx(2) + n)) + ! 
Computing the cell width distribution dy(0:n) = y_cb(0:n) - y_cb(-1:n - 1) + ! Computing the cell center locations y_cc(0:n) = y_cb(-1:n - 1) + dy(0:n)/2._wp if (p > 0) then + ! Read in cell boundary locations in z-direction file_loc = trim(case_dir) // '/restart_data' // trim(mpiiofs) // 'z_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) @@ -387,8 +424,11 @@ contains call s_mpi_abort('File ' // trim(file_loc) // 'is missing. Exiting.') end if + ! Assigning local cell boundary locations z_cb(-1:p) = z_cb_glb((start_idx(3) - 1):(start_idx(3) + p)) + ! Computing the cell width distribution dz(0:p) = z_cb(0:p) - z_cb(-1:p - 1) + ! Computing the cell center locations z_cc(0:p) = z_cb(-1:p - 1) + dz(0:p)/2._wp end if end if @@ -407,6 +447,7 @@ contains if (file_exist) then call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + ! Initialize MPI data I/O if (down_sample) then call s_initialize_mpi_data_ds(q_cons_vf) else @@ -418,17 +459,20 @@ contains end if if (down_sample) then + ! Size of local arrays data_size = (m_ds + 3)*(n_ds + 3)*(p_ds + 3) m_glb_read = m_glb_ds + 1 n_glb_read = n_glb_ds + 1 p_glb_read = p_glb_ds + 1 else + ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) m_glb_read = m_glb + 1 n_glb_read = n_glb + 1 p_glb_read = p_glb + 1 end if + ! Resize some integers so MPI can read even the biggest file m_MOK = int(m_glb_read + 1, MPI_OFFSET_KIND) n_MOK = int(m_glb_read + 1, MPI_OFFSET_KIND) p_MOK = int(m_glb_read + 1, MPI_OFFSET_KIND) @@ -437,8 +481,9 @@ contains str_MOK = int(name_len, MPI_OFFSET_KIND) NVARS_MOK = int(sys_size, MPI_OFFSET_KIND) + ! Read the data for each variable if (bubbles_euler .or. elasticity) then - do i = 1, sys_size ! adv_idx%end + do i = 1, sys_size var_MOK = int(i, MPI_OFFSET_KIND) call MPI_FILE_READ(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, mpi_io_p, status, ierr) @@ -474,6 +519,7 @@ contains call s_mpi_abort('File ' // trim(file_loc) // ' is missing. 
Exiting.') end if else + ! Open the file to read conservative variables if (cfl_dt) then write (file_loc, '(I0,A)') n_start, '.dat' else @@ -485,14 +531,18 @@ contains if (file_exist) then call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + ! Initialize MPI data I/O + if (ib) then call s_initialize_mpi_data(q_cons_vf, ib_markers) else call s_initialize_mpi_data(q_cons_vf) end if + ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) + ! Resize some integers so MPI can read even the biggest file m_MOK = int(m_glb + 1, MPI_OFFSET_KIND) n_MOK = int(n_glb + 1, MPI_OFFSET_KIND) p_MOK = int(p_glb + 1, MPI_OFFSET_KIND) @@ -501,9 +551,11 @@ contains str_MOK = int(name_len, MPI_OFFSET_KIND) NVARS_MOK = int(sys_size, MPI_OFFSET_KIND) + ! Read the data for each variable if (bubbles_euler .or. elasticity) then - do i = 1, sys_size ! adv_idx%end + do i = 1, sys_size var_MOK = int(i, MPI_OFFSET_KIND) + ! Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_io_p, MPI_IO_DATA%view(i), 'native', mpi_info_int, ierr) @@ -513,6 +565,7 @@ contains if (qbmm .and. .not. polytropic) then do i = sys_size + 1, sys_size + 2*nb*nnode var_MOK = int(i, MPI_OFFSET_KIND) + ! Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_io_p, MPI_IO_DATA%view(i), 'native', mpi_info_int, ierr) @@ -523,6 +576,7 @@ contains do i = 1, sys_size var_MOK = int(i, MPI_OFFSET_KIND) + ! 
Initial displacement to skip at beginning of file disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) call MPI_FILE_SET_VIEW(ifile, disp, mpi_io_p, MPI_IO_DATA%view(i), 'native', mpi_info_int, ierr) @@ -549,7 +603,9 @@ contains end subroutine s_read_parallel_data_files - !> Initialize internal-energy equations from phase mass, mixture momentum, and total energy + !> The purpose of this procedure is to initialize the values of the internal-energy equations of each phase from the mass of + !! each phase, the mixture momentum and mixture-total-energy equations. + !! @param v_vf conservative variables subroutine s_initialize_internal_energy_equations(v_vf) type(scalar_field), dimension(sys_size), intent(inout) :: v_vf @@ -606,7 +662,7 @@ contains end subroutine s_initialize_internal_energy_equations - !> Advance the simulation by one time step, handling CFL-based dt and time-stepper dispatch + !> @brief Advances the simulation by one time step, handling CFL-based dt and time-stepper dispatch. impure subroutine s_perform_time_step(t_step, time_avg) integer, intent(inout) :: t_step @@ -672,9 +728,8 @@ contains end subroutine s_perform_time_step - !> Collect per-process wall-clock times and write aggregate performance metrics to file + !> @brief Collects per-process wall-clock times and writes aggregate performance metrics to file. impure subroutine s_save_performance_metrics(time_avg, time_final, io_time_avg, io_time_final, proc_time, io_proc_time, & - & file_exists) real(wp), intent(inout) :: time_avg, time_final @@ -733,7 +788,7 @@ contains end subroutine s_save_performance_metrics - !> Save conservative variable data to disk at the current time step + !> @brief Saves conservative variable data to disk at the current time step. 
impure subroutine s_save_data(t_step, start, finish, io_time_avg, nt) integer, intent(inout) :: t_step @@ -796,7 +851,7 @@ contains if (bubbles_lagrange) then $:GPU_UPDATE(host='[lag_id, mtn_pos, mtn_posPrev, mtn_vel, intfc_rad, intfc_vel, bub_R0, Rmax_stats, Rmin_stats, & & bub_dphidt, gas_p, gas_mv, gas_mg, gas_betaT, gas_betaC]') - do i = 1, nBubs + do i = 1, n_el_bubs_loc if (ieee_is_nan(intfc_rad(i, 1)) .or. intfc_rad(i, 1) <= 0._wp) then call s_mpi_abort("Bubble radius is negative or NaN, please reduce dt.") end if @@ -807,6 +862,20 @@ contains $:GPU_UPDATE(host='[Rmax_stats, Rmin_stats, gas_p, gas_mv, intfc_vel]') call s_write_restart_lag_bubbles(save_count) ! parallel if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats() + else if (particles_lagrange) then + $:GPU_UPDATE(host='[lag_part_id, particle_pos, particle_posPrev, particle_vel, particle_rad, particle_R0, & + & Rmax_stats_part, Rmin_stats_part, particle_mass]') + do i = 1, n_el_particles_loc + if (ieee_is_nan(particle_rad(i, 1)) .or. particle_rad(i, 1) <= 0._wp) then + call s_mpi_abort("Particle radius is negative or NaN, please reduce dt.") + end if + end do + + $:GPU_UPDATE(host='[q_particles(1)%sf]') + call s_write_data_files(q_cons_ts(stor)%vf, q_T_sf, q_prim_vf, save_count, bc_type, q_particles(1)) + $:GPU_UPDATE(host='[Rmax_stats_part, Rmin_stats_part]') + call s_write_restart_lag_particles(save_count) ! parallel + if (lag_params%write_bubbles_stats) call s_write_lag_particle_stats() else call s_write_data_files(q_cons_ts(stor)%vf, q_T_sf, q_prim_vf, save_count, bc_type) end if @@ -827,7 +896,7 @@ contains end subroutine s_save_data - !> Initialize all simulation sub-modules in the required dependency order + !> @brief Initializes all simulation sub-modules in the required dependency order. impure subroutine s_initialize_modules integer :: m_ds, n_ds, p_ds @@ -848,6 +917,11 @@ contains if (bubbles_euler .or. 
bubbles_lagrange) then call s_initialize_bubbles_model() end if + + if (particles_lagrange) then + call s_initialize_particles_model() + end if + call s_initialize_mpi_common_module() call s_initialize_mpi_proxy_module() call s_initialize_variables_conversion_module() @@ -888,6 +962,7 @@ contains end do end if + ! Reading in the user provided initial condition and grid data if (down_sample) then call s_read_data_files(q_cons_temp) call s_upsample_data(q_cons_ts(1)%vf, q_cons_temp) @@ -902,6 +977,7 @@ contains call s_read_data_files(q_cons_ts(1)%vf) end if + ! Populating the buffers of the grid variables using the boundary conditions call s_populate_grid_variables_buffers() if (model_eqns == 3) call s_initialize_internal_energy_equations(q_cons_ts(1)%vf) @@ -931,14 +1007,16 @@ contains end if call s_initialize_derived_variables() - if (bubbles_lagrange) call s_initialize_bubbles_EL_module(q_cons_ts(1)%vf) + + if (bubbles_lagrange) call s_initialize_bubbles_EL_module(q_cons_ts(1)%vf, bc_type) + if (particles_lagrange) call s_initialize_particles_EL_module(q_cons_ts(1)%vf, bc_type) if (hypoelasticity) call s_initialize_hypoelastic_module() if (hyperelasticity) call s_initialize_hyperelastic_module() end subroutine s_initialize_modules - !> Set up the MPI execution environment, bind GPUs, and decompose the computational domain + !> @brief Sets up the MPI execution environment, binds GPUs, and decomposes the computational domain. impure subroutine s_initialize_mpi_domain integer :: ierr @@ -955,8 +1033,11 @@ contains #endif #endif + ! Initializing MPI execution environment + call s_mpi_initialize() + ! Bind GPUs if OpenACC is enabled #ifdef MFC_GPU #ifndef MFC_MPI local_size = 1 @@ -979,6 +1060,9 @@ contains #endif #endif + ! The rank 0 processor assigns default values to the user inputs prior to reading them in from the input file. Next, the + ! user inputs are read and their consistency is checked. The identification of any inconsistencies will result in the + ! 
termination of the simulation. if (proc_rank == 0) then call s_assign_default_values_to_user_inputs() call s_read_input_file() @@ -1000,6 +1084,9 @@ contains #endif end if + ! Broadcasting the user inputs to all of the processors and performing the parallel computational domain decomposition. + ! Neither procedure has to be carried out if the simulation is in fact not truly executed in parallel. + call s_mpi_bcast_user_inputs() call s_initialize_parallel_io() @@ -1008,10 +1095,11 @@ contains end subroutine s_initialize_mpi_domain - !> Transfer initial conservative variable and model parameter data to the GPU device + !> @brief Transfers initial conservative variable and model parameter data to the GPU device. subroutine s_initialize_gpu_vars integer :: i + ! Update GPU DATA if (.not. down_sample) then do i = 1, sys_size @@ -1052,6 +1140,10 @@ contains $:GPU_UPDATE(device='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]') $:GPU_UPDATE(device='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]') + $:GPU_UPDATE(device='[bc_x%beg, bc_x%end]') + $:GPU_UPDATE(device='[bc_y%beg, bc_y%end]') + $:GPU_UPDATE(device='[bc_z%beg, bc_z%end]') + $:GPU_UPDATE(device='[bc_x%grcbc_in, bc_x%grcbc_out, bc_x%grcbc_vel_out]') $:GPU_UPDATE(device='[bc_y%grcbc_in, bc_y%grcbc_out, bc_y%grcbc_vel_out]') $:GPU_UPDATE(device='[bc_z%grcbc_in, bc_z%grcbc_out, bc_z%grcbc_vel_out]') @@ -1078,7 +1170,7 @@ contains end subroutine s_initialize_gpu_vars - !> Finalize and deallocate all simulation sub-modules in reverse initialization order + !> @brief Finalizes and deallocates all simulation sub-modules in reverse initialization order. impure subroutine s_finalize_modules call s_finalize_time_steppers_module() @@ -1105,6 +1197,7 @@ contains call s_finalize_boundary_common_module() if (relax) call s_finalize_relaxation_solver_module() if (bubbles_lagrange) call s_finalize_lagrangian_solver() + if (particles_lagrange) call s_finalize_particle_lagrangian_solver() if (viscous .and. 
(.not. igr)) then call s_finalize_viscous_module() end if @@ -1113,6 +1206,7 @@ contains if (surface_tension) call s_finalize_surface_tension_module() if (bodyForces) call s_finalize_body_forces_module() + ! Terminating MPI execution environment call s_mpi_finalize() end subroutine s_finalize_modules diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 4a741a4068..852728ae6f 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -15,6 +15,7 @@ module m_time_steppers use m_data_output use m_bubbles_EE use m_bubbles_EL + use m_particles_EL !< Lagrange particle dynamics routines use m_ibm use m_hyperelastic use m_mpi_proxy @@ -33,18 +34,18 @@ module m_time_steppers type(scalar_field), allocatable, dimension(:) :: q_prim_vf !< Cell-average primitive variables at the current time-stage type(scalar_field), allocatable, dimension(:) :: rhs_vf !< Cell-average RHS variables at the current time-stage type(integer_field), allocatable, dimension(:,:) :: bc_type !< Boundary condition identifiers + !> Cell-average primitive variables at consecutive TIMESTEPS type(vector_field), allocatable, dimension(:) :: q_prim_ts1, q_prim_ts2 real(wp), allocatable, dimension(:,:,:,:,:) :: rhs_pb type(scalar_field) :: q_T_sf !< Cell-average temperature variables at the current time-stage real(wp), allocatable, dimension(:,:,:,:,:) :: rhs_mv - real(wp), allocatable, dimension(:,:,:) :: max_dt integer, private :: num_ts !< Number of time stages in the time-stepping scheme integer :: stor !< storage index real(wp), allocatable, dimension(:,:) :: rk_coef integer, private :: num_probe_ts - $:GPU_DECLARE(create='[q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, q_prim_ts1, q_prim_ts2, rhs_mv, rhs_pb, max_dt, rk_coef, stor, bc_type]') + $:GPU_DECLARE(create='[q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, q_prim_ts1, q_prim_ts2, rhs_mv, rhs_pb, rk_coef, stor, bc_type]') !> @cond #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) @@ -59,7 +60,8 @@ module 
m_time_steppers contains - !> Initialize the time steppers module + !> The computation of parameters, the allocation of memory, the association of pointers and/or the execution of any other + !! procedures that are necessary to setup the module. impure subroutine s_initialize_time_steppers_module #ifdef FRONTIER_UNIFIED @@ -71,8 +73,8 @@ contains #endif #endif integer :: i, j !< Generic loop iterators - ! Setting number of time-stages for selected time-stepping scheme + ! Setting number of time-stages for selected time-stepping scheme if (time_stepper == 1) then num_ts = 1 else if (any(time_stepper == (/2, 3/))) then @@ -328,7 +330,7 @@ contains @:ALLOCATE(rhs_pb(idwbuff(1)%beg:idwbuff(1)%beg + 1, idwbuff(2)%beg:idwbuff(2)%beg + 1, & & idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb)) - else + else ! All other timesteps @:ALLOCATE(pb_ts(1)%sf(0,0,0,0,0)) @:ACC_SETUP_SFs(pb_ts(1)) @@ -362,7 +364,7 @@ contains @:ALLOCATE(rhs_mv(idwbuff(1)%beg:idwbuff(1)%beg + 1, idwbuff(2)%beg:idwbuff(2)%beg + 1, & & idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb)) - else + else ! All other timesteps @:ALLOCATE(mv_ts(1)%sf(0,0,0,0,0)) @:ACC_SETUP_SFs(mv_ts(1)) @@ -382,7 +384,7 @@ contains @:ACC_SETUP_SFs(rhs_vf(i)) @:PREFER_GPU(rhs_vf(i)%sf) end do - else + else ! All other timesteps do i = 1, sys_size @:ALLOCATE(rhs_vf(i)%sf(0:m, 0:n, 0:p)) @:ACC_SETUP_SFs(rhs_vf(i)) @@ -399,10 +401,6 @@ contains call s_open_ib_state_file() end if - if (cfl_dt) then - @:ALLOCATE(max_dt(0:m, 0:n, 0:p)) - end if - ! Allocating arrays to store the bc types @:ALLOCATE(bc_type(1:num_dims,1:2)) @@ -452,7 +450,7 @@ contains end subroutine s_initialize_time_steppers_module - !> Advance the solution one full step using a TVD Runge-Kutta time integrator + !> @brief Advances the solution one full step using a TVD Runge-Kutta time integrator. impure subroutine s_tvd_rk(t_step, time_avg, nstage) #ifdef _CRAYFTN @@ -497,7 +495,12 @@ contains end if end if - if (bubbles_lagrange .and. .not. 
adap_dt) call s_update_lagrange_tdv_rk(stage=s) + if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(q_prim_vf, bc_type, stage=s) + + if (particles_lagrange) then + call s_update_lagrange_particles_tdv_rk(q_prim_vf, bc_type, stage=s) + end if + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -520,6 +523,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() + ! Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) @@ -589,41 +593,41 @@ contains if (t_step >= 2) then wall_time_avg = (wall_time + (t_step - 2)*wall_time_avg)/(t_step - 1) - else + else ! All other timesteps wall_time_avg = 0._wp end if end subroutine s_tvd_rk !> Bubble source part in Strang operator splitting scheme + !! @param stage Current time-stage impure subroutine s_adaptive_dt_bubble(stage) integer, intent(in) :: stage type(vector_field) :: gm_alpha_qp - call s_convert_conservative_to_primitive_variables(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, idwint) + call s_convert_conservative_to_primitive_variables(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, idwbuff) if (bubbles_euler) then call s_compute_bubble_EE_source(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, divu) call s_comp_alpha_from_n(q_cons_ts(1)%vf) else if (bubbles_lagrange) then call s_populate_variables_buffers(bc_type, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf) - call s_compute_bubble_EL_dynamics(q_prim_vf, stage) - call s_transfer_data_to_tmp() - call s_smear_voidfraction() + call s_compute_bubble_EL_dynamics(q_prim_vf, bc_type, stage) + if (stage == 3) then if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p, gas_mv, intfc_rad, intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if - call s_write_void_evol(mytime) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) end if end if end subroutine s_adaptive_dt_bubble - !> Compute the 
global time step size from CFL stability constraints across all cells
+    !> @brief Computes the global time step size from CFL stability constraints across all cells.
     impure subroutine s_compute_dt()
 
         real(wp) :: rho !< Cell-avg. density
@@ -644,6 +648,7 @@ contains
         real(wp) :: H !< Cell-avg. enthalpy
         real(wp), dimension(2) :: Re !< Cell-avg. Reynolds numbers
         type(vector_field) :: gm_alpha_qp
+        real(wp) :: max_dt
         real(wp) :: dt_local
 
         integer :: j, k, l !< Generic loop iterators
@@ -651,7 +656,9 @@ contains
             call s_convert_conservative_to_primitive_variables(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, idwint)
         end if
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]')
+        dt_local = huge(1.0_wp)
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]', &
+            & reduction='[[dt_local]]', reductionOp='[min]')
         do l = 0, p
             do k = 0, n
                 do j = 0, m
@@ -665,18 +672,15 @@ contains
                     call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv)
 
                     call s_compute_dt_from_cfl(vel, c, max_dt, rho, Re, j, k, l)
+                    dt_local = min(dt_local, max_dt)
                 end do
             end do
         end do
         $:END_GPU_PARALLEL_LOOP()
 
-        #:call GPU_PARALLEL(copyout='[dt_local]', copyin='[max_dt]')
-            dt_local = minval(max_dt)
-        #:endcall GPU_PARALLEL
-
         if (num_procs == 1) then
             dt = dt_local
-        else
+        else ! multiple ranks: take the global minimum dt across processes
            call s_mpi_allreduce_min(dt_local, dt)
         end if
@@ -684,7 +688,10 @@ contains
 
     end subroutine s_compute_dt
 
-    !> Apply the body forces source term at each Runge-Kutta stage
+    !> This subroutine applies the body forces source term at each Runge-Kutta stage
+    !! @param q_cons_vf Conservative variables
+    !! @param q_prim_vf_in Primitive variables
+    !! 
@param rhs_vf_in Right-hand side variables
subroutine s_apply_bodyforces(q_cons_vf, q_prim_vf_in, rhs_vf_in, ldt)

type(scalar_field), dimension(1:sys_size), intent(inout) :: q_cons_vf
@@ -712,7 +719,7 @@ contains

end subroutine s_apply_bodyforces

- !> Update immersed boundary positions and velocities at the current Runge-Kutta stage
+ !> @brief Updates immersed boundary positions and velocities at the current Runge-Kutta stage.
subroutine s_propagate_immersed_boundaries(s)

integer, intent(in) :: s
@@ -754,8 +761,9 @@ contains
! update the angular velocity with the torque value
patch_ib(i)%angular_vel = (patch_ib(i)%angular_vel*patch_ib(i)%moment) + (rk_coef(s, &
& 3)*dt*patch_ib(i)%torque/rk_coef(s, 4)) ! add the torque to the angular momentum
- call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel)
- ! update the moment of inertia to be based on the direction of the angular momentum
+ call s_compute_moment_of_inertia(i, &
+ & patch_ib(i)%angular_vel)
+ ! update the moment of inertia to be based on the direction of the angular momentum
patch_ib(i)%angular_vel = patch_ib(i)%angular_vel/patch_ib(i) &
& %moment ! convert back to angular velocity with the new moment of inertia
end if
@@ -780,7 +788,8 @@ contains

end subroutine s_propagate_immersed_boundaries

- !> Save the temporary q_prim_vf vector into q_prim_ts for use in p_main
+ !> This subroutine saves the temporary q_prim_vf vector into the q_prim_ts vector that is then used in p_main
+ !! @param t_step current time-step
subroutine s_time_step_cycling(t_step)

integer, intent(in) :: t_step
@@ -862,6 +871,7 @@ contains
use hipfort_check
#endif
integer :: i, j !< Generic loop iterators
+ ! 
Deallocating the cell-average conservative variables #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) do j = 1, sys_size diff --git a/toolchain/mfc/params/definitions.py b/toolchain/mfc/params/definitions.py index 2f3631c228..a193d9df49 100644 --- a/toolchain/mfc/params/definitions.py +++ b/toolchain/mfc/params/definitions.py @@ -627,8 +627,8 @@ def get_value_label(param_name: str, value: int) -> str: }, # Bubbles "bubble_model": { - "choices": [1, 2, 3], - "value_labels": {1: "Gilmore", 2: "Keller-Miksis", 3: "Rayleigh-Plesset"}, + "choices": [0, 1, 2, 3], + "value_labels": {0: "Particle", 1: "Gilmore", 2: "Keller-Miksis", 3: "Rayleigh-Plesset"}, }, # Output "format": { @@ -880,6 +880,10 @@ def _load(): for n in ["polytropic", "bubbles_euler", "polydisperse", "qbmm", "bubbles_lagrange"]: _r(n, LOG, {"bubbles"}) + # Particles + for n in ["particles_lagrange"]: + _r(n, LOG, {"particles"}) + # Viscosity _r("viscous", LOG, {"viscosity"}) @@ -1053,6 +1057,13 @@ def _load(): _r(f"p_{d}", REAL, math=r"\f$\phi_" + d + r"\f$") _r(f"bf_{d}", LOG) + # Interfacial flow inputs + _r("normMag", REAL) + _r("p0_ic", REAL) + _r("g0_ic", REAL) + _r("normFac", REAL) + _r("interface_file", STR) + # INDEXED PARAMETERS # patch_icpp (10 patches) @@ -1149,10 +1160,12 @@ def _load(): ]: _r(f"bub_pp%{a}", REAL, {"bubbles"}, math=sym) + # particle_pp (particle properties) + for a in ["rho0ref_particle", "cp_particle"]: + _r(f"particle_pp%{a}", REAL, {"particles"}) + # patch_ib (immersed boundaries) — registered as indexed family for O(1) lookup. - # max_index is None so the parameter registry stays compact (no enumeration). - # The Fortran-side upper bound (num_patches_max in m_constants.fpp) is parsed - # and enforced by the case_validator, not by max_index here. + # max_index is sourced from Fortran's num_patches_max in m_constants.fpp. 
_ib_tags = {"ib"} _ib_attrs: Dict[str, tuple] = {} for a in ["geometry", "moving_ibm"]: @@ -1260,12 +1273,20 @@ def _load(): _r(f"simplex_params%perturb_vel_offset({d},{j})", REAL) # lag_params (Lagrangian bubbles) - for a in ["heatTransfer_model", "massTransfer_model", "pressure_corrector", "write_bubbles", "write_bubbles_stats"]: + for a in ["heatTransfer_model", "massTransfer_model", "pressure_corrector", "write_bubbles", "write_bubbles_stats", "pressure_force", "gravity_force", "write_void_evol"]: _r(f"lag_params%{a}", LOG, {"bubbles"}) - for a in ["solver_approach", "cluster_type", "smooth_type", "nBubs_glb"]: + for a in ["solver_approach", "cluster_type", "smooth_type", "nBubs_glb", "drag_model", "vel_model", "charNz"]: _r(f"lag_params%{a}", INT, {"bubbles"}) for a in ["epsilonb", "valmaxvoid", "charwidth", "c0", "rho0", "T0", "x0", "Thost"]: _r(f"lag_params%{a}", REAL, {"bubbles"}) + _r("lag_params%input_path", STR, {"bubbles"}) + + # --- lag_params (Lagrangian particles) --- + for a in ["nParticles_glb", "stokes_drag", "qs_drag_model", "added_mass_model", "interpolation_order"]: + _r(f"lag_params%{a}", INT, {"particles"}) + + for a in ["collision_force"]: + _r(f"lag_params%{a}", LOG, {"particles"}) # chem_params for a in ["diffusion", "reactions"]: diff --git a/toolchain/mfc/params/descriptions.py b/toolchain/mfc/params/descriptions.py index 7906b6d38d..2a15f8ed37 100644 --- a/toolchain/mfc/params/descriptions.py +++ b/toolchain/mfc/params/descriptions.py @@ -250,6 +250,14 @@ "lag_mg_wrt": "Write bubble gas mass", "lag_betaT_wrt": "Write bubble heat transfer coefficient", "lag_betaC_wrt": "Write bubble mass transfer coefficient", + # Interfacial flow parameters + "interface_file": "Path to interface geometry data file", + "normFac": "Interface normalization factor", + "normMag": "Interface normal magnitude", + "g0_ic": "Initial gas volume fraction for interfacial IC", + "p0_ic": "Initial pressure for interfacial IC", + # Particle parameters + 
"particles_lagrange": "Enable Lagrangian solid particle solver", } # Patterns for auto-generating descriptions of indexed parameters