From 31d11da72e0aa456332e0d5011fa8f88c7cabe99 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 21:42:50 -0400
Subject: [PATCH 01/15] attempted debug of MSVC stack overflow

---
 quest/src/cpu/cpu_subroutines.cpp |  4 ++--
 quest/src/cpu/cpu_types.hpp       | 20 +++++++++++++-------
 2 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index ad6bc3c4..1be3b913 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -412,7 +412,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
-    auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah!
+    auto elems = getCpuQcomps2(matr.elems); // MSVC requires explicit template param, bah!
 
     auto sortedQubits   = util_getSorted(ctrls, {targ});
     auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
@@ -495,7 +495,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
-    auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah!
+    auto elems = getCpuQcomps4(matr.elems); // MSVC requires explicit template param, bah!
 
     auto sortedQubits   = util_getSorted(ctrls, {targ1, targ2});
     auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0});
diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp
index a426dfce..b7f52d40 100644
--- a/quest/src/cpu/cpu_types.hpp
+++ b/quest/src/cpu/cpu_types.hpp
@@ -127,16 +127,22 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) {
 
 
 // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!))
-template <int dim>
-std::array<std::array<cpu_qcomp,dim>,dim> getCpuQcomps(qcomp matr[dim][dim]) {
+std::array<std::array<cpu_qcomp,2>,2> getCpuQcomps2(qcomp matr[2][2]) {
 
-    // detect brain-dead compiler inferencing (looking at you MSVC...)
-    static_assert(dim == 2 || dim == 4, "getCpuQcomps called with unexpected dim");
+    std::array<std::array<cpu_qcomp,2>,2> out;
 
-    std::array<std::array<cpu_qcomp,dim>,dim> out;
+    for (int i=0; i<2; i++)
+        for (int j=0; j<2; j++)
+            out[i][j] = getCpuQcomp(matr[i][j]);
+
+    return out;
+}
+std::array<std::array<cpu_qcomp,4>,4> getCpuQcomps4(qcomp matr[4][4]) {
+
+    std::array<std::array<cpu_qcomp,4>,4> out;
 
-    for (int i=0; i<dim; i++)
-        for (int j=0; j<dim; j++)
+    for (int i=0; i<4; i++)
+        for (int j=0; j<4; j++)
             out[i][j] = getCpuQcomp(matr[i][j]);
 
     return out;

From 471d746b2f8a897c1d99de8a6b69f8bb351ecc46 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 21:45:19 -0400
Subject: [PATCH 02/15] Retriggering CI

---
 .github/workflows/compile.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index d1901553..676085fe 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -28,6 +28,7 @@ on:
     branches:
       - main
       - devel
+      - msvc-so-debug
   pull_request:
     branches:
       - main

From bc8464185af0953bf393b7582631f499caa7907f Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 22:26:53 -0400
Subject: [PATCH 03/15] Rename

---
 .github/workflows/compile.yml     | 1 -
 quest/src/cpu/cpu_subroutines.cpp | 4 ++--
 quest/src/cpu/cpu_types.hpp       | 9 +++++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 676085fe..d1901553 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -28,7 +28,6 @@ on:
     branches:
       - main
       - devel
-      - msvc-so-debug
   pull_request:
     branches:
       - main
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 1be3b913..619d6072 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -412,7 +412,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
-    auto elems = getCpuQcomps2(matr.elems); // MSVC requires explicit template param, bah!
+    auto elems = getCpuQcompsMatr1(matr.elems); // MSVC requires explicit template param, bah!
 
     auto sortedQubits   = util_getSorted(ctrls, {targ});
     auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
@@ -495,7 +495,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
-    auto elems = getCpuQcomps4(matr.elems); // MSVC requires explicit template param, bah!
+    auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah!
 
     auto sortedQubits   = util_getSorted(ctrls, {targ1, targ2});
     auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0});
diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp
index b7f52d40..9539926c 100644
--- a/quest/src/cpu/cpu_types.hpp
+++ b/quest/src/cpu/cpu_types.hpp
@@ -127,7 +127,12 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) {
 
 
 // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!))
-std::array<std::array<cpu_qcomp,2>,2> getCpuQcomps2(qcomp matr[2][2]) {
+std::array<std::array<cpu_qcomp,2>,2> getCpuQcompsMatr1(qcomp matr[2][2]) {
+
+    // dumb and explicit here because MSVC + OpenMP breaks
+    // when templating this - not worth fixing here because
+    // we are considering a refactor which merges cpu_types.hpp
+    // with gpu_types.cuh anyway
 
     std::array<std::array<cpu_qcomp,2>,2> out;
 
@@ -137,7 +142,7 @@ std::array<std::array<cpu_qcomp,2>,2> getCpuQcomps2(qcomp matr[2][2]) {
 
     return out;
 }
-std::array<std::array<cpu_qcomp,4>,4> getCpuQcomps4(qcomp matr[4][4]) {
+std::array<std::array<cpu_qcomp,4>,4> getCpuQcompsMatr2(qcomp matr[4][4]) {
 
     std::array<std::array<cpu_qcomp,4>,4> out;
 

From b21ea150e87ccd7bf3e0ec7e9da137a7e0a46596 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 22:55:47 -0400
Subject: [PATCH 04/15] more debugging grr

---
 .github/workflows/compile.yml     |  1 +
 quest/src/cpu/cpu_subroutines.cpp | 50 ++++++++++++++++++++++++++-----
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index d1901553..676085fe 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -28,6 +28,7 @@ on:
     branches:
       - main
       - devel
+      - msvc-so-debug
   pull_request:
     branches:
       - main
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 619d6072..e9689ac7 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -412,7 +412,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
-    auto elems = getCpuQcompsMatr1(matr.elems); // MSVC requires explicit template param, bah!
+    // auto elems = getCpuQcompsMatr1(matr.elems);
+    cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); // MSVC cannot handle 2D cpu_qcomps
+    cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]);
+    cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]);
+    cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]);
 
     auto sortedQubits   = util_getSorted(ctrls, {targ});
     auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
@@ -432,8 +436,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
         cpu_qcomp amp0 = amps[i0];
         cpu_qcomp amp1 = amps[i1];
 
-        amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1;
-        amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1;
+        // amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1;
+        // amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1;
+        amps[i0] = m00*amp0 + m01*amp1;
+        amps[i1] = m10*amp0 + m11*amp1;
     }
 }
 
@@ -495,7 +501,30 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
-    auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah!
+
+    // auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah!
+
+    cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]);
+    cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]);
+    cpu_qcomp m02 = getCpuQcomp(matr.elems[0][2]);
+    cpu_qcomp m03 = getCpuQcomp(matr.elems[0][3]);
+
+    cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]);
+    cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]);
+    cpu_qcomp m12 = getCpuQcomp(matr.elems[1][2]);
+    cpu_qcomp m13 = getCpuQcomp(matr.elems[1][3]);
+
+    cpu_qcomp m20 = getCpuQcomp(matr.elems[2][0]);
+    cpu_qcomp m21 = getCpuQcomp(matr.elems[2][1]);
+    cpu_qcomp m22 = getCpuQcomp(matr.elems[2][2]);
+    cpu_qcomp m23 = getCpuQcomp(matr.elems[2][3]);
+
+    cpu_qcomp m30 = getCpuQcomp(matr.elems[3][0]);
+    cpu_qcomp m31 = getCpuQcomp(matr.elems[3][1]);
+    cpu_qcomp m32 = getCpuQcomp(matr.elems[3][2]);
+    cpu_qcomp m33 = getCpuQcomp(matr.elems[3][3]);
+
+    
 
     auto sortedQubits   = util_getSorted(ctrls, {targ1, targ2});
     auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0});
@@ -520,10 +549,15 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         cpu_qcomp amp11 = amps[i11];
 
         // amps[i_n] = sum_j matr.elems[n][j] amp[i_n]
-        amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11;
-        amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11;
-        amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11;
-        amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11;
+        // amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11;
+        // amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11;
+        // amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11;
+        // amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11;
+
+        amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11;
+        amps[i01] = m10*amp00 + m11*amp01 + m12*amp10 + m13*amp11;
+        amps[i10] = m20*amp00 + m21*amp01 + m22*amp10 + m23*amp11;
+        amps[i11] = m30*amp00 + m31*amp01 + m32*amp10 + m33*amp11;
     }
 }
 

From 92957e2c1d61c962b55b175c97c87c77473784b4 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:15:14 -0400
Subject: [PATCH 05/15] enable SEH

---
 tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 31b0ea75..1a9625ce 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,9 @@
 # @author Oliver Thomson Brown
 
+# DEBUG:
+# trying to get MSVC to compile
+add_compile_definitions(CATCH_CONFIG_WINDOWS_SEH)
+
 add_executable(tests
   main.cpp
 )

From 13ba54ced5fd036e586097d304aa1a05a2e1b490 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:30:23 -0400
Subject: [PATCH 06/15] Enable exception unwinding

---
 CMakeLists.txt       | 9 +++++++++
 tests/CMakeLists.txt | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dc209ca5..edff2fb1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -315,6 +315,11 @@ target_compile_options(QuEST
   $<$<COMPILE_LANGUAGE:C>:${WARNING_FLAG}>
 )
 
+target_compile_options(QuEST
+  PRIVATE
+  $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+)
+
 
 
 # ============================
@@ -593,6 +598,10 @@ if (ENABLE_TESTING)
     )
     
     FetchContent_MakeAvailable(Catch2)
+
+    target_compile_options(Catch2 PRIVATE
+      $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+    )
   
   # otherwise fail
   else()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1a9625ce..b465924a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,6 +10,12 @@ add_executable(tests
 target_link_libraries(tests PRIVATE QuEST::QuEST Catch2::Catch2)
 target_compile_features(tests PUBLIC cxx_std_20)
 
+
+target_compile_options(tests PRIVATE
+  $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+)
+
+
 add_subdirectory(unit)
 add_subdirectory(utils)
 add_subdirectory(integration)

From 8f7d540d30b528c5fecae27153bf8d120f8da57b Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:36:29 -0400
Subject: [PATCH 07/15] Try just EHs

---
 CMakeLists.txt       | 4 ++--
 tests/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index edff2fb1..f254a169 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -317,7 +317,7 @@ target_compile_options(QuEST
 
 target_compile_options(QuEST
   PRIVATE
-  $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+  $<$<CXX_COMPILER_ID:MSVC>:/EHs>
 )
 
 
@@ -600,7 +600,7 @@ if (ENABLE_TESTING)
     FetchContent_MakeAvailable(Catch2)
 
     target_compile_options(Catch2 PRIVATE
-      $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+      $<$<CXX_COMPILER_ID:MSVC>:/EHs>
     )
   
   # otherwise fail
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b465924a..1ab5766f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,7 +12,7 @@ target_compile_features(tests PUBLIC cxx_std_20)
 
 
 target_compile_options(tests PRIVATE
-  $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
+  $<$<CXX_COMPILER_ID:MSVC>:/EHs>
 )
 
 

From 79200eb9f51e8d669c3dc90c278d3f843d43b90b Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:37:55 -0400
Subject: [PATCH 08/15] stringification MSVC bug workaround

---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1ab5766f..464bcec0 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,6 +3,7 @@
 # DEBUG:
 # trying to get MSVC to compile
 add_compile_definitions(CATCH_CONFIG_WINDOWS_SEH)
+add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION)
 
 add_executable(tests
   main.cpp

From b6380cf59297a94f305ac1774155b2737009cb23 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:38:35 -0400
Subject: [PATCH 09/15] Enable bigobj

---
 tests/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 464bcec0..ad26883c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -15,6 +15,9 @@ target_compile_features(tests PUBLIC cxx_std_20)
 target_compile_options(tests PRIVATE
   $<$<CXX_COMPILER_ID:MSVC>:/EHs>
 )
+target_compile_options(tests PRIVATE
+  $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
+)
 
 
 add_subdirectory(unit)

From d0b620fab2c83ab49749ce578c70c4d3e5f7f510 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:39:42 -0400
Subject: [PATCH 10/15] bigobj in Catch2 too

---
 CMakeLists.txt       | 3 +++
 tests/CMakeLists.txt | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f254a169..89900be3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -602,6 +602,9 @@ if (ENABLE_TESTING)
     target_compile_options(Catch2 PRIVATE
       $<$<CXX_COMPILER_ID:MSVC>:/EHs>
     )
+    target_compile_options(Catch2 PRIVATE
+      $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
+    )
   
   # otherwise fail
   else()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ad26883c..f3fb5483 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -18,6 +18,9 @@ target_compile_options(tests PRIVATE
 target_compile_options(tests PRIVATE
   $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
 )
+target_compile_options(Catch2 PRIVATE
+  $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
+)
 
 
 add_subdirectory(unit)

From b2b5b022f4781ea9a0e2c1ff0edd307a27222915 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 20 Apr 2026 23:52:37 -0400
Subject: [PATCH 11/15] Retain only bigobj

---
 CMakeLists.txt       | 12 ------------
 tests/CMakeLists.txt | 13 ++-----------
 2 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 89900be3..dc209ca5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -315,11 +315,6 @@ target_compile_options(QuEST
   $<$<COMPILE_LANGUAGE:C>:${WARNING_FLAG}>
 )
 
-target_compile_options(QuEST
-  PRIVATE
-  $<$<CXX_COMPILER_ID:MSVC>:/EHs>
-)
-
 
 
 # ============================
@@ -598,13 +593,6 @@ if (ENABLE_TESTING)
     )
     
     FetchContent_MakeAvailable(Catch2)
-
-    target_compile_options(Catch2 PRIVATE
-      $<$<CXX_COMPILER_ID:MSVC>:/EHs>
-    )
-    target_compile_options(Catch2 PRIVATE
-      $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
-    )
   
   # otherwise fail
   else()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f3fb5483..6289e616 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,10 +1,5 @@
 # @author Oliver Thomson Brown
 
-# DEBUG:
-# trying to get MSVC to compile
-add_compile_definitions(CATCH_CONFIG_WINDOWS_SEH)
-add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION)
-
 add_executable(tests
   main.cpp
 )
@@ -12,12 +7,8 @@ target_link_libraries(tests PRIVATE QuEST::QuEST Catch2::Catch2)
 target_compile_features(tests PUBLIC cxx_std_20)
 
 
-target_compile_options(tests PRIVATE
-  $<$<CXX_COMPILER_ID:MSVC>:/EHs>
-)
-target_compile_options(tests PRIVATE
-  $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
-)
+# DEBUG:
+# trying to get MSVC to compile
 target_compile_options(Catch2 PRIVATE
   $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
 )

From 215746a5b427b121ba9a8b3c980858bdc1ff7241 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 21 Apr 2026 00:03:55 -0400
Subject: [PATCH 12/15] just EHsc and bigobj

---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6289e616..78534aef 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,6 +10,7 @@ target_compile_features(tests PUBLIC cxx_std_20)
 # DEBUG:
 # trying to get MSVC to compile
 target_compile_options(Catch2 PRIVATE
+  $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
   $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
 )
 

From f4356afdaaa39dda61528672b1f5ef77e9c7e07f Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 21 Apr 2026 00:15:20 -0400
Subject: [PATCH 13/15] increase mem for template instantiation

---
 .github/workflows/compile.yml | 2 +-
 tests/CMakeLists.txt          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 676085fe..595b585c 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -62,7 +62,7 @@ jobs:
       # compile QuEST with all combinations of below flags
       matrix:
         os: [windows-latest, ubuntu-latest, macos-latest]
-        precision: [1, 2, 4]
+        precision: [4, 2, 1] ################################### DEBUG! long double first for MSVC debug
         omp:       [ON, OFF]
         mpi:       [ON, OFF]
         cuda:      [ON, OFF]
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 78534aef..0996bf75 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -12,6 +12,7 @@ target_compile_features(tests PUBLIC cxx_std_20)
 target_compile_options(Catch2 PRIVATE
   $<$<CXX_COMPILER_ID:MSVC>:/EHsc>
   $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
+  $<$<CXX_COMPILER_ID:MSVC>:/Zm500>
 )
 
 

From ef23df1a5c103c4983df440b04d46602878b20e3 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 21 Apr 2026 00:21:27 -0400
Subject: [PATCH 14/15] try fast compile

---
 tests/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0996bf75..aba33339 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -14,6 +14,8 @@ target_compile_options(Catch2 PRIVATE
   $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
   $<$<CXX_COMPILER_ID:MSVC>:/Zm500>
 )
+add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION)
+add_compile_definitions(CATCH_CONFIG_FAST_COMPILE)
 
 
 add_subdirectory(unit)

From aeb1c5bc6e896668aa377b985068d16e31006bb2 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 22 Apr 2026 02:01:39 -0400
Subject: [PATCH 15/15] avoiding template?!

---
 quest/src/core/fastmath.hpp | 154 ++++++++++++++++++------------------
 quest/src/cpu/cpu_types.hpp |  81 +++++++++++++++++++
 quest/src/gpu/gpu_types.cuh |  84 ++++++++++++++++++++
 3 files changed, 242 insertions(+), 77 deletions(-)

diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp
index 0dbcee9e..b27429b4 100644
--- a/quest/src/core/fastmath.hpp
+++ b/quest/src/core/fastmath.hpp
@@ -100,83 +100,83 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ
  */
 
 
-// T = qcomp, cpu_qcomp, gpu_qcomp
-template <typename T>
-INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {
-
-    // this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
-    // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
-    // Paulis on any of the 64 sites, but the latter's PauliStr is always
-    // constrainted to the lower 32 sites (because a 32-qubit density matrix
-    // is already too large for the world's computers). As such, the latter
-    // scenario can be optimised since str.highPaulis == 0, making the second
-    // loop below redundant. Avoiding this loop can at most half the runtime,
-    // though opens the risk that the former caller erroneously has its upper
-    // Paulis ignore. We forego this optimisation in defensive design, and
-    // because this function is only invoked during data structure initilisation
-    // and ergo infrequently.
-
-    // regrettably duplicated from paulis.cpp which is inaccessible here
-    constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;
-
-    // T-agnostic complex literals
-    T p0, p1,n1, pI,nI;
-    p0 = {0,  0}; //  0
-    p1 = {+1, 0}; //  1
-    n1 = {-1, 0}; // -1
-    pI = {0, +1}; //  i
-    nI = {0, -1}; // -i
-
-    // 'matrices' below is not declared constexpr or static const, even though
-    // it is fixed/known at compile-time, because this makes it incompatible
-    // with CUDA kernels/thrust. It is instead left as runtime innitialisation
-    // but this poses no real slowdown; this function, and its caller, are inlined
-    // so these 16 amps are re-processed one for each full enumeration of the
-    // PauliStrSum which is expected to have significantly more terms/coeffs
-    T matrices[][2][2] = {
-        {{p1,p0},{p0,p1}},  // I
-        {{p0,p1},{p1,p0}},  // X
-        {{p0,nI},{pI,p0}},  // Y
-        {{p1,p0},{p0,n1}}}; // Z
-
-    T elem = p1; // 1
-
-    // could be compile-time unrolled into 32 iterations
-    for (int t=0; t<numPaulisPerMask; t++) {
-        int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
-        int i = getBit(row, t);
-        int j = getBit(col, t);
-        elem *= matrices[p][i][j];
-    }
-
-    // could be compile-time unrolled into 32 iterations
-    for (int t=0; t<numPaulisPerMask; t++) {
-        int p = getTwoAdjacentBits(str.highPaulis, 2*t);
-        int i = getBit(row, t + numPaulisPerMask);
-        int j = getBit(col, t + numPaulisPerMask);
-        elem *= matrices[p][i][j];
-    }
-
-    return elem;
-}
-
-
-// T = qcomp, cpu_qcomp, gpu_qcomp
-template <typename T>
-INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {
-
-    // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot 
-    // be directly processed in CUDA kernels/thrust due to its 'qcomp' field.
-    // it also assumes str.highPaulis==0 for all str in strings, as per above func.
-
-    T elem = {0, 0}; // type-agnostic complex literal
-
-    // this loop is expected exponentially smaller than caller's loop
-    for (qindex n=0; n<numTerms; n++)
-        elem += coeffs[n] * fast_getPauliStrElem<T>(strings[n], row, col);
-
-    return elem;
-}
+// // T = qcomp, cpu_qcomp, gpu_qcomp
+// template <typename T>
+// INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {
+
+//     // this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
+//     // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
+//     // Paulis on any of the 64 sites, but the latter's PauliStr is always
+//     // constrainted to the lower 32 sites (because a 32-qubit density matrix
+//     // is already too large for the world's computers). As such, the latter
+//     // scenario can be optimised since str.highPaulis == 0, making the second
+//     // loop below redundant. Avoiding this loop can at most half the runtime,
+//     // though opens the risk that the former caller erroneously has its upper
+//     // Paulis ignore. We forego this optimisation in defensive design, and
+//     // because this function is only invoked during data structure initilisation
+//     // and ergo infrequently.
+
+//     // regrettably duplicated from paulis.cpp which is inaccessible here
+//     constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;
+
+//     // T-agnostic complex literals
+//     T p0, p1,n1, pI,nI;
+//     p0 = {0,  0}; //  0
+//     p1 = {+1, 0}; //  1
+//     n1 = {-1, 0}; // -1
+//     pI = {0, +1}; //  i
+//     nI = {0, -1}; // -i
+
+//     // 'matrices' below is not declared constexpr or static const, even though
+//     // it is fixed/known at compile-time, because this makes it incompatible
+//     // with CUDA kernels/thrust. It is instead left as runtime innitialisation
+//     // but this poses no real slowdown; this function, and its caller, are inlined
+//     // so these 16 amps are re-processed one for each full enumeration of the
+//     // PauliStrSum which is expected to have significantly more terms/coeffs
+//     T matrices[][2][2] = {
+//         {{p1,p0},{p0,p1}},  // I
+//         {{p0,p1},{p1,p0}},  // X
+//         {{p0,nI},{pI,p0}},  // Y
+//         {{p1,p0},{p0,n1}}}; // Z
+
+//     T elem = p1; // 1
+
+//     // could be compile-time unrolled into 32 iterations
+//     for (int t=0; t<numPaulisPerMask; t++) {
+//         int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
+//         int i = getBit(row, t);
+//         int j = getBit(col, t);
+//         elem *= matrices[p][i][j];
+//     }
+
+//     // could be compile-time unrolled into 32 iterations
+//     for (int t=0; t<numPaulisPerMask; t++) {
+//         int p = getTwoAdjacentBits(str.highPaulis, 2*t);
+//         int i = getBit(row, t + numPaulisPerMask);
+//         int j = getBit(col, t + numPaulisPerMask);
+//         elem *= matrices[p][i][j];
+//     }
+
+//     return elem;
+// }
+
+
+// // T = qcomp, cpu_qcomp, gpu_qcomp
+// template <typename T>
+// INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {
+
+//     // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot 
+//     // be directly processed in CUDA kernels/thrust due to its 'qcomp' field.
+//     // it also assumes str.highPaulis==0 for all str in strings, as per above func.
+
+//     T elem = {0, 0}; // type-agnostic complex literal
+
+//     // this loop is expected exponentially smaller than caller's loop
+//     for (qindex n=0; n<numTerms; n++)
+//         elem += coeffs[n] * fast_getPauliStrElem<T>(strings[n], row, col);
+
+//     return elem;
+// }
 
 
 #endif // FASTMATH_HPP
\ No newline at end of file
diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp
index 9539926c..651ad0e4 100644
--- a/quest/src/cpu/cpu_types.hpp
+++ b/quest/src/cpu/cpu_types.hpp
@@ -198,4 +198,85 @@ static_assert(std::is_trivially_copyable_v<cpu_qcomp>);
 // casting is safe for all circumstances (e.g. heap mem, static lists)
 
 
+
+
+
+INLINE cpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {
+
+    // returns element (row, col) of the matrix of the given PauliStr, computed
+    // as the product of one 2x2 Pauli matrix element per site. Called by both
+    // fullstatediagmatr_setElemsToPauliStrSum(), whose PauliStr can have Paulis
+    // on any of the 64 sites, and densmatr_setAmpsToPauliStrSum_sub(), whose
+    // PauliStr is always constrained to the lower 32 sites (a 32-qubit density
+    // matrix is already too large for the world's computers). The latter could
+    // skip the second loop below (as str.highPaulis == 0), at most halving the
+    // runtime; we defensively forgo this since initialisation is infrequent.
+    // NOTE(review): gpu_types.cuh defines a function with this same name and
+    // parameters but a different return type, which C++ cannot overload - confirm
+    // the two headers are never included by the same translation unit.
+
+    // regrettably duplicated from paulis.cpp which is inaccessible here
+    constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;
+
+    // complex constants, each given as {real, imag}
+    cpu_qcomp p0, p1,n1, pI,nI;
+    p0 = {0,  0}; //  0
+    p1 = {+1, 0}; //  1
+    n1 = {-1, 0}; // -1
+    pI = {0, +1}; //  i
+    nI = {0, -1}; // -i
+
+    // 'matrices' below is not declared constexpr or static const, even though
+    // it is fixed/known at compile-time, because doing so would make it incompatible
+    // with CUDA kernels/thrust. It is instead left as runtime initialisation
+    // but this poses no real slowdown; this function, and its caller, are inlined
+    // so these 16 amps are re-processed once for each full enumeration of the
+    // PauliStrSum which is expected to have significantly more terms/coeffs
+    cpu_qcomp matrices[][2][2] = {
+        {{p1,p0},{p0,p1}},  // I
+        {{p0,p1},{p1,p0}},  // X
+        {{p0,nI},{pI,p0}},  // Y
+        {{p1,p0},{p0,n1}}}; // Z
+
+    cpu_qcomp elem = p1; // 1
+
+    // multiply in the sites encoded by str.lowPaulis (could be compile-time unrolled)
+    for (int t=0; t<numPaulisPerMask; t++) {
+        int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
+        int i = getBit(row, t);
+        int j = getBit(col, t);
+        elem *= matrices[p][i][j];
+    }
+
+    // multiply in the sites encoded by str.highPaulis (could be compile-time unrolled)
+    for (int t=0; t<numPaulisPerMask; t++) {
+        int p = getTwoAdjacentBits(str.highPaulis, 2*t);
+        int i = getBit(row, t + numPaulisPerMask);
+        int j = getBit(col, t + numPaulisPerMask);
+        elem *= matrices[p][i][j];
+    }
+
+    return elem;
+}
+
+
+
+INLINE cpu_qcomp fast_getPauliStrSumElem(cpu_qcomp* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {
+
+    // returns the sum over n of coeffs[n] * fast_getPauliStrElem(strings[n], row, col).
+    // It accepts unpacked PauliStrSum fields since a PauliStrSum's 'qcomp' field
+    // precludes its direct use in CUDA kernels/thrust. highPaulis need not be zero.
+
+    cpu_qcomp elem = {0, 0}; // zero, given as {real, imag}
+
+    // this loop is expected to be exponentially smaller than the caller's loop
+    for (qindex n=0; n<numTerms; n++)
+        elem += coeffs[n] * fast_getPauliStrElem(strings[n], row, col);
+
+    return elem;
+}
+
+
+
+
 #endif // CPU_TYPES_HPP
\ No newline at end of file
diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh
index 8b8d3dad..2acb1648 100644
--- a/quest/src/gpu/gpu_types.cuh
+++ b/quest/src/gpu/gpu_types.cuh
@@ -277,4 +277,88 @@ __host__ inline std::array<gpu_qcomp,16> unpackMatrixToGpuQcomps(CompMatr2 in) {
 }
 
 
+
+
+
+
+
+
+
+INLINE gpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {
+
+    // returns element (row, col) of the matrix of the given PauliStr, computed
+    // as the product of one 2x2 Pauli matrix element per site. Called by both
+    // fullstatediagmatr_setElemsToPauliStrSum(), whose PauliStr can have Paulis
+    // on any of the 64 sites, and densmatr_setAmpsToPauliStrSum_sub(), whose
+    // PauliStr is always constrained to the lower 32 sites (a 32-qubit density
+    // matrix is already too large for the world's computers). The latter could
+    // skip the second loop below (as str.highPaulis == 0), at most halving the
+    // runtime; we defensively forgo this since initialisation is infrequent.
+    // NOTE(review): cpu_types.hpp defines a function with this same name and
+    // parameters but a different return type, which C++ cannot overload - confirm
+    // the two headers are never included by the same translation unit.
+
+    // regrettably duplicated from paulis.cpp which is inaccessible here
+    constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;
+
+    // complex constants, each given as {real, imag}
+    gpu_qcomp p0, p1,n1, pI,nI;
+    p0 = {0,  0}; //  0
+    p1 = {+1, 0}; //  1
+    n1 = {-1, 0}; // -1
+    pI = {0, +1}; //  i
+    nI = {0, -1}; // -i
+
+    // 'matrices' below is not declared constexpr or static const, even though
+    // it is fixed/known at compile-time, because doing so would make it incompatible
+    // with CUDA kernels/thrust. It is instead left as runtime initialisation
+    // but this poses no real slowdown; this function, and its caller, are inlined
+    // so these 16 amps are re-processed once for each full enumeration of the
+    // PauliStrSum which is expected to have significantly more terms/coeffs
+    gpu_qcomp matrices[][2][2] = {
+        {{p1,p0},{p0,p1}},  // I
+        {{p0,p1},{p1,p0}},  // X
+        {{p0,nI},{pI,p0}},  // Y
+        {{p1,p0},{p0,n1}}}; // Z
+
+    gpu_qcomp elem = p1; // 1
+
+    // multiply in the sites encoded by str.lowPaulis (could be compile-time unrolled)
+    for (int t=0; t<numPaulisPerMask; t++) {
+        int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
+        int i = getBit(row, t);
+        int j = getBit(col, t);
+        elem *= matrices[p][i][j];
+    }
+
+    // multiply in the sites encoded by str.highPaulis (could be compile-time unrolled)
+    for (int t=0; t<numPaulisPerMask; t++) {
+        int p = getTwoAdjacentBits(str.highPaulis, 2*t);
+        int i = getBit(row, t + numPaulisPerMask);
+        int j = getBit(col, t + numPaulisPerMask);
+        elem *= matrices[p][i][j];
+    }
+
+    return elem;
+}
+
+
+
+INLINE gpu_qcomp fast_getPauliStrSumElem(gpu_qcomp* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {
+
+    // returns the sum over n of coeffs[n] * fast_getPauliStrElem(strings[n], row, col).
+    // It accepts unpacked PauliStrSum fields since a PauliStrSum's 'qcomp' field
+    // precludes its direct use in CUDA kernels/thrust. highPaulis need not be zero.
+
+    gpu_qcomp elem = {0, 0}; // zero, given as {real, imag}
+
+    // this loop is expected to be exponentially smaller than the caller's loop
+    for (qindex n=0; n<numTerms; n++)
+        elem += coeffs[n] * fast_getPauliStrElem(strings[n], row, col);
+
+    return elem;
+}
+
+
+
 #endif // GPU_TYPES_HPP
\ No newline at end of file