From 31d11da72e0aa456332e0d5011fa8f88c7cabe99 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 21:42:50 -0400 Subject: [PATCH 01/15] attempted debug of MSVC stack overflow --- quest/src/cpu/cpu_subroutines.cpp | 4 ++-- quest/src/cpu/cpu_types.hpp | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index ad6bc3c4..1be3b913 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -412,7 +412,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah! + auto elems = getCpuQcomps2(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -495,7 +495,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah! + auto elems = getCpuQcomps4(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index a426dfce..b7f52d40 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -127,16 +127,22 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) { // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!)) -template -std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { +std::array,2> getCpuQcomps2(qcomp matr[2][2]) { - // detect brain-dead compiler inferencing (looking at you MSVC...) - static_assert(dim == 2 || dim == 4, "getCpuQcomps called with unexpected dim"); + std::array,2> out; - std::array,dim> out; + for (int i=0; i<2; i++) + for (int j=0; j<2; j++) + out[i][j] = getCpuQcomp(matr[i][j]); + + return out; +} +std::array,4> getCpuQcomps4(qcomp matr[4][4]) { + + std::array,4> out; - for (int i=0; i Date: Mon, 20 Apr 2026 21:45:19 -0400 Subject: [PATCH 02/15] Retriggering CI --- .github/workflows/compile.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index d1901553..676085fe 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,6 +28,7 @@ on: branches: - main - devel + - msvc-so-debug pull_request: branches: - main From bc8464185af0953bf393b7582631f499caa7907f Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 22:26:53 -0400 Subject: [PATCH 03/15] Rename --- .github/workflows/compile.yml | 1 - quest/src/cpu/cpu_subroutines.cpp | 4 ++-- quest/src/cpu/cpu_types.hpp | 9 +++++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 676085fe..d1901553 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,7 +28,6 @@ on: branches: - main - devel - - msvc-so-debug pull_request: branches: - main diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 1be3b913..619d6072 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -412,7 +412,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps2(matr.elems); // MSVC requires explicit template param, bah! + auto elems = getCpuQcompsMatr1(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -495,7 +495,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps4(matr.elems); // MSVC requires explicit template param, bah! + auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index b7f52d40..9539926c 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -127,7 +127,12 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) { // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!)) -std::array,2> getCpuQcomps2(qcomp matr[2][2]) { +std::array,2> getCpuQcompsMatr1(qcomp matr[2][2]) { + + // dumb and explicit here because MSVC + OpenMP breaks + // when templating this - not worth fixing here because + // we are considering a refactor which merges cpu_types.hpp + // with gpu_types.cuh anyway std::array,2> out; @@ -137,7 +142,7 @@ std::array,2> getCpuQcomps2(qcomp matr[2][2]) { return out; } -std::array,4> getCpuQcomps4(qcomp matr[4][4]) { +std::array,4> getCpuQcompsMatr2(qcomp matr[4][4]) { std::array,4> out; From b21ea150e87ccd7bf3e0ec7e9da137a7e0a46596 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 22:55:47 -0400 Subject: [PATCH 04/15] more debugging grr --- .github/workflows/compile.yml | 1 + quest/src/cpu/cpu_subroutines.cpp | 50 ++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index d1901553..676085fe 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,6 +28,7 @@ on: branches: - main - devel + - msvc-so-debug pull_request: branches: - main diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 619d6072..e9689ac7 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -412,7 +412,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcompsMatr1(matr.elems); // MSVC requires explicit template param, bah! + // auto elems = getCpuQcompsMatr1(matr.elems); + cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); // MSVC cannot handle 2D cpu_qcomps + cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]); + cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]); + cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]); auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -432,8 +436,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v cpu_qcomp amp0 = amps[i0]; cpu_qcomp amp1 = amps[i1]; - amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; - amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; + // amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; + // amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; + amps[i0] = m00*amp0 + m01*amp1; + amps[i1] = m10*amp0 + m11*amp1; } } @@ -495,7 +501,30 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah! + + // auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah! + + cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); + cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]); + cpu_qcomp m02 = getCpuQcomp(matr.elems[0][2]); + cpu_qcomp m03 = getCpuQcomp(matr.elems[0][3]); + + cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]); + cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]); + cpu_qcomp m12 = getCpuQcomp(matr.elems[1][2]); + cpu_qcomp m13 = getCpuQcomp(matr.elems[1][3]); + + cpu_qcomp m20 = getCpuQcomp(matr.elems[2][0]); + cpu_qcomp m21 = getCpuQcomp(matr.elems[2][1]); + cpu_qcomp m22 = getCpuQcomp(matr.elems[2][2]); + cpu_qcomp m23 = getCpuQcomp(matr.elems[2][3]); + + cpu_qcomp m30 = getCpuQcomp(matr.elems[3][0]); + cpu_qcomp m31 = getCpuQcomp(matr.elems[3][1]); + cpu_qcomp m32 = getCpuQcomp(matr.elems[3][2]); + cpu_qcomp m33 = getCpuQcomp(matr.elems[3][3]); + + auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); @@ -520,10 +549,15 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve cpu_qcomp amp11 = amps[i11]; // amps[i_n] = sum_j matr.elems[n][j] amp[i_n] - amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; - amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; - amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; - amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; + // amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; + // amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; + // amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; + // amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; + + amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11; + amps[i01] = m10*amp00 + m11*amp01 + m12*amp10 + m13*amp11; + amps[i10] = m20*amp00 + m21*amp01 + m22*amp10 + m23*amp11; + amps[i11] = m30*amp00 + m31*amp01 + m32*amp10 + m33*amp11; } } From 92957e2c1d61c962b55b175c97c87c77473784b4 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:15:14 -0400 Subject: [PATCH 05/15] enable SEH --- tests/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 31b0ea75..1a9625ce 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,9 @@ # @author Oliver Thomson Brown +# DEBUG: +# trying to get MSVC to compile +add_compile_definitions(CATCH_CONFIG_WINDOWS_SEH) + add_executable(tests main.cpp ) From 13ba54ced5fd036e586097d304aa1a05a2e1b490 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:30:23 -0400 Subject: [PATCH 06/15] Enable exception unwinding --- CMakeLists.txt | 9 +++++++++ tests/CMakeLists.txt | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc209ca5..edff2fb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -315,6 +315,11 @@ target_compile_options(QuEST $<$:${WARNING_FLAG}> ) +target_compile_options(QuEST + PRIVATE + $<$:/EHsc> +) + # ============================ @@ -593,6 +598,10 @@ if (ENABLE_TESTING) ) FetchContent_MakeAvailable(Catch2) + + target_compile_options(Catch2 PRIVATE + $<$:/EHsc> + ) # otherwise fail else() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1a9625ce..b465924a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,12 @@ add_executable(tests target_link_libraries(tests PRIVATE QuEST::QuEST Catch2::Catch2) target_compile_features(tests PUBLIC cxx_std_20) + +target_compile_options(tests PRIVATE + $<$:/EHsc> +) + + add_subdirectory(unit) add_subdirectory(utils) add_subdirectory(integration) From 8f7d540d30b528c5fecae27153bf8d120f8da57b Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:36:29 -0400 Subject: [PATCH 07/15] Try just EHs --- CMakeLists.txt | 4 ++-- tests/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index edff2fb1..f254a169 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -317,7 +317,7 @@ target_compile_options(QuEST target_compile_options(QuEST PRIVATE - $<$:/EHsc> + $<$:/EHs> ) @@ -600,7 +600,7 @@ if (ENABLE_TESTING) FetchContent_MakeAvailable(Catch2) target_compile_options(Catch2 PRIVATE - $<$:/EHsc> + $<$:/EHs> ) # otherwise fail diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b465924a..1ab5766f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,7 +12,7 @@ target_compile_features(tests PUBLIC cxx_std_20) target_compile_options(tests PRIVATE - $<$:/EHsc> + $<$:/EHs> ) From 79200eb9f51e8d669c3dc90c278d3f843d43b90b Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:37:55 -0400 Subject: [PATCH 08/15] stringification MSVC bug workaround --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1ab5766f..464bcec0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,6 +3,7 @@ # DEBUG: # trying to get MSVC to compile add_compile_definitions(CATCH_CONFIG_WINDOWS_SEH) +add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION) add_executable(tests main.cpp From b6380cf59297a94f305ac1774155b2737009cb23 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:38:35 -0400 Subject: [PATCH 09/15] Enable bigobj --- tests/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 464bcec0..ad26883c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -15,6 +15,9 @@ target_compile_features(tests PUBLIC cxx_std_20) target_compile_options(tests PRIVATE $<$:/EHs> ) +target_compile_options(tests PRIVATE + $<$:/bigobj> +) add_subdirectory(unit) From d0b620fab2c83ab49749ce578c70c4d3e5f7f510 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:39:42 -0400 Subject: [PATCH 10/15] bigobj in Catch2 too --- CMakeLists.txt | 3 +++ tests/CMakeLists.txt | 3 +++ 2 files changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f254a169..89900be3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -602,6 +602,9 @@ if (ENABLE_TESTING) target_compile_options(Catch2 PRIVATE $<$:/EHs> ) + target_compile_options(Catch2 PRIVATE + $<$:/bigobj> + ) # otherwise fail else() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ad26883c..f3fb5483 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -18,6 +18,9 @@ target_compile_options(tests PRIVATE target_compile_options(tests PRIVATE $<$:/bigobj> ) +target_compile_options(Catch2 PRIVATE + $<$:/bigobj> +) add_subdirectory(unit) From b2b5b022f4781ea9a0e2c1ff0edd307a27222915 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 20 Apr 2026 23:52:37 -0400 Subject: [PATCH 11/15] Retain only bigobj --- CMakeLists.txt | 12 ------------ tests/CMakeLists.txt | 13 ++----------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 89900be3..dc209ca5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -315,11 +315,6 @@ target_compile_options(QuEST $<$:${WARNING_FLAG}> ) -target_compile_options(QuEST - PRIVATE - $<$:/EHs> -) - # ============================ @@ -598,13 +593,6 @@ if (ENABLE_TESTING) ) FetchContent_MakeAvailable(Catch2) - - target_compile_options(Catch2 PRIVATE - $<$:/EHs> - ) - target_compile_options(Catch2 PRIVATE - $<$:/bigobj> - ) # otherwise fail else() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f3fb5483..6289e616 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,10 +1,5 @@ # @author Oliver Thomson Brown -# DEBUG: -# trying to get MSVC to compile -add_compile_definitions(CATCH_CONFIG_WINDOWS_SEH) -add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION) - add_executable(tests main.cpp ) @@ -12,12 +7,8 @@ target_link_libraries(tests PRIVATE QuEST::QuEST Catch2::Catch2) target_compile_features(tests PUBLIC cxx_std_20) -target_compile_options(tests PRIVATE - $<$:/EHs> -) -target_compile_options(tests PRIVATE - $<$:/bigobj> -) +# DEBUG: +# trying to get MSVC to compile target_compile_options(Catch2 PRIVATE $<$:/bigobj> ) From 215746a5b427b121ba9a8b3c980858bdc1ff7241 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Tue, 21 Apr 2026 00:03:55 -0400 Subject: [PATCH 12/15] just EHsc and bigobj --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6289e616..78534aef 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,7 @@ target_compile_features(tests PUBLIC cxx_std_20) # DEBUG: # trying to get MSVC to compile target_compile_options(Catch2 PRIVATE + $<$:/EHsc> $<$:/bigobj> ) From f4356afdaaa39dda61528672b1f5ef77e9c7e07f Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Tue, 21 Apr 2026 00:15:20 -0400 Subject: [PATCH 13/15] increase mem for template instantiation --- .github/workflows/compile.yml | 2 +- tests/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 676085fe..595b585c 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -62,7 +62,7 @@ jobs: # compile QuEST with all combinations of below flags matrix: os: [windows-latest, ubuntu-latest, macos-latest] - precision: [1, 2, 4] + precision: [4, 2, 1] ################################### DEBUG! long double first for MSVC debug omp: [ON, OFF] mpi: [ON, OFF] cuda: [ON, OFF] diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 78534aef..0996bf75 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,6 +12,7 @@ target_compile_features(tests PUBLIC cxx_std_20) target_compile_options(Catch2 PRIVATE $<$:/EHsc> $<$:/bigobj> + $<$:/Zm500> ) From ef23df1a5c103c4983df440b04d46602878b20e3 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Tue, 21 Apr 2026 00:21:27 -0400 Subject: [PATCH 14/15] try fast compile --- tests/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0996bf75..aba33339 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -14,6 +14,8 @@ target_compile_options(Catch2 PRIVATE $<$:/bigobj> $<$:/Zm500> ) +add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION) +add_compile_definitions(CATCH_CONFIG_FAST_COMPILE) add_subdirectory(unit) From aeb1c5bc6e896668aa377b985068d16e31006bb2 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Wed, 22 Apr 2026 02:01:39 -0400 Subject: [PATCH 15/15] avoiding template?! --- quest/src/core/fastmath.hpp | 154 ++++++++++++++++++------------------ quest/src/cpu/cpu_types.hpp | 81 +++++++++++++++++++ quest/src/gpu/gpu_types.cuh | 84 ++++++++++++++++++++ 3 files changed, 242 insertions(+), 77 deletions(-) diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 0dbcee9e..b27429b4 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -100,83 +100,83 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ */ -// T = qcomp, cpu_qcomp, gpu_qcomp -template -INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { - - // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() - // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have - // Paulis on any of the 64 sites, but the latter's PauliStr is always - // constrainted to the lower 32 sites (because a 32-qubit density matrix - // is already too large for the world's computers). As such, the latter - // scenario can be optimised since str.highPaulis == 0, making the second - // loop below redundant. Avoiding this loop can at most half the runtime, - // though opens the risk that the former caller erroneously has its upper - // Paulis ignore. We forego this optimisation in defensive design, and - // because this function is only invoked during data structure initilisation - // and ergo infrequently. - - // regrettably duplicated from paulis.cpp which is inaccessible here - constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; - - // T-agnostic complex literals - T p0, p1,n1, pI,nI; - p0 = {0, 0}; // 0 - p1 = {+1, 0}; // 1 - n1 = {-1, 0}; // -1 - pI = {0, +1}; // i - nI = {0, -1}; // -i - - // 'matrices' below is not declared constexpr or static const, even though - // it is fixed/known at compile-time, because this makes it incompatible - // with CUDA kernels/thrust. It is instead left as runtime innitialisation - // but this poses no real slowdown; this function, and its caller, are inlined - // so these 16 amps are re-processed one for each full enumeration of the - // PauliStrSum which is expected to have significantly more terms/coeffs - T matrices[][2][2] = { - {{p1,p0},{p0,p1}}, // I - {{p0,p1},{p1,p0}}, // X - {{p0,nI},{pI,p0}}, // Y - {{p1,p0},{p0,n1}}}; // Z - - T elem = p1; // 1 - - // could be compile-time unrolled into 32 iterations - for (int t=0; t -INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { - - // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot - // be directly processed in CUDA kernels/thrust due to its 'qcomp' field. - // it also assumes str.highPaulis==0 for all str in strings, as per above func. - - T elem = {0, 0}; // type-agnostic complex literal - - // this loop is expected exponentially smaller than caller's loop - for (qindex n=0; n(strings[n], row, col); - - return elem; -} +// // T = qcomp, cpu_qcomp, gpu_qcomp +// template +// INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { + +// // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() +// // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have +// // Paulis on any of the 64 sites, but the latter's PauliStr is always +// // constrainted to the lower 32 sites (because a 32-qubit density matrix +// // is already too large for the world's computers). As such, the latter +// // scenario can be optimised since str.highPaulis == 0, making the second +// // loop below redundant. Avoiding this loop can at most half the runtime, +// // though opens the risk that the former caller erroneously has its upper +// // Paulis ignore. We forego this optimisation in defensive design, and +// // because this function is only invoked during data structure initilisation +// // and ergo infrequently. + +// // regrettably duplicated from paulis.cpp which is inaccessible here +// constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; + +// // T-agnostic complex literals +// T p0, p1,n1, pI,nI; +// p0 = {0, 0}; // 0 +// p1 = {+1, 0}; // 1 +// n1 = {-1, 0}; // -1 +// pI = {0, +1}; // i +// nI = {0, -1}; // -i + +// // 'matrices' below is not declared constexpr or static const, even though +// // it is fixed/known at compile-time, because this makes it incompatible +// // with CUDA kernels/thrust. It is instead left as runtime innitialisation +// // but this poses no real slowdown; this function, and its caller, are inlined +// // so these 16 amps are re-processed one for each full enumeration of the +// // PauliStrSum which is expected to have significantly more terms/coeffs +// T matrices[][2][2] = { +// {{p1,p0},{p0,p1}}, // I +// {{p0,p1},{p1,p0}}, // X +// {{p0,nI},{pI,p0}}, // Y +// {{p1,p0},{p0,n1}}}; // Z + +// T elem = p1; // 1 + +// // could be compile-time unrolled into 32 iterations +// for (int t=0; t +// INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { + +// // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot +// // be directly processed in CUDA kernels/thrust due to its 'qcomp' field. +// // it also assumes str.highPaulis==0 for all str in strings, as per above func. + +// T elem = {0, 0}; // type-agnostic complex literal + +// // this loop is expected exponentially smaller than caller's loop +// for (qindex n=0; n(strings[n], row, col); + +// return elem; +// } #endif // FASTMATH_HPP \ No newline at end of file diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index 9539926c..651ad0e4 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -198,4 +198,85 @@ static_assert(std::is_trivially_copyable_v); // casting is safe for all circumstances (e.g. heap mem, static lists) + + + +INLINE cpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { + + // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() + // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have + // Paulis on any of the 64 sites, but the latter's PauliStr is always + // constrainted to the lower 32 sites (because a 32-qubit density matrix + // is already too large for the world's computers). As such, the latter + // scenario can be optimised since str.highPaulis == 0, making the second + // loop below redundant. Avoiding this loop can at most half the runtime, + // though opens the risk that the former caller erroneously has its upper + // Paulis ignore. We forego this optimisation in defensive design, and + // because this function is only invoked during data structure initilisation + // and ergo infrequently. + + // regrettably duplicated from paulis.cpp which is inaccessible here + constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; + + // T-agnostic complex literals + cpu_qcomp p0, p1,n1, pI,nI; + p0 = {0, 0}; // 0 + p1 = {+1, 0}; // 1 + n1 = {-1, 0}; // -1 + pI = {0, +1}; // i + nI = {0, -1}; // -i + + // 'matrices' below is not declared constexpr or static const, even though + // it is fixed/known at compile-time, because this makes it incompatible + // with CUDA kernels/thrust. It is instead left as runtime innitialisation + // but this poses no real slowdown; this function, and its caller, are inlined + // so these 16 amps are re-processed one for each full enumeration of the + // PauliStrSum which is expected to have significantly more terms/coeffs + cpu_qcomp matrices[][2][2] = { + {{p1,p0},{p0,p1}}, // I + {{p0,p1},{p1,p0}}, // X + {{p0,nI},{pI,p0}}, // Y + {{p1,p0},{p0,n1}}}; // Z + + cpu_qcomp elem = p1; // 1 + + // could be compile-time unrolled into 32 iterations + for (int t=0; t unpackMatrixToGpuQcomps(CompMatr2 in) { } + + + + + + + +INLINE gpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { + + // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() + // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have + // Paulis on any of the 64 sites, but the latter's PauliStr is always + // constrainted to the lower 32 sites (because a 32-qubit density matrix + // is already too large for the world's computers). As such, the latter + // scenario can be optimised since str.highPaulis == 0, making the second + // loop below redundant. Avoiding this loop can at most half the runtime, + // though opens the risk that the former caller erroneously has its upper + // Paulis ignore. We forego this optimisation in defensive design, and + // because this function is only invoked during data structure initilisation + // and ergo infrequently. + + // regrettably duplicated from paulis.cpp which is inaccessible here + constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; + + // T-agnostic complex literals + gpu_qcomp p0, p1,n1, pI,nI; + p0 = {0, 0}; // 0 + p1 = {+1, 0}; // 1 + n1 = {-1, 0}; // -1 + pI = {0, +1}; // i + nI = {0, -1}; // -i + + // 'matrices' below is not declared constexpr or static const, even though + // it is fixed/known at compile-time, because this makes it incompatible + // with CUDA kernels/thrust. It is instead left as runtime innitialisation + // but this poses no real slowdown; this function, and its caller, are inlined + // so these 16 amps are re-processed one for each full enumeration of the + // PauliStrSum which is expected to have significantly more terms/coeffs + gpu_qcomp matrices[][2][2] = { + {{p1,p0},{p0,p1}}, // I + {{p0,p1},{p1,p0}}, // X + {{p0,nI},{pI,p0}}, // Y + {{p1,p0},{p0,n1}}}; // Z + + gpu_qcomp elem = p1; // 1 + + // could be compile-time unrolled into 32 iterations + for (int t=0; t