Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/compile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ on:
branches:
- main
- devel
- msvc-so-debug
pull_request:
branches:
- main
Expand Down Expand Up @@ -61,7 +62,7 @@ jobs:
# compile QuEST with all combinations of below flags
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
precision: [1, 2, 4]
precision: [4, 2, 1] ################################### DEBUG! long double first for MSVC debug
omp: [ON, OFF]
mpi: [ON, OFF]
cuda: [ON, OFF]
Expand Down
154 changes: 77 additions & 77 deletions quest/src/core/fastmath.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,83 +100,83 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ
*/


// Returns the (row, col) element of the matrix represented by the given PauliStr,
// computed as the product of the corresponding single-qubit Pauli matrix elements
// over all sites (low and high masks).
// T = qcomp, cpu_qcomp, gpu_qcomp
template <typename T>
INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {

// this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
// and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
// Paulis on any of the 64 sites, but the latter's PauliStr is always
// constrained to the lower 32 sites (because a 32-qubit density matrix
// is already too large for the world's computers). As such, the latter
// scenario can be optimised since str.highPaulis == 0, making the second
// loop below redundant. Avoiding this loop can at most halve the runtime,
// though opens the risk that the former caller erroneously has its upper
// Paulis ignored. We forego this optimisation in defensive design, and
// because this function is only invoked during data structure initialisation
// and ergo infrequently.

// regrettably duplicated from paulis.cpp which is inaccessible here;
// each Pauli occupies two adjacent bits of a PAULI_MASK_TYPE mask
constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;

// T-agnostic complex literals
T p0, p1,n1, pI,nI;
p0 = {0, 0}; // 0
p1 = {+1, 0}; // 1
n1 = {-1, 0}; // -1
pI = {0, +1}; // i
nI = {0, -1}; // -i

// 'matrices' below is not declared constexpr or static const, even though
// it is fixed/known at compile-time, because this makes it incompatible
// with CUDA kernels/thrust. It is instead left as runtime initialisation
// but this poses no real slowdown; this function, and its caller, are inlined
// so these 16 amps are re-processed once for each full enumeration of the
// PauliStrSum which is expected to have significantly more terms/coeffs
T matrices[][2][2] = {
{{p1,p0},{p0,p1}}, // I
{{p0,p1},{p1,p0}}, // X
{{p0,nI},{pI,p0}}, // Y
{{p1,p0},{p0,n1}}}; // Z

T elem = p1; // 1

// accumulate the per-site elements of the lower-mask Paulis;
// could be compile-time unrolled into 32 iterations
for (int t=0; t<numPaulisPerMask; t++) {
int p = getTwoAdjacentBits(str.lowPaulis, 2*t); // Pauli code: 0=I, 1=X, 2=Y, 3=Z
int i = getBit(row, t);
int j = getBit(col, t);
elem *= matrices[p][i][j];
}

// accumulate the per-site elements of the upper-mask Paulis;
// could be compile-time unrolled into 32 iterations
for (int t=0; t<numPaulisPerMask; t++) {
int p = getTwoAdjacentBits(str.highPaulis, 2*t);
int i = getBit(row, t + numPaulisPerMask);
int j = getBit(col, t + numPaulisPerMask);
elem *= matrices[p][i][j];
}

return elem;
}


// Returns the (row, col) element of the weighted sum of Pauli-string matrices,
// i.e. sum_n coeffs[n] * elem_n(row, col).
// T = qcomp, cpu_qcomp, gpu_qcomp
template <typename T>
INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {

    // the PauliStrSum arrives as unpacked fields because its 'qcomp' member
    // prevents the struct being consumed directly by CUDA kernels/thrust.
    // every string is assumed to satisfy highPaulis==0, as per the above func.

    T total = {0, 0}; // type-agnostic complex zero

    // numTerms is expected exponentially smaller than the caller's enumeration
    for (qindex t=0; t<numTerms; t++)
        total += coeffs[t] * fast_getPauliStrElem<T>(strings[t], row, col);

    return total;
}
// // T = qcomp, cpu_qcomp, gpu_qcomp
// template <typename T>
// INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {

// // this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
// // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
// // Paulis on any of the 64 sites, but the latter's PauliStr is always
// // constrainted to the lower 32 sites (because a 32-qubit density matrix
// // is already too large for the world's computers). As such, the latter
// // scenario can be optimised since str.highPaulis == 0, making the second
// // loop below redundant. Avoiding this loop can at most half the runtime,
// // though opens the risk that the former caller erroneously has its upper
// // Paulis ignore. We forego this optimisation in defensive design, and
// // because this function is only invoked during data structure initilisation
// // and ergo infrequently.

// // regrettably duplicated from paulis.cpp which is inaccessible here
// constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;

// // T-agnostic complex literals
// T p0, p1,n1, pI,nI;
// p0 = {0, 0}; // 0
// p1 = {+1, 0}; // 1
// n1 = {-1, 0}; // -1
// pI = {0, +1}; // i
// nI = {0, -1}; // -i

// // 'matrices' below is not declared constexpr or static const, even though
// // it is fixed/known at compile-time, because this makes it incompatible
// // with CUDA kernels/thrust. It is instead left as runtime innitialisation
// // but this poses no real slowdown; this function, and its caller, are inlined
// // so these 16 amps are re-processed one for each full enumeration of the
// // PauliStrSum which is expected to have significantly more terms/coeffs
// T matrices[][2][2] = {
// {{p1,p0},{p0,p1}}, // I
// {{p0,p1},{p1,p0}}, // X
// {{p0,nI},{pI,p0}}, // Y
// {{p1,p0},{p0,n1}}}; // Z

// T elem = p1; // 1

// // could be compile-time unrolled into 32 iterations
// for (int t=0; t<numPaulisPerMask; t++) {
// int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
// int i = getBit(row, t);
// int j = getBit(col, t);
// elem *= matrices[p][i][j];
// }

// // could be compile-time unrolled into 32 iterations
// for (int t=0; t<numPaulisPerMask; t++) {
// int p = getTwoAdjacentBits(str.highPaulis, 2*t);
// int i = getBit(row, t + numPaulisPerMask);
// int j = getBit(col, t + numPaulisPerMask);
// elem *= matrices[p][i][j];
// }

// return elem;
// }


// // T = qcomp, cpu_qcomp, gpu_qcomp
// template <typename T>
// INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {

// // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot
// // be directly processed in CUDA kernels/thrust due to its 'qcomp' field.
// // it also assumes str.highPaulis==0 for all str in strings, as per above func.

// T elem = {0, 0}; // type-agnostic complex literal

// // this loop is expected exponentially smaller than caller's loop
// for (qindex n=0; n<numTerms; n++)
// elem += coeffs[n] * fast_getPauliStrElem<T>(strings[n], row, col);

// return elem;
// }


#endif // FASTMATH_HPP
50 changes: 42 additions & 8 deletions quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v

// use cpu_qcomp arithmetic overloads (avoid qcomp's)
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah!
// auto elems = getCpuQcompsMatr1(matr.elems);
cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); // MSVC cannot handle 2D cpu_qcomps
cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]);
cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]);
cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]);

auto sortedQubits = util_getSorted(ctrls, {targ});
auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
Expand All @@ -432,8 +436,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
cpu_qcomp amp0 = amps[i0];
cpu_qcomp amp1 = amps[i1];

amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1;
amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1;
// amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1;
// amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1;
amps[i0] = m00*amp0 + m01*amp1;
amps[i1] = m10*amp0 + m11*amp1;
}
}

Expand Down Expand Up @@ -495,7 +501,30 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve

// use cpu_qcomp arithmetic overloads (avoid qcomp's)
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah!

// auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah!

cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]);
cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]);
cpu_qcomp m02 = getCpuQcomp(matr.elems[0][2]);
cpu_qcomp m03 = getCpuQcomp(matr.elems[0][3]);

cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]);
cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]);
cpu_qcomp m12 = getCpuQcomp(matr.elems[1][2]);
cpu_qcomp m13 = getCpuQcomp(matr.elems[1][3]);

cpu_qcomp m20 = getCpuQcomp(matr.elems[2][0]);
cpu_qcomp m21 = getCpuQcomp(matr.elems[2][1]);
cpu_qcomp m22 = getCpuQcomp(matr.elems[2][2]);
cpu_qcomp m23 = getCpuQcomp(matr.elems[2][3]);

cpu_qcomp m30 = getCpuQcomp(matr.elems[3][0]);
cpu_qcomp m31 = getCpuQcomp(matr.elems[3][1]);
cpu_qcomp m32 = getCpuQcomp(matr.elems[3][2]);
cpu_qcomp m33 = getCpuQcomp(matr.elems[3][3]);



auto sortedQubits = util_getSorted(ctrls, {targ1, targ2});
auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0});
Expand All @@ -520,10 +549,15 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
cpu_qcomp amp11 = amps[i11];

// amps[i_n] = sum_j matr.elems[n][j] amp[i_n]
amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11;
amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11;
amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11;
amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11;
// amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11;
// amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11;
// amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11;
// amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11;

amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11;
amps[i01] = m10*amp00 + m11*amp01 + m12*amp10 + m13*amp11;
amps[i10] = m20*amp00 + m21*amp01 + m22*amp10 + m23*amp11;
amps[i11] = m30*amp00 + m31*amp01 + m32*amp10 + m33*amp11;
}
}

Expand Down
106 changes: 99 additions & 7 deletions quest/src/cpu/cpu_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,16 +127,27 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) {


// creator for fixed-size 2x2 dense matrices (CompMatr1 elements).
// marked 'inline' because this non-template function is defined in a header:
// without it, every translation unit including this header emits its own
// external definition, violating the one-definition rule at link time.
// deliberately non-templated because MSVC + OpenMP breaks when templating
// this - not worth fixing here because we are considering a refactor which
// merges cpu_types.hpp with gpu_types.cuh anyway
inline std::array<std::array<cpu_qcomp,2>,2> getCpuQcompsMatr1(qcomp matr[2][2]) {

    std::array<std::array<cpu_qcomp,2>,2> out;

    // convert each qcomp element to the CPU-backend complex type
    for (int i=0; i<2; i++)
        for (int j=0; j<2; j++)
            out[i][j] = getCpuQcomp(matr[i][j]);

    return out;
}
std::array<std::array<cpu_qcomp,4>,4> getCpuQcompsMatr2(qcomp matr[4][4]) {

std::array<std::array<cpu_qcomp,4>,4> out;

for (int i=0; i<4; i++)
for (int j=0; j<4; j++)
out[i][j] = getCpuQcomp(matr[i][j]);

return out;
Expand Down Expand Up @@ -187,4 +198,85 @@ static_assert(std::is_trivially_copyable_v<cpu_qcomp>);
// casting is safe for all circumstances (e.g. heap mem, static lists)





// Returns the (row, col) element of the matrix represented by the given PauliStr,
// computed as the product of the corresponding single-qubit Pauli matrix elements
// over all sites.
INLINE cpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {

    // this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
    // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
    // Paulis on any of the 64 sites, but the latter's is always constrained to
    // the lower 32 sites (a 32-qubit density matrix is already too large for
    // the world's computers), whereby str.highPaulis == 0 and the second loop
    // below becomes redundant. Skipping it could at most halve the runtime, but
    // would silently ignore upper Paulis for the former caller; we forego the
    // optimisation in defensive design, and because this function is invoked
    // only during infrequent data structure initialisation.

    // regrettably duplicated from paulis.cpp which is inaccessible here;
    // each Pauli occupies two adjacent bits of a PAULI_MASK_TYPE mask
    constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;

    // complex literals spelled as aggregates
    const cpu_qcomp zero = {0, 0};
    const cpu_qcomp one  = {+1, 0};
    const cpu_qcomp mOne = {-1, 0};
    const cpu_qcomp posI = {0, +1};
    const cpu_qcomp negI = {0, -1};

    // 'paulis' is intentionally not constexpr nor static const (despite being
    // known at compile-time) since that is incompatible with CUDA kernels and
    // thrust. Runtime initialisation poses no real slowdown; this function and
    // its caller are inlined, so these 16 amps are rebuilt once per full
    // enumeration of a PauliStrSum, which has many more terms/coeffs
    cpu_qcomp paulis[][2][2] = {
        {{one, zero}, {zero, one }},   // I
        {{zero, one}, {one,  zero}},   // X
        {{zero, negI},{posI, zero}},   // Y
        {{one, zero}, {zero, mOne}}};  // Z

    cpu_qcomp prod = one;

    // lower-mask sites; could be compile-time unrolled into 32 iterations
    for (int q=0; q<numPaulisPerMask; q++) {
        int code = getTwoAdjacentBits(str.lowPaulis, 2*q); // 0=I, 1=X, 2=Y, 3=Z
        prod *= paulis[code][getBit(row, q)][getBit(col, q)];
    }

    // upper-mask sites; could be compile-time unrolled into 32 iterations
    for (int q=0; q<numPaulisPerMask; q++) {
        int code = getTwoAdjacentBits(str.highPaulis, 2*q);
        prod *= paulis[code][getBit(row, q + numPaulisPerMask)][getBit(col, q + numPaulisPerMask)];
    }

    return prod;
}



// Returns the (row, col) element of the weighted sum of Pauli-string matrices,
// i.e. sum_n coeffs[n] * elem_n(row, col).
INLINE cpu_qcomp fast_getPauliStrSumElem(cpu_qcomp* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {

    // the PauliStrSum arrives as unpacked fields because its 'qcomp' member
    // prevents the struct being consumed directly by CUDA kernels/thrust.
    // every string is assumed to satisfy highPaulis==0, as per the above func.

    cpu_qcomp total = {0, 0};

    // numTerms is expected exponentially smaller than the caller's enumeration
    for (qindex t=0; t<numTerms; t++)
        total += coeffs[t] * fast_getPauliStrElem(strings[t], row, col);

    return total;
}




#endif // CPU_TYPES_HPP
Loading
Loading