diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index d1901553..595b585c 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,6 +28,7 @@ on: branches: - main - devel + - msvc-so-debug pull_request: branches: - main @@ -61,7 +62,7 @@ jobs: # compile QuEST with all combinations of below flags matrix: os: [windows-latest, ubuntu-latest, macos-latest] - precision: [1, 2, 4] + precision: [4, 2, 1] ################################### DEBUG! long double first for MSVC debug omp: [ON, OFF] mpi: [ON, OFF] cuda: [ON, OFF] diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 0dbcee9e..b27429b4 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -100,83 +100,83 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ */ -// T = qcomp, cpu_qcomp, gpu_qcomp -template -INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { - - // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() - // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have - // Paulis on any of the 64 sites, but the latter's PauliStr is always - // constrainted to the lower 32 sites (because a 32-qubit density matrix - // is already too large for the world's computers). As such, the latter - // scenario can be optimised since str.highPaulis == 0, making the second - // loop below redundant. Avoiding this loop can at most half the runtime, - // though opens the risk that the former caller erroneously has its upper - // Paulis ignore. We forego this optimisation in defensive design, and - // because this function is only invoked during data structure initilisation - // and ergo infrequently. - - // regrettably duplicated from paulis.cpp which is inaccessible here - constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; - - // T-agnostic complex literals - T p0, p1,n1, pI,nI; - p0 = {0, 0}; // 0 - p1 = {+1, 0}; // 1 - n1 = {-1, 0}; // -1 - pI = {0, +1}; // i - nI = {0, -1}; // -i - - // 'matrices' below is not declared constexpr or static const, even though - // it is fixed/known at compile-time, because this makes it incompatible - // with CUDA kernels/thrust. It is instead left as runtime innitialisation - // but this poses no real slowdown; this function, and its caller, are inlined - // so these 16 amps are re-processed one for each full enumeration of the - // PauliStrSum which is expected to have significantly more terms/coeffs - T matrices[][2][2] = { - {{p1,p0},{p0,p1}}, // I - {{p0,p1},{p1,p0}}, // X - {{p0,nI},{pI,p0}}, // Y - {{p1,p0},{p0,n1}}}; // Z - - T elem = p1; // 1 - - // could be compile-time unrolled into 32 iterations - for (int t=0; t -INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { - - // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot - // be directly processed in CUDA kernels/thrust due to its 'qcomp' field. - // it also assumes str.highPaulis==0 for all str in strings, as per above func. - - T elem = {0, 0}; // type-agnostic complex literal - - // this loop is expected exponentially smaller than caller's loop - for (qindex n=0; n(strings[n], row, col); - - return elem; -} +// // T = qcomp, cpu_qcomp, gpu_qcomp +// template +// INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { + +// // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() +// // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have +// // Paulis on any of the 64 sites, but the latter's PauliStr is always +// // constrainted to the lower 32 sites (because a 32-qubit density matrix +// // is already too large for the world's computers). As such, the latter +// // scenario can be optimised since str.highPaulis == 0, making the second +// // loop below redundant. Avoiding this loop can at most half the runtime, +// // though opens the risk that the former caller erroneously has its upper +// // Paulis ignore. We forego this optimisation in defensive design, and +// // because this function is only invoked during data structure initilisation +// // and ergo infrequently. + +// // regrettably duplicated from paulis.cpp which is inaccessible here +// constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; + +// // T-agnostic complex literals +// T p0, p1,n1, pI,nI; +// p0 = {0, 0}; // 0 +// p1 = {+1, 0}; // 1 +// n1 = {-1, 0}; // -1 +// pI = {0, +1}; // i +// nI = {0, -1}; // -i + +// // 'matrices' below is not declared constexpr or static const, even though +// // it is fixed/known at compile-time, because this makes it incompatible +// // with CUDA kernels/thrust. It is instead left as runtime innitialisation +// // but this poses no real slowdown; this function, and its caller, are inlined +// // so these 16 amps are re-processed one for each full enumeration of the +// // PauliStrSum which is expected to have significantly more terms/coeffs +// T matrices[][2][2] = { +// {{p1,p0},{p0,p1}}, // I +// {{p0,p1},{p1,p0}}, // X +// {{p0,nI},{pI,p0}}, // Y +// {{p1,p0},{p0,n1}}}; // Z + +// T elem = p1; // 1 + +// // could be compile-time unrolled into 32 iterations +// for (int t=0; t +// INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { + +// // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot +// // be directly processed in CUDA kernels/thrust due to its 'qcomp' field. +// // it also assumes str.highPaulis==0 for all str in strings, as per above func. + +// T elem = {0, 0}; // type-agnostic complex literal + +// // this loop is expected exponentially smaller than caller's loop +// for (qindex n=0; n(strings[n], row, col); + +// return elem; +// } #endif // FASTMATH_HPP \ No newline at end of file diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index ad6bc3c4..e9689ac7 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -412,7 +412,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah! + // auto elems = getCpuQcompsMatr1(matr.elems); + cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); // MSVC cannot handle 2D cpu_qcomps + cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]); + cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]); + cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]); auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -432,8 +436,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v cpu_qcomp amp0 = amps[i0]; cpu_qcomp amp1 = amps[i1]; - amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; - amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; + // amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; + // amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; + amps[i0] = m00*amp0 + m01*amp1; + amps[i1] = m10*amp0 + m11*amp1; } } @@ -495,7 +501,30 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah! + + // auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah! + + cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); + cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]); + cpu_qcomp m02 = getCpuQcomp(matr.elems[0][2]); + cpu_qcomp m03 = getCpuQcomp(matr.elems[0][3]); + + cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]); + cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]); + cpu_qcomp m12 = getCpuQcomp(matr.elems[1][2]); + cpu_qcomp m13 = getCpuQcomp(matr.elems[1][3]); + + cpu_qcomp m20 = getCpuQcomp(matr.elems[2][0]); + cpu_qcomp m21 = getCpuQcomp(matr.elems[2][1]); + cpu_qcomp m22 = getCpuQcomp(matr.elems[2][2]); + cpu_qcomp m23 = getCpuQcomp(matr.elems[2][3]); + + cpu_qcomp m30 = getCpuQcomp(matr.elems[3][0]); + cpu_qcomp m31 = getCpuQcomp(matr.elems[3][1]); + cpu_qcomp m32 = getCpuQcomp(matr.elems[3][2]); + cpu_qcomp m33 = getCpuQcomp(matr.elems[3][3]); + + auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); @@ -520,10 +549,15 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve cpu_qcomp amp11 = amps[i11]; // amps[i_n] = sum_j matr.elems[n][j] amp[i_n] - amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; - amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; - amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; - amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; + // amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; + // amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; + // amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; + // amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; + + amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11; + amps[i01] = m10*amp00 + m11*amp01 + m12*amp10 + m13*amp11; + amps[i10] = m20*amp00 + m21*amp01 + m22*amp10 + m23*amp11; + amps[i11] = m30*amp00 + m31*amp01 + m32*amp10 + m33*amp11; } } diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index a426dfce..651ad0e4 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -127,16 +127,27 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) { // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!)) -template -std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { +std::array,2> getCpuQcompsMatr1(qcomp matr[2][2]) { - // detect brain-dead compiler inferencing (looking at you MSVC...) - static_assert(dim == 2 || dim == 4, "getCpuQcomps called with unexpected dim"); + // dumb and explicit here because MSVC + OpenMP breaks + // when templating this - not worth fixing here because + // we are considering a refactor which merges cpu_types.hpp + // with gpu_types.cuh anyway - std::array,dim> out; + std::array,2> out; - for (int i=0; i,4> getCpuQcompsMatr2(qcomp matr[4][4]) { + + std::array,4> out; + + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) out[i][j] = getCpuQcomp(matr[i][j]); return out; @@ -187,4 +198,85 @@ static_assert(std::is_trivially_copyable_v); // casting is safe for all circumstances (e.g. heap mem, static lists) + + + +INLINE cpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { + + // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() + // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have + // Paulis on any of the 64 sites, but the latter's PauliStr is always + // constrainted to the lower 32 sites (because a 32-qubit density matrix + // is already too large for the world's computers). As such, the latter + // scenario can be optimised since str.highPaulis == 0, making the second + // loop below redundant. Avoiding this loop can at most half the runtime, + // though opens the risk that the former caller erroneously has its upper + // Paulis ignore. We forego this optimisation in defensive design, and + // because this function is only invoked during data structure initilisation + // and ergo infrequently. + + // regrettably duplicated from paulis.cpp which is inaccessible here + constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; + + // T-agnostic complex literals + cpu_qcomp p0, p1,n1, pI,nI; + p0 = {0, 0}; // 0 + p1 = {+1, 0}; // 1 + n1 = {-1, 0}; // -1 + pI = {0, +1}; // i + nI = {0, -1}; // -i + + // 'matrices' below is not declared constexpr or static const, even though + // it is fixed/known at compile-time, because this makes it incompatible + // with CUDA kernels/thrust. It is instead left as runtime innitialisation + // but this poses no real slowdown; this function, and its caller, are inlined + // so these 16 amps are re-processed one for each full enumeration of the + // PauliStrSum which is expected to have significantly more terms/coeffs + cpu_qcomp matrices[][2][2] = { + {{p1,p0},{p0,p1}}, // I + {{p0,p1},{p1,p0}}, // X + {{p0,nI},{pI,p0}}, // Y + {{p1,p0},{p0,n1}}}; // Z + + cpu_qcomp elem = p1; // 1 + + // could be compile-time unrolled into 32 iterations + for (int t=0; t unpackMatrixToGpuQcomps(CompMatr2 in) { } + + + + + + + +INLINE gpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { + + // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() + // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have + // Paulis on any of the 64 sites, but the latter's PauliStr is always + // constrainted to the lower 32 sites (because a 32-qubit density matrix + // is already too large for the world's computers). As such, the latter + // scenario can be optimised since str.highPaulis == 0, making the second + // loop below redundant. Avoiding this loop can at most half the runtime, + // though opens the risk that the former caller erroneously has its upper + // Paulis ignore. We forego this optimisation in defensive design, and + // because this function is only invoked during data structure initilisation + // and ergo infrequently. + + // regrettably duplicated from paulis.cpp which is inaccessible here + constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; + + // T-agnostic complex literals + gpu_qcomp p0, p1,n1, pI,nI; + p0 = {0, 0}; // 0 + p1 = {+1, 0}; // 1 + n1 = {-1, 0}; // -1 + pI = {0, +1}; // i + nI = {0, -1}; // -i + + // 'matrices' below is not declared constexpr or static const, even though + // it is fixed/known at compile-time, because this makes it incompatible + // with CUDA kernels/thrust. It is instead left as runtime innitialisation + // but this poses no real slowdown; this function, and its caller, are inlined + // so these 16 amps are re-processed one for each full enumeration of the + // PauliStrSum which is expected to have significantly more terms/coeffs + gpu_qcomp matrices[][2][2] = { + {{p1,p0},{p0,p1}}, // I + {{p0,p1},{p1,p0}}, // X + {{p0,nI},{pI,p0}}, // Y + {{p1,p0},{p0,n1}}}; // Z + + gpu_qcomp elem = p1; // 1 + + // could be compile-time unrolled into 32 iterations + for (int t=0; t:/EHsc> + $<$:/bigobj> + $<$:/Zm500> +) +add_compile_definitions(CATCH_CONFIG_DISABLE_STRINGIFICATION) +add_compile_definitions(CATCH_CONFIG_FAST_COMPILE) + + add_subdirectory(unit) add_subdirectory(utils) add_subdirectory(integration)