From 15028bafe127e483f964c4be918c2fdb59e2e443 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 17 Apr 2026 15:52:06 -0400 Subject: [PATCH 01/16] created cpu_qcomp --- CMakeLists.txt | 65 ---- quest/include/types.h | 7 +- quest/src/core/fastmath.hpp | 51 +-- quest/src/cpu/cpu_subroutines.cpp | 598 ++++++++++++++++++++---------- quest/src/cpu/cpu_types.hpp | 174 +++++++++ utils/scripts/compile.sh | 2 - 6 files changed, 590 insertions(+), 307 deletions(-) create mode 100644 quest/src/cpu/cpu_types.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d3087950..dc209ca5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -469,71 +469,6 @@ set(COMPILE_HIP ${ENABLE_HIP}) -# ============================ -# Patch CPU performance -# ============================ - - -# Patch performance of CPU std::complex arithmetic operator overloads. -# The cpu_subroutines.cpp file makes extensive use of std::complex operator -# overloads, and alas these are significantly slower than hand-rolled -# arithmetic, due to their NaN and inf checks, and interference with SIMD. -# It is crucial to pass additional optimisation flags to this file to restore -# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory, -# we can achieve this with specific, relatively 'safe' flags such as LLVM's: -# -ffinite-math-only -fno-signed-zeros -ffp-contract=fast -# However, it is a nuisance to find equivalent flags for different compilers -# and monitor their performance vs accuracy trade-offs. So instead, we use the -# much more aggressive and ubiquitous -Ofast flag to guarantee performance. -# This introduces many potentially dangerous optimisations, such as asserting -# associativity of flops, which would break techniques like Kahan summation. -# The cpu_subroutines.cpp must ergo be very conscious of these optimisations. -# We here also explicitly inform the file cpu_subroutines.cpp whether or not -# we are passing the flags, so it can detect/error when flags are forgotten. 
- -if (CMAKE_BUILD_TYPE STREQUAL "Release") - - # Release build will pass -Ofast when known for the given compiler, and - # fallback to giving a performance warning and proceeding with compilation - - if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|HP|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang") - set(patch_flags "-Ofast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - elseif (CMAKE_CXX_COMPILER_ID MATCHES "HP") - set(patch_flags "+Ofast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - set(patch_flags "/fp:fast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - else() - message(WARNING - "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been " - "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, " - "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are " - "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help." - ) - set(patch_flags "") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") - endif() - -else() - - # Non-release builds (e.g. 
Debug) will pass no optimisation flags, and will - # communicate to cpu_subroutines.cpp that this is intentional via a macro - - set(patch_flags "") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") - -endif() - -set_source_files_properties( - quest/src/cpu/cpu_subroutines.cpp - PROPERTIES - COMPILE_FLAGS "${patch_flags} ${patch_macro}" -) - - - # ============================ # Pass files to library # ============================ diff --git a/quest/include/types.h b/quest/include/types.h index 066d35e9d..f1f49315d 100644 --- a/quest/include/types.h +++ b/quest/include/types.h @@ -158,9 +158,10 @@ static inline qcomp getQcomp(qreal re, qreal im) { // not the same precision as qcomp, so compilation will fail depending // on the setting of PRECISION. To avoid this, we'll define overloads // between all type/precision permutations, always returning qcomp. These - // overloads are also used by the QuEST source code. Via the unholy macros - // below, we create 312 overloads; no doubt this is going to break something - // in the future, for which I am already sorry :'( + // overloads are also used by the QuEST source code (though incidentally, + // not the high performance backend which uses custom complex overloads). + // Via the unholy macros below, we create 312 overloads; no doubt this is + // going to break something in the future, for which I am already sorry :'( /// @cond EXCLUDE_FROM_DOXYGEN diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 6367e116a..0dbcee9ee 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -18,27 +18,6 @@ -/* - * TYPE ALIASING - */ - - -// 'qcomp' cannot be used inside CUDA kernels/thrust, so must not appear in -// these inlined definitions. 
Instead, we create an alias which will resolve -// to 'qcomp' (defined in types.h) when parsed by the CPU backend, and 'cu_qcomp' -// (defined in gpu_types.cuh which is not explicitly resolved in this header) -// when parsed by the GPU backend, which will prior define USE_CU_QCOMP. It is -// essential this header is included after gpu_types.cuh is included by the -// GPU backend. Hacky, but avoids code duplication! - -#ifdef USE_CU_QCOMP - #define QCOMP_ALIAS cu_qcomp -#else - #define QCOMP_ALIAS qcomp -#endif - - - /* * ARITHMETIC */ @@ -121,7 +100,9 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ */ -INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { +// T = qcomp, cpu_qcomp, gpu_qcomp +template +INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have @@ -133,13 +114,13 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // though opens the risk that the former caller erroneously has its upper // Paulis ignore. We forego this optimisation in defensive design, and // because this function is only invoked during data structure initilisation - // and ergo infrequently.s + // and ergo infrequently. 
// regrettably duplicated from paulis.cpp which is inaccessible here constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; - // QCOMP_ALIAS-agnostic literals - QCOMP_ALIAS p0, p1,n1, pI,nI; + // T-agnostic complex literals + T p0, p1,n1, pI,nI; p0 = {0, 0}; // 0 p1 = {+1, 0}; // 1 n1 = {-1, 0}; // -1 @@ -152,20 +133,20 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // but this poses no real slowdown; this function, and its caller, are inlined // so these 16 amps are re-processed one for each full enumeration of the // PauliStrSum which is expected to have significantly more terms/coeffs - QCOMP_ALIAS matrices[][2][2] = { + T matrices[][2][2] = { {{p1,p0},{p0,p1}}, // I {{p0,p1},{p1,p0}}, // X {{p0,nI},{pI,p0}}, // Y {{p1,p0},{p0,n1}}}; // Z - QCOMP_ALIAS elem = p1; // 1 + T elem = p1; // 1 // could be compile-time unrolled into 32 iterations for (int t=0; t +INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot // be directly processed in CUDA kernels/thrust due to its 'qcomp' field. // it also assumes str.highPaulis==0 for all str in strings, as per above func. - QCOMP_ALIAS elem = {0, 0}; // type-agnostic literal + T elem = {0, 0}; // type-agnostic complex literal // this loop is expected exponentially smaller than caller's loop for (qindex n=0; n(strings[n], row, col); return elem; } - -// avoid exposing alias macro outside header -#undef QCOMP_ALIAS - #endif // FASTMATH_HPP \ No newline at end of file diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 13bf6eecf..8cde53104 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2,11 +2,8 @@ * CPU OpenMP-accelerated definitions of the main backend simulation routines, * as mirrored by gpu_subroutines.cpp, and called by accelerator.cpp. 
* - * BEWARE that this specific file receives additional compiler optimisation flags - * in order to counteract a performance issue in the use of std::complex operator - * overloads. These flags (like -Ofast) may induce assumed associativity of qcomp - * algebra, breaking techniques like Kahan summation. As such, this file CANNOT - * assume IEEE floating-point behaviour. + * These 'hot-loop' functions use cpu_qcomp arithmetic operators, in lieu of qcomp + * (i.e. std::complex) which has compiler-specific performance pitfalls. * * Some of these definitions are templated, defining multiple versions optimised * (at compile-time) for handling different numbers of input qubits; such functions @@ -34,6 +31,7 @@ #include "quest/src/core/randomiser.hpp" #include "quest/src/core/accelerator.hpp" #include "quest/src/core/autodeployer.hpp" +#include "quest/src/cpu/cpu_types.hpp" #include "quest/src/cpu/cpu_config.hpp" #include "quest/src/cpu/cpu_subroutines.hpp" #include "quest/src/comm/comm_config.hpp" @@ -45,26 +43,16 @@ using std::vector; -/* - * Beware that this file makes extensive use of std::complex (qcomp) operator - * overloads and so requires additional compiler flags to achieve hand-rolled - * arithmetic performance; otherwise a 3-50x slowdown may be observed. We here - * enforce that these flags were not forgotton (but may be deliberatedly avoided). - * Beware these flags may induce associativity and break e.g. Kakan summation. - */ -#if !defined(COMPLEX_OVERLOADS_PATCHED) - #error "Crucial, bespoke optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which are necessary for full complex arithmetic performance." 
- -#elif !COMPLEX_OVERLOADS_PATCHED - #if defined(_MSC_VER) - #pragma message("Warning: The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance.") - #else - #warning "The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance." - #endif -#endif + // TODO: + // try to restore OpenMP custom reduction for cpu_qcomp?? + // may work on modern MSVC compilers now, negating the need + // to do the indvidual real/imag reduction + + + // TODO: move numIts closer to the for loops! @@ -105,6 +93,10 @@ void cpu_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) { qindex numIts = qureg.numAmpsPerNode; qindex dim = powerOf2(qureg.numQubits); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* coeffs = getCpuQcompPtr(sum.coeffs); + #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n qubitInds, vecto // amplitudes are packed at an offset into the buffer qindex offset = getSubBufferSendInd(qureg); + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedQubitInds = util_getSorted(qubitInds); auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); @@ -240,7 +241,7 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubitInds, vecto qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask); // pack the potentially-strided amplitudes into a contiguous sub-buffer - qureg.cpuCommBuffer[offset + n] = qureg.cpuAmps[i]; + buffer[offset + n] = amps[i]; } // return the number of packed amps @@ -258,6 +259,10 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu // amplitudes are packed at an offset into the buffer qindex offset = getSubBufferSendInd(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n ctrls, vector c // each control qubit halves the number of iterations, each of which modifies 2 amplitudes, and skips 2 qindex numIts = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(ctrls, {targ2, targ1}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); @@ -305,7 +313,7 @@ void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex i01 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask); qindex i10 = flipTwoBits(i01, targ2, targ1); - std::swap(qureg.cpuAmps[i01], qureg.cpuAmps[i10]); + std::swap(amps[i01], amps[i10]); } } @@ -321,6 +329,10 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -337,7 +349,7 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex j = n + offset; // unpack the continuous sub-buffer among the strided local amplitudes - qureg.cpuAmps[i] = qureg.cpuCommBuffer[j]; + amps[i] = buffer[j]; } } @@ -353,6 +365,10 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); @@ -370,7 +386,7 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex j = n + offset; // unpack the continuous sub-buffer among the strided local amplitudes - qureg.cpuAmps[i] = qureg.cpuCommBuffer[j]; + amps[i] = buffer[j]; } } @@ 
-394,6 +410,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // each control qubit halves the needed iterations, and each iteration modifies two amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto elems = getCpuQcomps(matr.elems); + auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -409,11 +429,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v qindex i1 = flipBit(i0, targ); // note the two amplitudes are likely strided and not adjacent (separated by 2^t) - qcomp amp0 = qureg.cpuAmps[i0]; - qcomp amp1 = qureg.cpuAmps[i1]; + cpu_qcomp amp0 = amps[i0]; + cpu_qcomp amp1 = amps[i1]; - qureg.cpuAmps[i0] = matr.elems[0][0]*amp0 + matr.elems[0][1]*amp1; - qureg.cpuAmps[i1] = matr.elems[1][0]*amp0 + matr.elems[1][1]*amp1; + amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; + amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; } } @@ -429,6 +449,12 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + cpu_qcomp f0 = getCpuQcomp(fac0); + cpu_qcomp f1 = getCpuQcomp(fac1); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -444,7 +470,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v // j = index of nth received amplitude from pair rank in buffer qindex j = n + offset; - qureg.cpuAmps[i] = fac0*qureg.cpuAmps[i] + fac1*qureg.cpuCommBuffer[j]; + amps[i] = f0*amps[i] + f1*buffer[j]; } } @@ -467,6 +493,10 @@ void 
cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // each control qubit halves the needed iterations, and each iteration modifies four amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto elems = getCpuQcomps(matr.elems); + auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); @@ -484,16 +514,16 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex i11 = flipBit(i01, targ2); // note amplitudes are not necessarily adjacent, nor uniformly spaced - qcomp amp00 = qureg.cpuAmps[i00]; - qcomp amp01 = qureg.cpuAmps[i01]; - qcomp amp10 = qureg.cpuAmps[i10]; - qcomp amp11 = qureg.cpuAmps[i11]; - - // amps[i_n] = sum_j elems[n][j] amp[i_n] - qureg.cpuAmps[i00] = matr.elems[0][0]*amp00 + matr.elems[0][1]*amp01 + matr.elems[0][2]*amp10 + matr.elems[0][3]*amp11; - qureg.cpuAmps[i01] = matr.elems[1][0]*amp00 + matr.elems[1][1]*amp01 + matr.elems[1][2]*amp10 + matr.elems[1][3]*amp11; - qureg.cpuAmps[i10] = matr.elems[2][0]*amp00 + matr.elems[2][1]*amp01 + matr.elems[2][2]*amp10 + matr.elems[2][3]*amp11; - qureg.cpuAmps[i11] = matr.elems[3][0]*amp00 + matr.elems[3][1]*amp01 + matr.elems[3][2]*amp10 + matr.elems[3][3]*amp11; + cpu_qcomp amp00 = amps[i00]; + cpu_qcomp amp01 = amps[i01]; + cpu_qcomp amp10 = amps[i10]; + cpu_qcomp amp11 = amps[i11]; + + // amps[i_n] = sum_j matr.elems[n][j] amp[i_n] + amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; + amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; + amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; + amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; } } @@ -534,6 +564,10 @@ void 
cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // each control qubit halves iterations, each of which modifies 2^(targs.size()) amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElemsFlat); + // prepare a mask which yields ctrls in specified state, and targs in all-zero auto sortedQubits = util_getSorted(ctrls, targs); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); @@ -550,7 +584,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #pragma omp parallel if(qureg.isMultithreaded) { // create a private cache for every thread (might be compile-time sized, and in heap or stack) - vector cache(numTargAmps); + vector cache(numTargAmps); #pragma omp for for (qindex n=0; n ctrls, ve // i = nth local index where ctrls are active and targs form value j qindex i = setBits(i0, targs.data(), numTargBits, j); // loop may be unrolled - cache[j] = qureg.cpuAmps[i]; + cache[j] = amps[i]; } // modify each amplitude (loop might be unrolled) @@ -571,7 +605,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs.data(), numTargBits, k); // loop may be unrolled - qureg.cpuAmps[i] = 0; + amps[i] = getCpuQcomp(0, 0); // loop may be unrolled for (qindex j=0; j ctrls, ve l = fast_getMatrixFlatIndex(j, k, numTargAmps); else l = fast_getMatrixFlatIndex(k, j, numTargAmps); - - qcomp elem = matr.cpuElemsFlat[l]; + cpu_qcomp elem = elems[l]; // optionally conjugate matrix elems on the fly to avoid pre-modifying heap structure if constexpr (ApplyConj) - elem = std::conj(elem); + elem = conj(elem); - qureg.cpuAmps[i] += elem * cache[j]; + amps[i] += elem * cache[j]; /// @todo /// qureg.cpuAmps[i] is 
being serially updated by only this thread, /// so is a candidate for Kahan summation for improved numerical /// stability. Explore whether this is time-free and worthwhile! - /// - /// BEWARE that Kahan summation is incompatible with the optimisation - /// flags currently passed to this file } } } @@ -622,6 +652,15 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.elems); + + // TODO: + // I am paranoid the above casting from qcomp* to cpu_qcomp* is + // more dangerous for the static matr.elems than the heap pointer. + // Is it more vulnerable to misalignment? Check this! + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -638,7 +677,7 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode); int b = getBit(i, targ); - qureg.cpuAmps[j] *= matr.elems[b]; + amps[j] *= elems[b]; } } @@ -660,6 +699,15 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.elems); + + // TODO: + // I am paranoid the above casting from qcomp* to cpu_qcomp* is + // more dangerous for the static matr.elems than the heap pointer. + // Is it more vulnerable to misalignment? Check this! 
+ auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -676,7 +724,7 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode); int k = getTwoBits(i, targ2, targ1); - qureg.cpuAmps[j] *= matr.elems[k]; + amps[j] *= elems[k]; } } @@ -700,6 +748,11 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -718,7 +771,7 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // t = value of targeted bits, which may be in the prefix substate qindex t = getValueOfBits(i, targs.data(), numTargBits); - qcomp elem = matr.cpuElems[t]; + cpu_qcomp elem = elems[t]; // decide whether to power and conj at compile-time, to avoid branching in hot-loop. // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) @@ -726,13 +779,13 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // and negative, and the exponent is an integer. We tolerate this heightened error // because we have no reason to think matr is real (it's not constrained Hermitian). 
if constexpr (HasPower) - elem = std::pow(elem, exponent); + elem = pow(elem, expo); // cautiously conjugate AFTER exponentiation, else we must also conj exponent if constexpr (ApplyConj) - elem = std::conj(elem); + elem = conj(elem); - qureg.cpuAmps[j] *= elem; + amps[j] *= elem; } } @@ -762,10 +815,15 @@ void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // every iteration modifies one amp, using one element qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + #pragma omp parallel for if(qureg.isMultithreaded||qureg.isMultithreaded) for (qindex n=0; n (matr * rho) or (matr^exponent * rho) if constexpr (ApplyLeft) { // i = global row of nth local amp qindex i = fast_getQuregGlobalRowFromFlatIndex(n, matr.numElems); - qcomp term = matr.cpuElems[i]; + cpu_qcomp term = elems[i]; // compile-time decide if applying power to avoid in-loop branching... // (beware that complex pow() is numerically unstable as detailed below) if constexpr (HasPower) - term = std::pow(term, exponent); + term = pow(term, expo); fac = term; } @@ -826,23 +889,23 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // j = global column corresponding to n qindex j = fast_getQuregGlobalColFromFlatIndex(m, matr.numElems); - qcomp term = matr.cpuElems[j]; + cpu_qcomp term = elems[j]; // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) // (by producing an unexpected non-zero imaginary component) when the base is real // and negative, and the exponent is an integer. We tolerate this heightened error // because we have no reason to think matr is real (it's not constrained Hermitian). 
if constexpr (HasPower) - term = std::pow(term, exponent); + term = pow(term, expo); // conj strictly after pow, to effect conj(matr^exponent) if constexpr (ConjRight) - term = std::conj(term); + term = conj(term); fac *= term; } - qureg.cpuAmps[n] *= fac; + amps[n] *= fac; } } @@ -866,8 +929,8 @@ template void cpu_densmatr_allTargDiagMatr_sub (Qure template INLINE void applyPauliUponAmpPair( - Qureg qureg, qindex v, qindex i0, int* indXY, int numXY, - qindex maskXY, qindex maskYZ, qcomp ampFac, qcomp pairAmpFac + cpu_qcomp* amps, qindex v, qindex i0, int* indXY, int numXY, + qindex maskXY, qindex maskYZ, cpu_qcomp ampFac, cpu_qcomp pairAmpFac ) { // this is a subroutine of cpu_statevector_anyCtrlPauliTensorOrGadget_subA() below // called in a hot-loop (hence it is here inlined) which exists because the caller @@ -885,14 +948,13 @@ INLINE void applyPauliUponAmpPair( int signB = fast_getPlusOrMinusMaskedBitParity(iB, maskYZ); // mix or swap scaled amp pair (where pairAmpFac includes Y's i factor) - qcomp ampA = qureg.cpuAmps[iA]; - qcomp ampB = qureg.cpuAmps[iB]; - qureg.cpuAmps[iA] = (ampFac * ampA) + (pairAmpFac * signB * ampB); - qureg.cpuAmps[iB] = (ampFac * ampB) + (pairAmpFac * signA * ampA); + cpu_qcomp ampA = amps[iA]; + cpu_qcomp ampB = amps[iB]; + amps[iA] = (ampFac * ampA) + (pairAmpFac * signB * ampB); + amps[iB] = (ampFac * ampB) + (pairAmpFac * signA * ampA); } - template void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( Qureg qureg, vector ctrls, vector ctrlStates, @@ -900,6 +962,14 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( ) { assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); assert_numTargsMatchesTemplateParam(x.size() + y.size(), NumTargs); + + // we will scale pairAmp below by i^numY, so that each amp need only choose the +-1 sign + pairAmpFac *= util_getPowerOfI(y.size()); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + 
cpu_qcomp f0 = getCpuQcomp(ampFac); + cpu_qcomp f1 = getCpuQcomp(pairAmpFac); // only X and Y count as targets vector sortedTargsXY = util_getSorted(util_getConcatenated(x, y)); @@ -912,9 +982,6 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( auto maskXY = util_getBitMask(sortedTargsXY); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - // we will scale pairAmp below by i^numY, so that each amp need only choose the +-1 sign - pairAmpFac *= util_getPowerOfI(y.size()); - // use template params to compile-time unroll loops in insertBits() and inner-loop below SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, sortedTargsXY.size()); @@ -950,7 +1017,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( // serial for (qindex v=0; v( - qureg, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, ampFac, pairAmpFac); + amps, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, f0, f1); } } else { @@ -965,7 +1032,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( #pragma omp parallel for for (qindex v=0; v( - qureg, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, ampFac, pairAmpFac); + amps, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, f0, f1); } } } @@ -978,8 +1045,17 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB( ) { assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); + // we will scale pairAmp by i^numY, so that each amp need only choose the +-1 sign + pairAmpFac *= util_getPowerOfI(y.size()); + // each control qubit halves the needed iterations qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + cpu_qcomp f0 = getCpuQcomp(ampFac); + cpu_qcomp f1 = getCpuQcomp(pairAmpFac); // received amplitudes may begin at 
an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); @@ -992,9 +1068,6 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB( // use template param to compile-time unroll loop in insertBits() SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); - // we will scale pairAmp by i^numY, so that each amp need only choose the +-1 sign - pairAmpFac *= util_getPowerOfI(y.size()); - #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n coeffs qindex numIts = outQureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + cpu_qcomp* inFacs = getCpuQcompPtr(coeffs.data()); + // use template param to compile-time unroll inner loop below SET_VAR_AT_COMPILE_TIME(int, numQuregs, NumQuregs, inQuregs.size()); @@ -1076,13 +1156,13 @@ void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs for (qindex n=0; n coeffs void cpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr) { qindex numIts = outQureg.numAmpsPerNode; - qcomp* out = outQureg.cpuAmps; - qcomp* in = inDensMatr.cpuAmps; + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + cpu_qcomp* inAmps = getCpuQcompPtr(inDensMatr.cpuAmps); #pragma omp parallel for if(outQureg.isMultithreaded) for (qindex n=0; n ta // each outer iteration sets one element of outQureg qindex numOuterIts = outQureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* inAmps = getCpuQcompPtr(inQureg.cpuAmps); + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + // targs and allTargs are sorted, but pairTargs is arbitrarily ordered (though corresponding targs) auto allTargsSorted = util_getSorted(targs, pairTargs); @@ -1750,7 +1886,7 @@ void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta qindex k = insertBits(n, allTargsSorted.data(), numAllTargs, 0); // loop 
may be unrolled // each outQureg amp results from summing 2^targs inQureg amps - qcomp outAmp = 0; + cpu_qcomp outAmp = getCpuQcomp(0,0); // loop may be unrolled for (qindex j=0; j ta i = setBits(i, targs .data(), numTargPairs, j); // loop may be unrolled i = setBits(i, pairTargs.data(), numTargPairs, j); // loop may be unrolled - outAmp += inQureg.cpuAmps[i]; + outAmp += inAmps[i]; } - outQureg.cpuAmps[n] = outAmp; + outAmps[n] = outAmp; } } @@ -1797,9 +1933,12 @@ qreal cpu_statevec_calcTotalProb_sub(Qureg qureg) { // every amp, iterated independently, contributes to the probability qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded) for (qindex n=0; n qubi // (>=1 since all qubits are in suffix, so qubits.size() <= suffix size) qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(qubits); // all in suffix auto qubitStateMask = util_getBitMask(qubits, outcomes); @@ -1862,7 +2007,7 @@ qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi // i = nth local index where qubits are in the specified outcome state qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask); - prob += std::norm(qureg.cpuAmps[i]); + prob += norm(amps[i]); } return prob; @@ -1884,6 +2029,9 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(qubits); // all in suffix, with corresponding bra's all in suffix auto qubitStateMask = 
util_getBitMask(qubits, outcomes); @@ -1899,7 +2047,7 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi // j = local, flat, density-matrix index of diagonal amp corresponding to state i qindex j = fast_getQuregLocalIndexOfDiagonalAmp(i, firstDiagInd, numAmpsPerCol); - prob += std::real(qureg.cpuAmps[j]); + prob += real(amps[j]); } return prob; @@ -1914,6 +2062,9 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu // every amp contributes to a statevector prob qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + // use template param to compile-time unroll loop in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); qindex numOutcomes = powerOf2(numBits); @@ -1930,7 +2081,7 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n or if constexpr (Conj) { - rhoAmp = std::conj(rhoAmp); - colAmp = std::conj(colAmp); + rhoAmp = conj(rhoAmp); + colAmp = conj(colAmp); } else - rowAmp = std::conj(rowAmp); + rowAmp = conj(rowAmp); - qcomp term = rhoAmp * rowAmp * colAmp; - fidRe += std::real(term); - fidIm += std::imag(term); + cpu_qcomp term = rhoAmp * rowAmp * colAmp; + fidRe += real(term); + fidIm += imag(term); } return qcomp(fidRe, fidIm); @@ -2095,11 +2261,14 @@ qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { qindex numIts = qureg.numAmpsPerNode; qindex targMask = util_getBitMask(targs); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + #pragma omp parallel for reduction(+:value) if(qureg.isMultithreaded) for (qindex n=0; n targs) { qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + 
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + qindex targMask = util_getBitMask(targs); #pragma omp parallel for reduction(+:valueRe,valueIm) if(qureg.isMultithreaded) @@ -2128,10 +2300,10 @@ qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { // r = global row of nth local diagonal, which determines amp sign qindex r = n + firstDiagInd; int sign = fast_getPlusOrMinusMaskedBitParity(r, targMask); - qcomp term = sign * qureg.cpuAmps[i]; + cpu_qcomp term = sign * amps[i]; - valueRe += std::real(term); - valueIm += std::imag(term); + valueRe += real(term); + valueIm += imag(term); } return qcomp(valueRe, valueIm); @@ -2147,6 +2319,9 @@ qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector x, vector x, vector qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + // these masks indicate global paulis (i.e. 
not just suffix) qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); @@ -2237,10 +2419,10 @@ qcomp cpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector // sign = +-1 induced by Y and Z (excludes Y's imaginary factors) int sign = fast_getPlusOrMinusMaskedBitParity(i, maskYZ); - qcomp term = sign * qureg.cpuAmps[m]; + cpu_qcomp term = sign * amps[m]; - valueRe += std::real(term); - valueIm += std::imag(term); + valueRe += real(term); + valueIm += imag(term); } // scale by i^numY (because sign above exlcuded i) @@ -2267,10 +2449,15 @@ qcomp cpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr // every amp, iterated independently, contributes to the expectation value qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + #pragma omp parallel for reduction(+:valueRe,valueIm) if(qureg.isMultithreaded||matr.isMultithreaded) for (qindex n=0; n qubits, vecto // visit every amp, setting to zero or multiplying it by renorm qindex numIts = qureg.numAmpsPerNode; - qreal renorm = 1 / std::sqrt(prob); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // binary value of targeted qubits in basis states which are to be retained qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + qreal renorm = 1 / std::sqrt(prob); // use template param to compile-time unroll loop in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); @@ -2383,8 +2578,7 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto qindex val = getValueOfBits(n, qubits.data(), numBits); // multiply amp with renorm or zero, if qubit value matches or disagrees - qcomp fac = renorm * (val == retainValue); - 
qureg.cpuAmps[n] *= fac; + amps[n] *= renorm * (val == retainValue); } } @@ -2401,10 +2595,13 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto // visit every amp, setting most to zero and multiplying the remainder by renorm qindex numIts = qureg.numAmpsPerNode; - qreal renorm = 1 / prob; + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // binary value of targeted qubits in basis states which are to be retained qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + qreal renorm = 1 / prob; // use template param to compile-time unroll loops in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); @@ -2423,8 +2620,7 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto qindex v2 = getValueOfBits(c, qubits.data(), numBits); // multiply amp with renorm or zero if values disagree with given outcomes - qcomp fac = renorm * (v1 == v2) * (retainValue == v1); - qureg.cpuAmps[n] *= fac; + amps[n] *= renorm * (v1 == v2) * (retainValue == v1); } } @@ -2453,7 +2649,7 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { qureg.numAmpsPerNode, numAmpsPerPage, cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); - std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); // no cpu_qcomp cast needed } } @@ -2468,7 +2664,7 @@ void cpu_statevec_initDebugState_sub(Qureg qureg) { // i = global index of nth local amp qindex i = concatenateBits(qureg.rank, n, qureg.logNumAmpsPerNode); - qureg.cpuAmps[n] = qcomp(2*i/10., (2*i+1)/10.); + qureg.cpuAmps[n] = qcomp(2*i/10., (2*i+1)/10.); // no cpu_qcomp cast needed } } @@ -2493,6 +2689,6 @@ void cpu_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qureg) #pragma omp for for (qindex i=0; i + + + +/* + * COMPLEX SCALAR + * + * The user-facing qcomp (which in the QuEST middle-end, resolves 
to + * a std::complex) is not used by the CPU backend, since it creates + * performance pitfalls (e.g. expensive NaN checks within arithmetic + * operators) in some compilers. Instead, we use the below custom + * complex type and operator overloads. + */ + + +struct cpu_qcomp { + + // memory layout + qreal re; + qreal im; + + // in-place complex arithmetic overloads + INLINE cpu_qcomp& operator += (const cpu_qcomp& a) noexcept { + re += a.re; + im += a.im; + return *this; + } + INLINE cpu_qcomp& operator -= (const cpu_qcomp& a) noexcept { + re -= a.re; + im -= a.im; + return *this; + } + INLINE cpu_qcomp& operator *= (const cpu_qcomp& a) noexcept { + qreal re_ = re; + qreal im_ = im; + re = (re_ * a.re) - (im_ * a.im); + im = (re_ * a.im) + (im_ * a.re); + return *this; + } + + // in-place mixed-type arithmetic overloads + INLINE cpu_qcomp& operator *= (const int& a) noexcept { + re *= a; + im *= a; + return *this; + } + INLINE cpu_qcomp& operator *= (const qreal& a) noexcept { + re *= a; + im *= a; + return *this; + } +}; + + +// out-of-place complex arithmetic overloads (optimised) +INLINE cpu_qcomp operator + (cpu_qcomp a, const cpu_qcomp& b) noexcept { + a += b; + return a; +} +INLINE cpu_qcomp operator - (cpu_qcomp a, const cpu_qcomp& b) noexcept { + a -= b; + return a; +} +INLINE cpu_qcomp operator * (cpu_qcomp a, const cpu_qcomp& b) noexcept { + a *= b; + return a; +} + + +// out-of-place mixed-type arithmetic overloads +INLINE cpu_qcomp operator * (cpu_qcomp a, const int& b) noexcept { + a *= b; + return a; +} +INLINE cpu_qcomp operator * (cpu_qcomp a, const qreal& b) noexcept { + a *= b; + return a; +} + + +// reverse order of out-of-place mixed-type arithmetic (via commutation) +INLINE cpu_qcomp operator * (const int& a, const cpu_qcomp& b) noexcept { + return b * a; +} +INLINE cpu_qcomp operator * (const qreal& a, const cpu_qcomp& b) noexcept { + return b * a; +} + + +// no-op cast of pointers +INLINE cpu_qcomp* getCpuQcompPtr(qcomp* list) { + + return 
reinterpret_cast(list); +} + + +// creator for cpu_qcomp literals +INLINE cpu_qcomp getCpuQcomp(qreal re, qreal im) { + return { re, im }; +} + +// creator for qcomp conversion +INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { + return { a.real(), a.imag() }; +} + +// creator for fixed-size dense matrices (CompMatr1 and CompMatr2) +template +INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { + + std::array,dim> out; + + for (int i=0; i(base); + auto& expo_ = reinterpret_cast(expo); + qcomp out = std::pow(base_, expo_); + return reinterpret_cast(out); +} + + +// check the memory layout of cpu_qcomp agrees with qcomp, since +// it is not formally gauranteed, unlike _Complex and std::complex +static_assert(sizeof (cpu_qcomp) == sizeof (qcomp)); +static_assert(alignof(cpu_qcomp) == alignof(qcomp)); +static_assert(std::is_standard_layout_v ); +static_assert(std::is_trivially_copyable_v); + + +// TODO: +// the above checks are potentially inadequate to identify an +// insidious incompatibility between qcomp and cpu_qcomp - perhaps +// we should perform a compile-time duck-check, casting a small +// array between them and checking no data is corrupted? Perhaps +// a runtime check in initQuESTEnv() is also necessary, checking the +// casting is safe for all circumstances (e.g. 
heap mem, static lists) + + +#endif // CPU_TYPES_HPP \ No newline at end of file diff --git a/utils/scripts/compile.sh b/utils/scripts/compile.sh index 8eca0ae27..609f85be3 100755 --- a/utils/scripts/compile.sh +++ b/utils/scripts/compile.sh @@ -312,8 +312,6 @@ echo "deployment modes:" ALL_LINK_FLAGS="${USER_LINK_FLAGS}" # choose compiler and flags for CPU/OMP files -CPU_FILES_FLAGS='-Ofast -DCOMPLEX_OVERLOADS_PATCHED=1' - if [ $ENABLE_MULTITHREADING == 1 ] then echo "${INDENT}(multithreading enabled)" From ab8215c9336bf2136233dc652b15f0c85e88699c Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 17 Apr 2026 16:51:16 -0400 Subject: [PATCH 02/16] renamed cu_qcomp to gpu_qcomp just to isolate this big diff - structural changes to gpu_qcomp are coming --- quest/src/cpu/cpu_subroutines.cpp | 2 +- quest/src/cpu/cpu_types.hpp | 3 + quest/src/gpu/gpu_cuquantum.cuh | 36 +++--- quest/src/gpu/gpu_kernels.cuh | 194 ++++++++++++++--------------- quest/src/gpu/gpu_subroutines.cpp | 144 ++++++++++----------- quest/src/gpu/gpu_thrust.cuh | 200 +++++++++++++++--------------- quest/src/gpu/gpu_types.cuh | 98 +++++++-------- 7 files changed, 336 insertions(+), 341 deletions(-) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 8cde53104..077895f94 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -2399,7 +2399,7 @@ qcomp cpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); // use cpu_qcomp arithmetic overloads (avoid qcomp's) - cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // these masks indicate global paulis (i.e. 
not just suffix) qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index 3b5199c92..bc37a36df 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -112,11 +112,13 @@ INLINE cpu_qcomp getCpuQcomp(qreal re, qreal im) { return { re, im }; } + // creator for qcomp conversion INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { return { a.real(), a.imag() }; } + // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) template INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { @@ -130,6 +132,7 @@ INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][di return out; } + // maths functions INLINE qreal real(const cpu_qcomp& a) { return a.re; diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh index 6ba321000..f8343154a 100644 --- a/quest/src/gpu/gpu_cuquantum.cuh +++ b/quest/src/gpu/gpu_cuquantum.cuh @@ -182,7 +182,7 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< CUDA_CHECK( custatevecSwapIndexBits( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, targPairs, numTargPairs, // swap mask params seem to be in the reverse order to the remainder of the cuStateVec API @@ -199,7 +199,7 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< */ -void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, cu_qcomp* flatMatrElems, bool applyAdj) { +void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, gpu_qcomp* flatMatrElems, bool applyAdj) { // this funciton is called 'subA' instead of just 'sub', because it is also called in // the one-target case whereby it is strictly the embarrassingly parallel _subA scenario @@ -210,7 +210,7 @@ void 
cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector CUDA_CHECK( custatevecApplyMatrix( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, flatMatrElems, CUQUANTUM_QCOMP, CUSTATEVEC_MATRIX_LAYOUT_ROW, applyAdj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -222,7 +222,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector // there is no bespoke cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subB() -void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, cu_qcomp* flatMatrElems, bool conj) { +void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, gpu_qcomp* flatMatrElems, bool conj) { // beware that despite diagonal matrices being embarrassingly parallel, // the target qubits must still all be suffix-only to avoid a cuStateVec error @@ -239,7 +239,7 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl CUDA_CHECK( custatevecApplyGeneralizedPermutationMatrix( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, perm, flatMatrElems, CUQUANTUM_QCOMP, adj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -259,9 +259,9 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl void cuquantum_densmatr_oneQubitDephasing_subA(Qureg qureg, int qubit, qreal prob) { // effect the superoperator as a two-qubit diagonal upon a statevector suffix state - cu_qcomp a = {1, 0}; - cu_qcomp b = {1-2*prob, 0}; - cu_qcomp elems[] = {a, b, b, a}; + gpu_qcomp a = {1, 0}; + gpu_qcomp b = {1-2*prob, 0}; + gpu_qcomp elems[] = {a, b, b, a}; vector targs {qubit, util_getBraQubit(qubit,qureg)}; bool conj = false; @@ -275,8 +275,8 @@ 
void cuquantum_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal // equivalent to a state-controlled global phase upon a statevector, which is // itself a same-element one-qubit diagonal applied to any target int braBit = getBit(qureg.rank, ketQubit - qureg.logNumColsPerNode); - cu_qcomp fac = {1 - 2*prob, 0}; - cu_qcomp elems[] = {fac, fac}; + gpu_qcomp fac = {1 - 2*prob, 0}; + gpu_qcomp elems[] = {fac, fac}; // we choose to target the largest possible qubit, expecting best cuStateVec performance; // note it must still be a suffix qubit since cuQuantum does not know qureg is distributed @@ -296,9 +296,9 @@ void cuquantum_densmatr_twoQubitDephasing_subA(Qureg qureg, int qubitA, int qubi /// are the same, i.e. we skip |00><00|, |01><01|, |10><10|, |11><11| // effect the superoperator as a four-qubit diagonal upon a statevector suffix state - cu_qcomp a = {1, 0}; - cu_qcomp b = {1-4*prob/3, 0}; - cu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a}; + gpu_qcomp a = {1, 0}; + gpu_qcomp b = {1-4*prob/3, 0}; + gpu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a}; vector targs {qubitA, qubitB, util_getBraQubit(qubitA,qureg), util_getBraQubit(qubitB,qureg)}; bool conj = false; @@ -331,7 +331,7 @@ qreal cuquantum_statevec_calcTotalProb_sub(Qureg qureg) { CUDA_CHECK( custatevecAbs2SumOnZBasis( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &prob0, &prob1, &qubit, numQubits ) ); qreal total = prob0 + prob1; @@ -346,7 +346,7 @@ qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector(prob); @@ -367,7 +367,7 @@ void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qu CUDA_CHECK( custatevecAbs2SumArray( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outPtr, qubits.data(), qubits.size(), nullptr, 
nullptr, 0) ); // serially cast and copy output probabilities, if necessary @@ -409,7 +409,7 @@ qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vect CUDA_CHECK( custatevecComputeExpectationsOnPauliBasis( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &value, termPaulis, numTerms, termTargets, numPaulisPerTerm) ); return static_cast(value); @@ -432,7 +432,7 @@ void cuquantum_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, CUDA_CHECK( custatevecCollapseByBitString( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outcomes.data(), qubits.data(), qubits.size(), prob) ); } diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e4..dedfb9c88 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -22,12 +22,8 @@ #include "quest/include/types.h" #include "quest/src/core/bitwise.hpp" -#include "quest/src/gpu/gpu_types.cuh" - -// kernels/thrust must use cu_qcomp, never qcomp -#define USE_CU_QCOMP #include "quest/src/core/fastmath.hpp" -#undef USE_CU_QCOMP +#include "quest/src/gpu/gpu_types.cuh" #if ! COMPILE_CUDA #error "A file being compiled somehow included gpu_kernels.hpp despite QuEST not being compiled in GPU-accelerated mode." 
@@ -93,7 +89,7 @@ __forceinline__ __device__ int cudaGetBitMaskParity(qindex mask) { template __global__ void kernel_statevec_packAmpsIntoBuffer( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* qubits, int numQubits, qindex qubitStateMask ) { GET_THREAD_IND(n, numThreads); @@ -110,7 +106,7 @@ __global__ void kernel_statevec_packAmpsIntoBuffer( __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int qubit1, int qubit2, int qubit3, int bit2 ) { GET_THREAD_IND(n, numThreads); @@ -132,7 +128,7 @@ __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( template __global__ void kernel_statevec_anyCtrlSwap_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int targ1, int targ2 ) { GET_THREAD_IND(n, numThreads); @@ -146,7 +142,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( qindex i10 = flipTwoBits(i01, targ2, targ1); // swap amps - cu_qcomp amp01 = amps[i01]; + gpu_qcomp amp01 = amps[i01]; amps[i01] = amps[i10]; amps[i10] = amp01; } @@ -154,7 +150,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( template __global__ void kernel_statevec_anyCtrlSwap_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask ) { GET_THREAD_IND(n, numThreads); @@ -172,7 +168,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subB( template __global__ void kernel_statevec_anyCtrlSwap_subC( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlsAndTargMask ) { GET_THREAD_IND(n, numThreads); @@ -197,9 +193,9 @@ __global__ void kernel_statevec_anyCtrlSwap_subC( template __global__ void 
kernel_statevec_anyCtrlOneTargDenseMatr_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ, - cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11 + gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m10, gpu_qcomp m11 ) { GET_THREAD_IND(n, numThreads); @@ -211,8 +207,8 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ - cu_qcomp amp0 = amps[i0]; - cu_qcomp amp1 = amps[i1]; + gpu_qcomp amp0 = amps[i0]; + gpu_qcomp amp1 = amps[i1]; amps[i0] = m00*amp0 + m01*amp1; amps[i1] = m10*amp0 + m11*amp1; @@ -221,9 +217,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, - cu_qcomp fac0, cu_qcomp fac1 + gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); @@ -246,12 +242,12 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( template __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, - cu_qcomp m00, cu_qcomp m01, cu_qcomp m02, cu_qcomp m03, - cu_qcomp m10, cu_qcomp m11, cu_qcomp m12, cu_qcomp m13, - cu_qcomp m20, cu_qcomp m21, cu_qcomp m22, cu_qcomp m23, - cu_qcomp m30, cu_qcomp m31, cu_qcomp m32, cu_qcomp m33 + gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m02, gpu_qcomp m03, + gpu_qcomp m10, gpu_qcomp m11, gpu_qcomp m12, gpu_qcomp m13, + gpu_qcomp m20, gpu_qcomp m21, gpu_qcomp m22, gpu_qcomp m23, + gpu_qcomp m30, gpu_qcomp m31, gpu_qcomp m32, gpu_qcomp m33 ) { GET_THREAD_IND(n, numThreads); @@ -265,10 +261,10 @@ __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( qindex i11 = 
flipBit(i01, targ2); // note amps00 and amps01 are strided by 2^targ1, and amps00 and amps10 are strided by 2^targ2 - cu_qcomp amp00 = amps[i00]; - cu_qcomp amp01 = amps[i01]; - cu_qcomp amp10 = amps[i10]; - cu_qcomp amp11 = amps[i11]; + gpu_qcomp amp00 = amps[i00]; + gpu_qcomp amp01 = amps[i01]; + gpu_qcomp amp10 = amps[i10]; + gpu_qcomp amp11 = amps[i11]; // amps[i_n] = sum_j elems[n][j] amp[i_n] amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11; @@ -295,9 +291,9 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr template __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, - cu_qcomp* flatMatrElems + gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(n, numThreads); @@ -310,7 +306,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // spill to local memory). Hence, this _subA() function is not a subroutine // despite some logic being common to non-compile-time _subB(), and hence // why the loops below are explicitly compile-time unrolled - REGISTER cu_qcomp privateCache[1 << NumTargs]; + REGISTER gpu_qcomp privateCache[1 << NumTargs]; // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1) SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); @@ -335,7 +331,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs, NumTargs, k); // loop will be unrolled - amps[i] = getCuQcomp(0, 0); + amps[i] = getGpuQcomp(0, 0); // force unroll to ensure compile-time cache indices #pragma unroll @@ -349,7 +345,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( h = fast_getMatrixFlatIndex(k, l, numTargAmps); // optionally conjugate matrix elem - cu_qcomp elem = flatMatrElems[h]; + gpu_qcomp elem = flatMatrElems[h]; if constexpr (ApplyConj) 
elem.y *= -1; @@ -362,11 +358,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( template __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( - cu_qcomp* globalCache, - cu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, + gpu_qcomp* globalCache, + gpu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, int numTargBits, qindex numTargAmps, - cu_qcomp* flatMatrElems + gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(t, numThreads); @@ -398,7 +394,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs, numTargBits, k); // loop may be unrolled - amps[i] = getCuQcomp(0, 0); + amps[i] = getGpuQcomp(0, 0); for (qindex l=0; l __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int targ, - cu_qcomp m1, cu_qcomp m2 + gpu_qcomp m1, gpu_qcomp m2 ) { GET_THREAD_IND(n, numThreads); @@ -474,9 +470,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, - cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4 + gpu_qcomp m1, gpu_qcomp m2, gpu_qcomp m3, gpu_qcomp m4 ) { GET_THREAD_IND(n, numThreads); @@ -502,7 +498,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( // k = local elem index int k = getTwoBits(i, targ2, targ1); - cu_qcomp elems[] = {m1, m2, m3, m4}; + gpu_qcomp elems[] = {m1, m2, m3, m4}; amps[j] = amps[j] * elems[k]; } @@ -515,9 +511,9 @@ __global__ void 
kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, - cu_qcomp* elems, cu_qcomp exponent + gpu_qcomp* elems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -545,7 +541,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( // t = value of targeted bits, which may be in the prefix substate qindex t = getValueOfBits(i, targs, numTargBits); - cu_qcomp elem = elems[t]; + gpu_qcomp elem = elems[t]; if constexpr (HasPower) elem = getCompPower(elem, exponent); @@ -565,17 +561,17 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( template __global__ void kernel_densmatr_allTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - cu_qcomp* elems, qindex numElems, cu_qcomp exponent + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* elems, qindex numElems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); - cu_qcomp fac = getCuQcomp(1, 0); + gpu_qcomp fac = getGpuQcomp(1, 0); if constexpr (ApplyLeft) { qindex i = fast_getQuregGlobalRowFromFlatIndex(n, numElems); - cu_qcomp term = elems[i]; + gpu_qcomp term = elems[i]; if constexpr (HasPower) term = getCompPower(term, exponent); @@ -587,7 +583,7 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( qindex m = concatenateBits(rank, n, logNumAmpsPerNode); qindex j = fast_getQuregGlobalColFromFlatIndex(m, numElems); - cu_qcomp term = elems[j]; + gpu_qcomp term = elems[j]; if constexpr (HasPower) term = getCompPower(term, exponent); @@ -610,10 +606,10 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex 
numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsStateMask, int* targsXY, int numXY, qindex maskXY, qindex maskYZ, - cu_qcomp powI, cu_qcomp ampFac, cu_qcomp pairAmpFac + gpu_qcomp powI, gpu_qcomp ampFac, gpu_qcomp pairAmpFac ) { GET_THREAD_IND(t, numThreads); @@ -636,11 +632,11 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( // determine whether to multiply amps by +-1 or +-i int parA = cudaGetBitMaskParity(iA & maskYZ); int parB = cudaGetBitMaskParity(iB & maskYZ); - cu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA); - cu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB); + gpu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA); + gpu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB); - cu_qcomp ampA = amps[iA]; - cu_qcomp ampB = amps[iB]; + gpu_qcomp ampA = amps[iA]; + gpu_qcomp ampB = amps[iB]; // mix or swap scaled amp pair amps[iA] = (ampFac * ampA) + (pairAmpFac * coeffB * ampB); @@ -650,10 +646,10 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, qindex maskXY, qindex maskYZ, qindex bufferMaskXY, - cu_qcomp powI, cu_qcomp thisAmpFac, cu_qcomp otherAmpFac + gpu_qcomp powI, gpu_qcomp thisAmpFac, gpu_qcomp otherAmpFac ) { GET_THREAD_IND(n, numThreads); @@ -671,7 +667,7 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( // determine whether to multiply buffer amp by +-1 or +-i int par = cudaGetBitMaskParity(k & maskYZ); - cu_qcomp coeff = powI * fast_getPlusOrMinusOne(par); + gpu_qcomp coeff = powI * fast_getPlusOrMinusOne(par); amps[i] = (thisAmpFac * amps[i]) + (otherAmpFac * coeff * buffer[j]); } @@ -685,9 +681,9 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( template __global__ void 
kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, qindex targMask, - cu_qcomp fac0, cu_qcomp fac1 + gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); @@ -700,7 +696,7 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( // apply phase to amp depending on parity of targets in global index int p = cudaGetBitMaskParity(i & targMask); - cu_qcomp facs[] = {fac0, fac1}; + gpu_qcomp facs[] = {fac0, fac1}; amps[i] = amps[i] * facs[p]; } @@ -713,15 +709,15 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( template __global__ void kernel_statevec_setQuregToWeightedSum_sub( - cu_qcomp* outAmps, qindex numThreads, - cu_qcomp* coeffs, cu_qcomp** inAmps, int numQuregs + gpu_qcomp* outAmps, qindex numThreads, + gpu_qcomp* coeffs, gpu_qcomp** inAmps, int numQuregs ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll below loop SET_VAR_AT_COMPILE_TIME(int, numInner, NumQuregs, numQuregs); - cu_qcomp amp = getCuQcomp(0, 0); + gpu_qcomp amp = getGpuQcomp(0, 0); for (int q=0; q __global__ void kernel_densmatr_partialTrace_sub( - cu_qcomp* ampsIn, cu_qcomp* ampsOut, qindex numThreads, + gpu_qcomp* ampsIn, gpu_qcomp* ampsOut, qindex numThreads, int* ketTargs, int* pairTargs, int* allTargs, int numKetTargs ) { GET_THREAD_IND(n, numThreads); @@ -1154,7 +1150,7 @@ __global__ void kernel_densmatr_partialTrace_sub( qindex k = insertBits(n, allTargs, numAllTargs, 0); // loop may be unrolled // each outQureg amp results from summing 2^targs inQureg amps - cu_qcomp outAmp = getCuQcomp(0, 0); + gpu_qcomp outAmp = getGpuQcomp(0, 0); // loop may be unrolled for (qindex j=0; j __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( - qreal* outProbs, cu_qcomp* amps, qindex numThreads, + qreal* outProbs, gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* 
qubits, int numQubits ) { @@ -1208,7 +1204,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( template __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( - qreal* outProbs, cu_qcomp* amps, qindex numThreads, + qreal* outProbs, gpu_qcomp* amps, qindex numThreads, qindex firstDiagInd, qindex numAmpsPerCol, int rank, qindex logNumAmpsPerNode, int* qubits, int numQubits diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 5e18048f7..5c9aaa99a 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -148,7 +148,7 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubits, vector <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask ); @@ -173,7 +173,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu qindex sendInd = getSubBufferSendInd(qureg); kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, qubit1, qubit2, qubit3, bit2 ); @@ -214,7 +214,7 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); kernel_statevec_anyCtrlSwap_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 ); @@ -239,7 +239,7 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlSwap_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + 
toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask ); @@ -264,7 +264,7 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); kernel_statevec_anyCtrlSwap_subC <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask ); @@ -293,7 +293,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToCuQcomps(matr); + auto arr = unpackMatrixToGpuQcomps(matr); cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -304,10 +304,10 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); - auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); + auto [m00, m01, m10, m11] = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -333,9 +333,9 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, - toCuQcomp(fac0), toCuQcomp(fac1) + toGpuQcomp(fac0), toGpuQcomp(fac1) ); #else @@ 
-362,7 +362,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToCuQcomps(matr); + auto arr = unpackMatrixToGpuQcomps(matr); cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -374,10 +374,10 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) - auto m = unpackMatrixToCuQcomps(matr); + auto m = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15] @@ -405,7 +405,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM - auto matrElemsPtr = toCuQcomps(matr.gpuElemsFlat); + auto matrElemsPtr = toGpuQcomps(matr.gpuElemsFlat); auto matrElemsLen = matr.numRows * matr.numRows; // assert the pre-condition assumed below @@ -437,8 +437,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); // unpacking args (to better distinguish below signatures) - auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto matrPtr = toCuQcomps(matr.gpuElemsFlat); + auto ampsPtr = toGpuQcomps(qureg.gpuAmps); + auto matrPtr = toGpuQcomps(matr.gpuElemsFlat); auto qubitsPtr = getPtr(deviceQubits); auto targsPtr = getPtr(deviceTargs); auto nCtrls = ctrls.size(); @@ -506,7 +506,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve 
kernel_statevec_anyCtrlManyTargDenseMatr <<>> ( - toCuQcomps(cache), + toGpuQcomps(cache), ampsPtr, numThreads, numBatchesPerThread, qubitsPtr, nCtrls, qubitStateMask, targsPtr, targs.size(), powerOf2(targs.size()), matrPtr @@ -550,7 +550,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU .elems array directly to cuQuantum which will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, toCuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, toGpuQcomps(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -570,10 +570,10 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToCuQcomps(matr); + auto elems = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -618,7 +618,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU array directly to cuQuantum, and it will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, toCuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, toGpuQcomps(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -638,10 +638,10 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); 
- auto elems = unpackMatrixToCuQcomps(matr); + auto elems = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -686,7 +686,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // cuQuantum cannot handle HasPower, in which case we fall back to custom kernel if (!HasPower && util_areAllQubitsInSuffix(targs, qureg)) { - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, toCuQcomps(util_getGpuMemPtr(matr)), ApplyConj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, toGpuQcomps(util_getGpuMemPtr(matr)), ApplyConj); // must return to avoid re-simulation below return; @@ -709,9 +709,9 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), - toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) + toGpuQcomps(util_getGpuMemPtr(matr)), toGpuQcomp(exponent) ); // must return to avoid runtime error below @@ -743,7 +743,7 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // we always use Thrust because we are doubtful that cuQuantum's // diagonal-matrix facilities are optimised for the all-qubit case - thrust_statevec_allTargDiagMatr_sub(qureg, matr, toCuQcomp(exponent)); + thrust_statevec_allTargDiagMatr_sub(qureg, matr, toGpuQcomp(exponent)); #else error_gpuSimButGpuNotCompiled(); @@ -764,8 
+764,8 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp kernel_densmatr_allTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent) + toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + toGpuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toGpuQcomp(exponent) ); #else @@ -823,10 +823,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly qindex numBlocks = getNumBlocks(numThreads); kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, getPtr(deviceTargs), deviceTargs.size(), - maskXY, maskYZ, toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) + maskXY, maskYZ, toGpuQcomp(powI), toGpuQcomp(ampFac), toGpuQcomp(pairAmpFac) ); #else @@ -854,10 +854,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, maskXY, maskYZ, bufferMaskXY, - toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) + toGpuQcomp(powI), toGpuQcomp(ampFac), toGpuQcomp(pairAmpFac) ); #else @@ -891,9 +891,9 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c qindex targMask = util_getBitMask(targs); kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedCtrls), ctrls.size(), 
ctrlStateMask, targMask, - toCuQcomp(fac0), toCuQcomp(fac1) + toGpuQcomp(fac0), toGpuQcomp(fac1) ); #else @@ -920,17 +920,17 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs qindex numBlocks = getNumBlocks(numThreads); // extract amp ptrs from qureg list - vector ptrs; + vector ptrs; ptrs.reserve(inQuregs.size()); for (auto& qureg : inQuregs) - ptrs.push_back(toCuQcomps(qureg.gpuAmps)); + ptrs.push_back(toGpuQcomps(qureg.gpuAmps)); // copy coeff and qureg lists into GPU memory - devcuqcompptrs devQuregAmps = ptrs; + devgpuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; kernel_statevec_setQuregToWeightedSum_sub <<>> ( - toCuQcomps(outQureg.gpuAmps), numThreads, + toGpuQcomps(outQureg.gpuAmps), numThreads, getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size() ); @@ -960,7 +960,7 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subB <<>> ( - outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps), + outProb, toGpuQcomps(outQureg.gpuAmps), inProb, toGpuQcomps(inQureg.gpuAmps), numThreads, inQureg.numAmps ); @@ -978,7 +978,7 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subC <<>> ( - outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer), + outProb, toGpuQcomps(outQureg.gpuAmps), inProb, toGpuQcomps(outQureg.gpuCommBuffer), numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode ); @@ -1013,7 +1013,7 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) int braQubit = util_getBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac + toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac ); #else @@ -1039,7 +1039,7 @@ void 
gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob) int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac + toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac ); #else @@ -1085,7 +1085,7 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi int braQubitB = util_getBraQubit(ketQubitB, qureg); kernel_densmatr_twoQubitDephasing_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols + toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols ketQubitA, ketQubitB, braQubitA, braQubitB, term ); @@ -1112,7 +1112,7 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 + toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 ); #else @@ -1133,7 +1133,7 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3 ); @@ -1161,7 +1161,7 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, c3 ); @@ -1186,7 +1186,7 @@ void 
gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, qreal altc1 = factors.c1 - factors.c2; kernel_densmatr_twoQubitDepolarising_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2 ); @@ -1208,7 +1208,7 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subC <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3 ); @@ -1231,7 +1231,7 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, auto factors = util_getTwoQubitDepolarisingFactors(prob); kernel_densmatr_twoQubitDepolarising_subD <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[offset], numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); @@ -1256,7 +1256,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal fac1 = factors.c1 - fac0; kernel_densmatr_twoQubitDepolarising_subE <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1 ); @@ -1279,7 +1279,7 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2; kernel_densmatr_twoQubitDepolarising_subF <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[offset], numThreads, ketQb1, ketQb2, braBit1, braBit2, c2 ); @@ -1306,7 +1306,7 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); 
kernel_densmatr_oneQubitPauliChannel_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, + toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1328,7 +1328,7 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); kernel_densmatr_oneQubitPauliChannel_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1355,7 +1355,7 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { auto factors = util_getOneQubitDampingFactors(prob); kernel_densmatr_oneQubitDamping_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, prob, factors.c1, factors.c2 ); @@ -1375,7 +1375,7 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { auto c2 = util_getOneQubitDampingFactors(prob).c2; kernel_densmatr_oneQubitDamping_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qubit, c2 + toGpuQcomps(qureg.gpuAmps), numThreads, qubit, c2 ); #else @@ -1395,7 +1395,7 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { auto c1 = util_getOneQubitDampingFactors(prob).c1; kernel_densmatr_oneQubitDamping_subC <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 + toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 ); #else @@ -1413,7 +1413,7 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { qindex recvInd = getBufferRecvInd(); kernel_densmatr_oneQubitDamping_subD <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], 
numThreads, qubit, prob ); @@ -1444,7 +1444,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta devints devAllTargs = util_getSorted(targs, pairTargs); kernel_densmatr_partialTrace_sub <<>> ( - toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads, + toGpuQcomps(inQureg.gpuAmps), toGpuQcomps(outQureg.gpuAmps), numThreads, getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() ); @@ -1564,7 +1564,7 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, + getPtr(devProbs), toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() ); @@ -1601,7 +1601,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toCuQcomps(qureg.gpuAmps), + getPtr(devProbs), toGpuQcomps(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() @@ -1633,7 +1633,7 @@ qcomp gpu_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); + gpu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); return toQcomp(prod); #else @@ -1661,7 +1661,7 @@ qcomp gpu_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); + gpu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); return toQcomp(fid); #else @@ -1702,7 +1702,7 @@ qcomp 
gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); + gpu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); return toQcomp(value); #else @@ -1720,7 +1720,7 @@ qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); + gpu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); return toQcomp(value); #else @@ -1770,8 +1770,8 @@ qcomp gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp expo = toCuQcomp(exponent); - cu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); + gpu_qcomp expo = toGpuQcomp(exponent); + gpu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); return toQcomp(value); #else @@ -1787,8 +1787,8 @@ qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp expo = toCuQcomp(exponent); - cu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); + gpu_qcomp expo = toGpuQcomp(exponent); + gpu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); return toQcomp(value); #else @@ -1866,7 +1866,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector void gpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { #if COMPILE_CUDA || COMPILE_CUQUANTUM - thrust_statevec_initUniformState(qureg, toCuQcomp(amp)); + thrust_statevec_initUniformState(qureg, toGpuQcomp(amp)); #else error_gpuSimButGpuNotCompiled(); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 3b653dfd1..8bcc5e58d 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ 
-39,12 +39,8 @@ #include "quest/src/core/constants.hpp" #include "quest/src/core/utilities.hpp" #include "quest/src/core/randomiser.hpp" -#include "quest/src/comm/comm_config.hpp" - -// kernels/thrust must use cu_qcomp, never qcomp -#define USE_CU_QCOMP #include "quest/src/core/fastmath.hpp" -#undef USE_CU_QCOMP +#include "quest/src/comm/comm_config.hpp" #include #include @@ -68,7 +64,7 @@ * copy constructor (devicevec d_vec = hostvec). The pointer * to the data (d_vec.data()) can be cast into a raw pointer * and passed directly to CUDA kernels (though qcomp must be - * reinterpreted to cu_qcomp) + * reinterpreted to gpu_qcomp) */ @@ -110,18 +106,18 @@ devreals getDeviceRealsVec(qindex dim) { using devcomps = thrust::device_vector; -cu_qcomp* getPtr(devcomps& comps) { +gpu_qcomp* getPtr(devcomps& comps) { - // devcomps -> qcomp -> cu_qcomp + // devcomps -> qcomp -> gpu_qcomp qcomp* ptr = thrust::raw_pointer_cast(comps.data()); - return toCuQcomps(ptr); + return toGpuQcomps(ptr); } // father forgive me for I have sinned -using devcuqcompptrs = thrust::device_vector; +using devgpuqcompptrs = thrust::device_vector; -cu_qcomp** getPtr(devcuqcompptrs& ptrs) { +gpu_qcomp** getPtr(devgpuqcompptrs& ptrs) { return thrust::raw_pointer_cast(ptrs.data()); } @@ -136,13 +132,13 @@ cu_qcomp** getPtr(devcuqcompptrs& ptrs) { */ -thrust::device_ptr getStartPtr(cu_qcomp* amps) { +thrust::device_ptr getStartPtr(gpu_qcomp* amps) { return thrust::device_pointer_cast(amps); } auto getStartPtr(qcomp* amps) { - return getStartPtr(toCuQcomps(amps)); + return getStartPtr(toGpuQcomps(amps)); } @@ -177,21 +173,21 @@ auto getEndPtr(FullStateDiagMatr matr) { struct functor_getAmpConj { - __host__ __device__ cu_qcomp operator()(cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp amp) { return getCompConj(amp); } }; struct functor_getAmpNorm { - __host__ __device__ qreal operator()(cu_qcomp amp) { + __host__ __device__ qreal operator()(gpu_qcomp amp) { return getCompNorm(amp); } 
}; struct functor_getAmpReal { - __host__ __device__ qreal operator()(cu_qcomp amp) { + __host__ __device__ qreal operator()(gpu_qcomp amp) { return getCompReal(amp); } }; @@ -199,14 +195,14 @@ struct functor_getAmpReal { struct functor_getAmpConjProd { - __host__ __device__ cu_qcomp operator()(cu_qcomp braAmp, cu_qcomp ketAmp) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp braAmp, gpu_qcomp ketAmp) { return getCompConj(braAmp) * ketAmp; } }; struct functor_getNormOfAmpDif { - __host__ __device__ qreal operator()(cu_qcomp amp1, cu_qcomp amp2) { + __host__ __device__ qreal operator()(gpu_qcomp amp1, gpu_qcomp amp2) { return getCompNorm(amp1 - amp2); } }; @@ -220,7 +216,7 @@ struct functor_getExpecStateVecZTerm { qindex targMask; functor_getExpecStateVecZTerm(qindex mask) : targMask(mask) {} - __device__ qreal operator()(qindex ind, cu_qcomp amp) { + __device__ qreal operator()(qindex ind, gpu_qcomp amp) { int par = cudaGetBitMaskParity(ind & targMask); // device-only int sign = fast_getPlusOrMinusOne(par); @@ -235,12 +231,12 @@ struct functor_getExpecDensMatrZTerm { // in the expectation value of Z of a density matrix qindex numAmpsPerCol, firstDiagInd, targMask; - cu_qcomp* amps; + gpu_qcomp* amps; - functor_getExpecDensMatrZTerm(qindex dim, qindex diagInd, qindex mask, cu_qcomp* _amps) : + functor_getExpecDensMatrZTerm(qindex dim, qindex diagInd, qindex mask, gpu_qcomp* _amps) : numAmpsPerCol(dim), firstDiagInd(diagInd), targMask(mask), amps(_amps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); qindex r = n + firstDiagInd; @@ -259,12 +255,12 @@ struct functor_getExpecStateVecPauliTerm { // at least one X or Y) of a statevector qindex maskXY, maskYZ; - cu_qcomp *amps, *pairAmps; + gpu_qcomp *amps, *pairAmps; - functor_getExpecStateVecPauliTerm(qindex _maskXY, qindex _maskYZ, cu_qcomp* _amps, cu_qcomp* _pairAmps) : + 
functor_getExpecStateVecPauliTerm(qindex _maskXY, qindex _maskYZ, gpu_qcomp* _amps, gpu_qcomp* _pairAmps) : maskXY(_maskXY), maskYZ(_maskYZ), amps(_amps), pairAmps(_pairAmps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex j = flipBits(n, maskXY); int par = cudaGetBitMaskParity(j & maskYZ); // device-only @@ -284,12 +280,12 @@ struct functor_getExpecDensMatrPauliTerm { qindex maskXY, maskYZ; qindex numAmpsPerCol, firstDiagInd; - cu_qcomp *amps; + gpu_qcomp *amps; - functor_getExpecDensMatrPauliTerm(qindex _maskXY, qindex _maskYZ, qindex _numAmpsPerCol, qindex _firstDiagInd, cu_qcomp* _amps) : + functor_getExpecDensMatrPauliTerm(qindex _maskXY, qindex _maskYZ, qindex _numAmpsPerCol, qindex _firstDiagInd, gpu_qcomp* _amps) : maskXY(_maskXY), maskYZ(_maskYZ), numAmpsPerCol(_numAmpsPerCol), firstDiagInd(_firstDiagInd), amps(_amps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex r = n + firstDiagInd; qindex i = flipBits(r, maskXY); @@ -310,19 +306,19 @@ struct functor_getExpecDensMatrDiagMatrTerm { // value of a FullStateDiagMatr upon a density matrix qindex numAmpsPerCol, firstDiagInd; - cu_qcomp *amps, *elems, expo; + gpu_qcomp *amps, *elems, expo; - functor_getExpecDensMatrDiagMatrTerm(qindex dim, qindex diagInd, cu_qcomp* _amps, cu_qcomp* _elems, cu_qcomp _expo) : + functor_getExpecDensMatrDiagMatrTerm(qindex dim, qindex diagInd, gpu_qcomp* _amps, gpu_qcomp* _elems, gpu_qcomp _expo) : numAmpsPerCol(dim), firstDiagInd(diagInd), amps(_amps), elems(_elems), expo(_expo) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { - cu_qcomp elem = elems[n]; + gpu_qcomp elem = elems[n]; if constexpr (HasPower && ! 
UseRealPow) elem = getCompPower(elem, expo); if constexpr (HasPower && UseRealPow) - elem = getCuQcomp(pow(getCompReal(elem), getCompReal(expo)),0); // CUDA pow(qreal,qreal) + elem = getGpuQcomp(pow(getCompReal(elem), getCompReal(expo)),0); // CUDA pow(qreal,qreal) qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); @@ -339,13 +335,13 @@ struct functor_setAmpToPauliStrSumElem { qindex suffixLen; qindex numTerms; - cu_qcomp* amps; - cu_qcomp* coeffs; + gpu_qcomp* amps; + gpu_qcomp* coeffs; PauliStr* strings; functor_setAmpToPauliStrSumElem( int rank, qindex dim, qindex suffixLen, qindex numTerms, - cu_qcomp* amps, cu_qcomp* coeffs, PauliStr* strings + gpu_qcomp* amps, gpu_qcomp* coeffs, PauliStr* strings ) : rank(rank), dim(dim), suffixLen(suffixLen), numTerms(numTerms), amps(amps), coeffs(coeffs), strings(strings) @@ -381,7 +377,7 @@ struct functor_mixAmps { qreal outProb, inProb; functor_mixAmps(qreal out, qreal in) : outProb(out), inProb(in) {} - __host__ __device__ cu_qcomp operator()(cu_qcomp outAmp, cu_qcomp inAmp) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp outAmp, gpu_qcomp inAmp) { return (outProb * outAmp) + (inProb * inAmp); } @@ -397,18 +393,18 @@ struct functor_multiplyElemPowerWithAmpOrNorm { // a statevector amp (used when modifying the state) // or its norm (used when calculating expected values) - cu_qcomp exponent; - functor_multiplyElemPowerWithAmpOrNorm(cu_qcomp power) : exponent(power) {} + gpu_qcomp exponent; + functor_multiplyElemPowerWithAmpOrNorm(gpu_qcomp power) : exponent(power) {} - __host__ __device__ cu_qcomp operator()(cu_qcomp quregAmp, cu_qcomp matrElem) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp quregAmp, gpu_qcomp matrElem) { if constexpr (HasPower && ! 
UseRealPow) matrElem = getCompPower(matrElem, exponent); if constexpr (HasPower && UseRealPow) - matrElem = getCuQcomp(pow(getCompReal(matrElem), getCompReal(exponent)),0); // CUDA pow(qreal,qreal) + matrElem = getGpuQcomp(pow(getCompReal(matrElem), getCompReal(exponent)),0); // CUDA pow(qreal,qreal) if constexpr (Norm) - quregAmp = getCuQcomp(getCompNorm(quregAmp), 0); + quregAmp = getGpuQcomp(getCompNorm(quregAmp), 0); return matrElem * quregAmp; } @@ -466,15 +462,15 @@ template struct functor_getFidelityTerm { int rank, numQubits; qindex logNumAmpsPerNode, numAmpsPerCol; - cu_qcomp *rho, *psi; + gpu_qcomp *rho, *psi; functor_getFidelityTerm( - int _rank, int _numQubits, qindex _logNumAmpsPerNode, qindex _numAmpsPerCol, cu_qcomp* _rho, cu_qcomp* _psi + int _rank, int _numQubits, qindex _logNumAmpsPerNode, qindex _numAmpsPerCol, gpu_qcomp* _rho, gpu_qcomp* _psi ) : rank(_rank), numQubits(_numQubits), logNumAmpsPerNode(_logNumAmpsPerNode), numAmpsPerCol(_numAmpsPerCol), rho(_rho), psi(_psi) {} - __host__ __device__ cu_qcomp operator()(qindex n) { + __host__ __device__ gpu_qcomp operator()(qindex n) { // i = global index of nth local amp of rho qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -484,9 +480,9 @@ struct functor_getFidelityTerm { qindex c = getBitsLeftOfIndex(i, numQubits-1); // collect amps involved in this term - cu_qcomp rhoAmp = rho[n]; - cu_qcomp rowAmp = psi[r]; - cu_qcomp colAmp = psi[c]; + gpu_qcomp rhoAmp = rho[n]; + gpu_qcomp rowAmp = psi[r]; + gpu_qcomp colAmp = psi[c]; // compute term of or if constexpr (Conj) { @@ -495,7 +491,7 @@ struct functor_getFidelityTerm { } else rowAmp = getCompConj(rowAmp); - cu_qcomp fid = rhoAmp * rowAmp * colAmp; + gpu_qcomp fid = rhoAmp * rowAmp * colAmp; return fid; } }; @@ -525,7 +521,7 @@ struct functor_projectStateVec { assert_numTargsMatchesTemplateParam(numTargets, NumTargets); } - __host__ __device__ cu_qcomp operator()(qindex n, cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(qindex 
n, gpu_qcomp amp) { // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); @@ -562,7 +558,7 @@ struct functor_projectDensMatr { assert_numTargsMatchesTemplateParam(numTargets, NumTargets); } - __host__ __device__ cu_qcomp operator()(qindex n, cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); @@ -603,7 +599,7 @@ struct functor_setRandomStateVecAmp { unsigned baseSeed; functor_setRandomStateVecAmp(unsigned seed) : baseSeed(seed) {} - __host__ __device__ cu_qcomp operator()(qindex ampInd) { + __host__ __device__ gpu_qcomp operator()(qindex ampInd) { // wastefully create new distributions for every amp thrust::random::normal_distribution normDist(0, 1); // mean=0, var=1 @@ -634,8 +630,8 @@ struct functor_setRandomStateVecAmp { auto iphase = thrust::complex(0, phase); auto amp = sqrt(prob) * thrust::exp(iphase); // CUDA sqrt - // cast thrust::complex to cu_qcomp - return getCuQcomp(amp.real(), amp.imag()); + // cast thrust::complex to gpu_qcomp + return getGpuQcomp(amp.real(), amp.imag()); } }; @@ -654,7 +650,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli thrust::device_vector devStrings(in.strings, in.strings + in.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - cu_qcomp* devCoeffsPtr = toCuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = toGpuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); int rank = out.isDistributed? 
comm_getRank() : 0; @@ -663,7 +659,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli // indicates the PauliStrSum is diagonal (contains only I or Z) auto functor = functor_setAmpToPauliStrSumElem( rank, out.numElems, logNumElemsPerNode, - in.numTerms, toCuQcomps(out.gpuElems), devCoeffsPtr, devStringsPtr); + in.numTerms, toGpuQcomps(out.gpuElems), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + out.numElemsPerNode; @@ -677,7 +673,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli */ -void thrust_setElemsToConjugate(cu_qcomp* matrElemsPtr, qindex matrElemsLen) { +void thrust_setElemsToConjugate(gpu_qcomp* matrElemsPtr, qindex matrElemsLen) { auto ptr = getStartPtr(matrElemsPtr); thrust::transform(ptr, ptr + matrElemsLen, ptr, functor_getAmpConj()); @@ -700,13 +696,13 @@ void thrust_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) { thrust::device_vector devStrings(sum.strings, sum.strings + sum.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - cu_qcomp* devCoeffsPtr = toCuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = toGpuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); // indicates the PauliStrSum is not diagonal (contains X or Y) auto functor = functor_setAmpToPauliStrSumElem( qureg.rank, powerOf2(qureg.numQubits), qureg.logNumAmpsPerNode, - sum.numTerms, toCuQcomps(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); + sum.numTerms, toGpuQcomps(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; @@ -724,7 +720,7 @@ void thrust_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, template -void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, 
cu_qcomp exponent) { +void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp exponent) { thrust::transform( getStartPtr(qureg), getEndPtr(qureg), @@ -832,13 +828,13 @@ qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector q */ -cu_qcomp thrust_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { +gpu_qcomp thrust_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); - cu_qcomp prod = thrust::inner_product( + gpu_qcomp prod = thrust::inner_product( getStartPtr(quregA), getEndPtr(quregA), getStartPtr(quregB), - init, thrust::plus(), functor_getAmpConjProd()); + init, thrust::plus(), functor_getAmpConjProd()); return prod; } @@ -857,20 +853,20 @@ qreal thrust_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB) template -cu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { +gpu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { - // functor accepts an index and produces a cu_qcomp + // functor accepts an index and produces a gpu_qcomp auto functor = functor_getFidelityTerm( rho.rank, rho.numQubits, rho.logNumAmpsPerNode, - psi.numAmps, toCuQcomps(rho.gpuAmps), toCuQcomps(psi.gpuAmps)); + psi.numAmps, toGpuQcomps(rho.gpuAmps), toGpuQcomps(psi.gpuAmps)); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); qindex numIts = rho.numAmpsPerNode; - cu_qcomp init = getCuQcomp(0, 0); - cu_qcomp fid = thrust::transform_reduce( + gpu_qcomp init = getGpuQcomp(0, 0); + gpu_qcomp fid = thrust::transform_reduce( indIter, indIter + numIts, - functor, init, thrust::plus()); + functor, init, thrust::plus()); return fid; } @@ -897,71 +893,71 @@ qreal thrust_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { } -cu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { +gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { qindex 
dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex mask = util_getBitMask(targs); - auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, toCuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, toGpuQcomps(qureg.gpuAmps)); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); } -cu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector y, vector z) { qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - auto ampsPtr = toCuQcomps(qureg.gpuAmps); + auto ampsPtr = toGpuQcomps(qureg.gpuAmps); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, ampsPtr); // amps=pairAmps - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * toGpuQcomp(util_getPowerOfI(y.size())); } -cu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vector y, vector z) { qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - auto ampsPtr = 
toCuQcomps(qureg.gpuAmps); - auto buffPtr = toCuQcomps(qureg.gpuCommBuffer); + auto ampsPtr = toGpuQcomps(qureg.gpuAmps); + auto buffPtr = toGpuQcomps(qureg.gpuCommBuffer); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, buffPtr); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * toGpuQcomp(util_getPowerOfI(y.size())); } -cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector y, vector z) { qindex mXY = util_getBitMask(util_getConcatenated(x, y)); qindex mYZ = util_getBitMask(util_getConcatenated(y, z)); qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, toCuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, toGpuQcomps(qureg.gpuAmps)); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * toGpuQcomp(util_getPowerOfI(y.size())); } @@ -972,33 +968,33 @@ cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vecto template -cu_qcomp 
thrust_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp expo) { +gpu_qcomp thrust_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp expo) { - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto functor = functor_multiplyElemPowerWithAmpOrNorm(expo); - cu_qcomp value = thrust::inner_product( + gpu_qcomp value = thrust::inner_product( getStartPtr(qureg), getEndPtr(qureg), getStartPtr(matr), - init, thrust::plus(), functor); + init, thrust::plus(), functor); return value; } template -cu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp expo) { +gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp expo) { qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto elemsPtr = toCuQcomps(matr.gpuElems); + auto ampsPtr = toGpuQcomps(qureg.gpuAmps); + auto elemsPtr = toGpuQcomps(matr.gpuElems); auto functor = functor_getExpecDensMatrDiagMatrTerm(dim, ind, ampsPtr, elemsPtr, expo); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); } @@ -1047,7 +1043,7 @@ void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, ve */ -void thrust_statevec_initUniformState(Qureg qureg, cu_qcomp amp) { +void thrust_statevec_initUniformState(Qureg qureg, gpu_qcomp amp) { thrust::fill(getStartPtr(qureg), getEndPtr(qureg), amp); } @@ -1057,11 +1053,11 @@ void thrust_statevec_initDebugState_sub(Qureg qureg) { // globally, |n> gains coefficient 2n/10 + i(2n+1)/10, // which is a step-size of 
2/10 + i(2/10)... - cu_qcomp step = getCuQcomp(2/10., 2/10.); + gpu_qcomp step = getGpuQcomp(2/10., 2/10.); // and each node begins from a unique n (if distributed) qindex n = util_getGlobalIndexOfFirstLocalAmp(qureg); - cu_qcomp init = getCuQcomp(2*n/10., (2*n+1)/10.); + gpu_qcomp init = getGpuQcomp(2*n/10., (2*n+1)/10.); thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step); } diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh index a934ecef6..9ece0bcc6 100644 --- a/quest/src/gpu/gpu_types.cuh +++ b/quest/src/gpu/gpu_types.cuh @@ -36,7 +36,7 @@ /* - * CUDA-COMPATIBLE QCOMP ALIAS (cu_qcomp) + * CUDA-COMPATIBLE QCOMP ALIAS (gpu_qcomp) * * which we opt to use over a Thrust complex type to gaurantee * compatibility with cuQuantum, though this irritatingly @@ -45,10 +45,10 @@ #if (FLOAT_PRECISION == 1) - typedef cuFloatComplex cu_qcomp; + typedef cuFloatComplex gpu_qcomp; #elif (FLOAT_PRECISION == 2) - typedef cuDoubleComplex cu_qcomp; + typedef cuDoubleComplex gpu_qcomp; #else #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." 
@@ -58,11 +58,11 @@ /* - * TRANSFORMING qcomp AND cu_qcomp + * TRANSFORMING qcomp AND gpu_qcomp */ -INLINE cu_qcomp getCuQcomp(qreal re, qreal im) { +INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { #if (FLOAT_PRECISION == 1) return make_cuFloatComplex(re, im); @@ -72,61 +72,61 @@ INLINE cu_qcomp getCuQcomp(qreal re, qreal im) { } -__host__ inline cu_qcomp toCuQcomp(qcomp a) { - return getCuQcomp(std::real(a), std::imag(a)); +__host__ inline gpu_qcomp toGpuQcomp(qcomp a) { + return getGpuQcomp(std::real(a), std::imag(a)); } -__host__ inline qcomp toQcomp(cu_qcomp a) { +__host__ inline qcomp toQcomp(gpu_qcomp a) { return getQcomp(a.x, a.y); } -__host__ inline cu_qcomp* toCuQcomps(qcomp* a) { +__host__ inline gpu_qcomp* toGpuQcomps(qcomp* a) { - // reinterpret a qcomp ptr as a cu_qcomp ptr, - // which is ONLY SAFE when comp and cu_qcomp + // reinterpret a qcomp ptr as a gpu_qcomp ptr, + // which is ONLY SAFE when comp and gpu_qcomp // have identical memory layouts. Be very // careful; HIP stack arrays (e.g. qcomp[]) // seg-fault when passed here, so this funciton // should only ever be used on malloc'd data! // Stack objects should use the below unpacks. 
- return reinterpret_cast(a); + return reinterpret_cast(a); } -__host__ inline std::array unpackMatrixToCuQcomps(DiagMatr1 in) { +__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr1 in) { // it's crucial we explicitly copy over the elements, // rather than just reinterpret the pointer, to avoid // segmentation faults when memory misaligns (like on HIP) - return {toCuQcomp(in.elems[0]), toCuQcomp(in.elems[1])}; + return {toGpuQcomp(in.elems[0]), toGpuQcomp(in.elems[1])}; } -__host__ inline std::array unpackMatrixToCuQcomps(DiagMatr2 in) { +__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr2 in) { return { - toCuQcomp(in.elems[0]), toCuQcomp(in.elems[1]), - toCuQcomp(in.elems[2]), toCuQcomp(in.elems[3])}; + toGpuQcomp(in.elems[0]), toGpuQcomp(in.elems[1]), + toGpuQcomp(in.elems[2]), toGpuQcomp(in.elems[3])}; } -__host__ inline std::array unpackMatrixToCuQcomps(CompMatr1 in) { +__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr1 in) { - std::array out{}; + std::array out{}; for (int i=0; i<4; i++) - out[i] = toCuQcomp(in.elems[i/2][i%2]); + out[i] = toGpuQcomp(in.elems[i/2][i%2]); return out; } -__host__ inline std::array unpackMatrixToCuQcomps(CompMatr2 in) { +__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr2 in) { - std::array out{}; + std::array out{}; for (int i=0; i<16; i++) - out[i] = toCuQcomp(in.elems[i/4][i%4]); + out[i] = toGpuQcomp(in.elems[i/4][i%4]); return out; } @@ -135,7 +135,7 @@ __host__ inline std::array unpackMatrixToCuQcomps(CompMatr2 in) { /* - * cu_qcomp ARITHMETIC OVERLOADS + * gpu_qcomp ARITHMETIC OVERLOADS * * which are only needed by NVCC because * HIP defines them for us. This good deed @@ -147,30 +147,30 @@ __host__ inline std::array unpackMatrixToCuQcomps(CompMatr2 in) { /// @todo /// - clean this up (with templates?) 
-/// - use getCuQcomp() rather than struct creation, +/// - use getGpuQcomp() rather than struct creation, /// to make the algebra implementation-agnostic #if defined(__NVCC__) -INLINE cu_qcomp operator + (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { +INLINE gpu_qcomp operator + (const gpu_qcomp& a, const gpu_qcomp& b) { + gpu_qcomp out = { .x = a.x + b.x, .y = a.y + b.y }; return out; } -INLINE cu_qcomp operator - (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { +INLINE gpu_qcomp operator - (const gpu_qcomp& a, const gpu_qcomp& b) { + gpu_qcomp out = { .x = a.x - b.x, .y = a.y - b.y }; return out; } -INLINE cu_qcomp operator * (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { +INLINE gpu_qcomp operator * (const gpu_qcomp& a, const gpu_qcomp& b) { + gpu_qcomp out = { .x = a.x * b.x - a.y * b.y, .y = a.x * b.y + a.y * b.x }; @@ -178,45 +178,45 @@ INLINE cu_qcomp operator * (const cu_qcomp& a, const cu_qcomp& b) { } -INLINE cu_qcomp operator + (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { +INLINE gpu_qcomp operator + (const gpu_qcomp& a, const qreal& b) { + gpu_qcomp out = { .x = a.x + b, .y = a.y + b }; return out; } -INLINE cu_qcomp operator + (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { +INLINE gpu_qcomp operator + (const qreal& b, const gpu_qcomp& a) { + gpu_qcomp out = { .x = a.x + b, .y = a.y + b }; return out; } -INLINE cu_qcomp operator - (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { +INLINE gpu_qcomp operator - (const gpu_qcomp& a, const qreal& b) { + gpu_qcomp out = { .x = a.x - b, .y = a.y - b }; return out; } -INLINE cu_qcomp operator - (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { +INLINE gpu_qcomp operator - (const qreal& b, const gpu_qcomp& a) { + gpu_qcomp out = { .x = a.x - b, .y = a.y - b }; return out; } -INLINE cu_qcomp operator * (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { +INLINE gpu_qcomp operator * (const gpu_qcomp& a, const qreal& b) { + gpu_qcomp out 
= { .x = a.x * b, .y = a.y * b }; return out; } -INLINE cu_qcomp operator * (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { +INLINE gpu_qcomp operator * (const qreal& b, const gpu_qcomp& a) { + gpu_qcomp out = { .x = a.x * b, .y = a.y * b }; @@ -228,24 +228,24 @@ INLINE cu_qcomp operator * (const qreal& b, const cu_qcomp& a) { /* - * cu_qcomp UNARY FUNCTIONS + * gpu_qcomp UNARY FUNCTIONS */ -INLINE qreal getCompReal(cu_qcomp num) { +INLINE qreal getCompReal(gpu_qcomp num) { return num.x; } -INLINE cu_qcomp getCompConj(cu_qcomp num) { +INLINE gpu_qcomp getCompConj(gpu_qcomp num) { num.y *= -1; return num; } -INLINE qreal getCompNorm(cu_qcomp num) { +INLINE qreal getCompNorm(gpu_qcomp num) { return (num.x * num.x) + (num.y * num.y); } -INLINE cu_qcomp getCompPower(cu_qcomp base, cu_qcomp exponent) { +INLINE gpu_qcomp getCompPower(gpu_qcomp base, gpu_qcomp exponent) { // using https://mathworld.wolfram.com/ComplexExponentiation.html, // and the principal argument of 'base' @@ -266,7 +266,7 @@ INLINE cu_qcomp getCompPower(cu_qcomp base, cu_qcomp exponent) { // output scalar qreal re = fac * cos(ang); qreal im = fac * sin(ang); - return getCuQcomp(re, im); + return getGpuQcomp(re, im); } From 7353cbfd009cea5700d20c9dda4fcb49a7da258a Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 17 Apr 2026 17:04:39 -0400 Subject: [PATCH 03/16] Restoring gpu_qcomp in-place operators --- quest/src/gpu/gpu_kernels.cuh | 76 +++++++++++++++++------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index dedfb9c88..b7f8df424 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -347,10 +347,10 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // optionally conjugate matrix elem gpu_qcomp elem = flatMatrElems[h]; if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); // thread-private cache is accessed with compile-time known index - amps[i] = 
amps[i] + (elem * privateCache[l]); + amps[i] += elem * privateCache[l]; } } } @@ -409,9 +409,9 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( gpu_qcomp elem = flatMatrElems[h]; if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); - amps[i] = amps[i] + (elem * globalCache[j]); + amps[i] += elem * globalCache[j]; /// @todo /// qureg.cpuAmps[i] is being serially updated by only this thread, @@ -458,7 +458,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( qindex i = concatenateBits(rank, j, logNumAmpsPerNode); int b = getBit(i, targ); - amps[j] = amps[j] * (m1 + b * (m2 - m1)); + amps[j] *= m1 + b * (m2 - m1); } @@ -499,7 +499,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( // k = local elem index int k = getTwoBits(i, targ2, targ1); gpu_qcomp elems[] = {m1, m2, m3, m4}; - amps[j] = amps[j] * elems[k]; + amps[j] *= elems[k]; } @@ -544,12 +544,12 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( gpu_qcomp elem = elems[t]; if constexpr (HasPower) - elem = getCompPower(elem, exponent); + elem = pow(elem, exponent); if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); - amps[j] = amps[j] * elem; + amps[j] *= elem; } @@ -574,7 +574,7 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( gpu_qcomp term = elems[i]; if constexpr (HasPower) - term = getCompPower(term, exponent); + term = pow(term, exponent); fac = term; } @@ -586,15 +586,15 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( gpu_qcomp term = elems[j]; if constexpr (HasPower) - term = getCompPower(term, exponent); + term = pow(term, exponent); if constexpr (ConjRight) - term.y *= -1; + term = conj(term); - fac = fac * term; + fac *= term; } - amps[n] = amps[n] * fac; + amps[n] *= fac; } @@ -697,7 +697,7 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( int p = cudaGetBitMaskParity(i & targMask); gpu_qcomp facs[] = {fac0, fac1}; - amps[i] = amps[i] * facs[p]; + amps[i] *= facs[p]; } @@ -720,7 +720,7 @@ 
__global__ void kernel_statevec_setQuregToWeightedSum_sub( gpu_qcomp amp = getGpuQcomp(0, 0); for (int q=0; q Date: Fri, 17 Apr 2026 18:09:03 -0400 Subject: [PATCH 04/16] changed gpu_qcomp getter names to match CPU and changed &arr[ind] to arr + ind, for visual clarity --- quest/src/gpu/gpu_cuquantum.cuh | 16 ++-- quest/src/gpu/gpu_subroutines.cpp | 130 +++++++++++++++--------------- quest/src/gpu/gpu_thrust.cuh | 34 ++++---- 3 files changed, 90 insertions(+), 90 deletions(-) diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh index f8343154a..261b29875 100644 --- a/quest/src/gpu/gpu_cuquantum.cuh +++ b/quest/src/gpu/gpu_cuquantum.cuh @@ -182,7 +182,7 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< CUDA_CHECK( custatevecSwapIndexBits( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, targPairs, numTargPairs, // swap mask params seem to be in the reverse order to the remainder of the cuStateVec API @@ -210,7 +210,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector CUDA_CHECK( custatevecApplyMatrix( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, flatMatrElems, CUQUANTUM_QCOMP, CUSTATEVEC_MATRIX_LAYOUT_ROW, applyAdj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -239,7 +239,7 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl CUDA_CHECK( custatevecApplyGeneralizedPermutationMatrix( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, perm, flatMatrElems, CUQUANTUM_QCOMP, adj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -331,7 +331,7 @@ qreal 
cuquantum_statevec_calcTotalProb_sub(Qureg qureg) { CUDA_CHECK( custatevecAbs2SumOnZBasis( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &prob0, &prob1, &qubit, numQubits ) ); qreal total = prob0 + prob1; @@ -346,7 +346,7 @@ qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector(prob); @@ -367,7 +367,7 @@ void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qu CUDA_CHECK( custatevecAbs2SumArray( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outPtr, qubits.data(), qubits.size(), nullptr, nullptr, 0) ); // serially cast and copy output probabilities, if necessary @@ -409,7 +409,7 @@ qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vect CUDA_CHECK( custatevecComputeExpectationsOnPauliBasis( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &value, termPaulis, numTerms, termTargets, numPaulisPerTerm) ); return static_cast(value); @@ -432,7 +432,7 @@ void cuquantum_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, CUDA_CHECK( custatevecCollapseByBitString( config.handle, - toGpuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outcomes.data(), qubits.data(), qubits.size(), prob) ); } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 5c9aaa99a..50a5b66a0 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -86,7 +86,7 @@ qcomp gpu_statevec_getAmp_sub(Qureg qureg, qindex ind) { qcomp amp; // compiler guards harmlessly duplicated therein - gpu_copyGpuToCpu(&qureg.gpuAmps[ind], &, 1); + 
 gpu_copyGpuToCpu(qureg.gpuAmps + ind, &amp, 1); return amp; @@ -148,7 +148,7 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubits, vector <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask ); @@ -173,7 +173,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu qindex sendInd = getSubBufferSendInd(qureg); kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, qubit1, qubit2, qubit3, bit2 ); @@ -214,7 +214,7 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); kernel_statevec_anyCtrlSwap_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 ); @@ -239,7 +239,7 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlSwap_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask ); @@ -264,7 +264,7 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); kernel_statevec_anyCtrlSwap_subC <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedQubits), 
ctrls.size(), qubitStateMask ); @@ -307,7 +307,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v auto [m00, m01, m10, m11] = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -333,9 +333,9 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, - toGpuQcomp(fac0), toGpuQcomp(fac1) + getGpuQcomp(fac0), getGpuQcomp(fac1) ); #else @@ -377,7 +377,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve auto m = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15] @@ -405,7 +405,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM - auto matrElemsPtr = toGpuQcomps(matr.gpuElemsFlat); + auto matrElemsPtr = getGpuQcompPtr(matr.gpuElemsFlat); auto matrElemsLen = matr.numRows * matr.numRows; // assert the pre-condition assumed below @@ -437,8 +437,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); // unpacking args (to better distinguish below signatures) - auto ampsPtr = toGpuQcomps(qureg.gpuAmps); - auto matrPtr = 
toGpuQcomps(matr.gpuElemsFlat); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto matrPtr = getGpuQcompPtr(matr.gpuElemsFlat); auto qubitsPtr = getPtr(deviceQubits); auto targsPtr = getPtr(deviceTargs); auto nCtrls = ctrls.size(); @@ -506,7 +506,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve kernel_statevec_anyCtrlManyTargDenseMatr <<>> ( - toGpuQcomps(cache), + getGpuQcompPtr(cache), ampsPtr, numThreads, numBatchesPerThread, qubitsPtr, nCtrls, qubitStateMask, targsPtr, targs.size(), powerOf2(targs.size()), matrPtr @@ -550,7 +550,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU .elems array directly to cuQuantum which will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, toGpuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, getGpuQcompPtr(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -573,7 +573,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec auto elems = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -618,7 +618,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU array directly to cuQuantum, and it will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, toGpuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, getGpuQcompPtr(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ 
-641,7 +641,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec auto elems = unpackMatrixToGpuQcomps(matr); kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -686,7 +686,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // cuQuantum cannot handle HasPower, in which case we fall back to custom kernel if (!HasPower && util_areAllQubitsInSuffix(targs, qureg)) { - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, toGpuQcomps(util_getGpuMemPtr(matr)), ApplyConj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, getGpuQcompPtr(util_getGpuMemPtr(matr)), ApplyConj); // must return to avoid re-simulation below return; @@ -709,9 +709,9 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), - toGpuQcomps(util_getGpuMemPtr(matr)), toGpuQcomp(exponent) + getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent) ); // must return to avoid runtime error below @@ -743,7 +743,7 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // we always use Thrust because we are doubtful that cuQuantum's // diagonal-matrix facilities are optimised for the all-qubit case - thrust_statevec_allTargDiagMatr_sub(qureg, matr, toGpuQcomp(exponent)); + thrust_statevec_allTargDiagMatr_sub(qureg, matr, 
getGpuQcomp(exponent)); #else error_gpuSimButGpuNotCompiled(); @@ -764,8 +764,8 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp kernel_densmatr_allTargDiagMatr_sub <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - toGpuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toGpuQcomp(exponent) + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(util_getGpuMemPtr(matr)), matr.numElems, getGpuQcomp(exponent) ); #else @@ -823,10 +823,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly qindex numBlocks = getNumBlocks(numThreads); kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, getPtr(deviceTargs), deviceTargs.size(), - maskXY, maskYZ, toGpuQcomp(powI), toGpuQcomp(ampFac), toGpuQcomp(pairAmpFac) + maskXY, maskYZ, getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -854,10 +854,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, maskXY, maskYZ, bufferMaskXY, - toGpuQcomp(powI), toGpuQcomp(ampFac), toGpuQcomp(pairAmpFac) + getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -891,9 +891,9 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c qindex targMask = util_getBitMask(targs); kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( - 
toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, - toGpuQcomp(fac0), toGpuQcomp(fac1) + getGpuQcomp(fac0), getGpuQcomp(fac1) ); #else @@ -923,14 +923,14 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs vector ptrs; ptrs.reserve(inQuregs.size()); for (auto& qureg : inQuregs) - ptrs.push_back(toGpuQcomps(qureg.gpuAmps)); + ptrs.push_back(getGpuQcompPtr(qureg.gpuAmps)); // copy coeff and qureg lists into GPU memory devgpuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; kernel_statevec_setQuregToWeightedSum_sub <<>> ( - toGpuQcomps(outQureg.gpuAmps), numThreads, + getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size() ); @@ -960,7 +960,7 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subB <<>> ( - outProb, toGpuQcomps(outQureg.gpuAmps), inProb, toGpuQcomps(inQureg.gpuAmps), + outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(inQureg.gpuAmps), numThreads, inQureg.numAmps ); @@ -978,7 +978,7 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subC <<>> ( - outProb, toGpuQcomps(outQureg.gpuAmps), inProb, toGpuQcomps(outQureg.gpuCommBuffer), + outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(outQureg.gpuCommBuffer), numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode ); @@ -1013,7 +1013,7 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) int braQubit = util_getBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac ); #else @@ -1039,7 +1039,7 @@ 
void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob) int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, fac ); #else @@ -1085,7 +1085,7 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi int braQubitB = util_getBraQubit(ketQubitB, qureg); kernel_densmatr_twoQubitDephasing_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols ketQubitA, ketQubitB, braQubitA, braQubitB, term ); @@ -1112,7 +1112,7 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 ); #else @@ -1133,7 +1133,7 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3 ); @@ -1161,7 +1161,7 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, c3 ); @@ -1186,7 +1186,7 @@ 
void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, qreal altc1 = factors.c1 - factors.c2; kernel_densmatr_twoQubitDepolarising_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2 ); @@ -1208,7 +1208,7 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subC <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3 ); @@ -1231,7 +1231,7 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, auto factors = util_getTwoQubitDepolarisingFactors(prob); kernel_densmatr_twoQubitDepolarising_subD <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); @@ -1256,7 +1256,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal fac1 = factors.c1 - fac0; kernel_densmatr_twoQubitDepolarising_subE <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1 ); @@ -1279,7 +1279,7 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2; kernel_densmatr_twoQubitDepolarising_subF <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braBit1, braBit2, c2 ); @@ -1306,7 +1306,7 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, 
pZ); kernel_densmatr_oneQubitPauliChannel_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1328,7 +1328,7 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); kernel_densmatr_oneQubitPauliChannel_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1355,7 +1355,7 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { auto factors = util_getOneQubitDampingFactors(prob); kernel_densmatr_oneQubitDamping_subA <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, prob, factors.c1, factors.c2 ); @@ -1375,7 +1375,7 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { auto c2 = util_getOneQubitDampingFactors(prob).c2; kernel_densmatr_oneQubitDamping_subB <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, qubit, c2 + getGpuQcompPtr(qureg.gpuAmps), numThreads, qubit, c2 ); #else @@ -1395,7 +1395,7 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { auto c1 = util_getOneQubitDampingFactors(prob).c1; kernel_densmatr_oneQubitDamping_subC <<>> ( - toGpuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 ); #else @@ -1413,7 +1413,7 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { qindex recvInd = getBufferRecvInd(); kernel_densmatr_oneQubitDamping_subD <<>> ( - toGpuQcomps(qureg.gpuAmps), &toGpuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), 
getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, qubit, prob ); @@ -1444,7 +1444,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta devints devAllTargs = util_getSorted(targs, pairTargs); kernel_densmatr_partialTrace_sub <<>> ( - toGpuQcomps(inQureg.gpuAmps), toGpuQcomps(outQureg.gpuAmps), numThreads, + getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() ); @@ -1564,7 +1564,7 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toGpuQcomps(qureg.gpuAmps), numThreads, + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() ); @@ -1601,7 +1601,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toGpuQcomps(qureg.gpuAmps), + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() @@ -1634,7 +1634,7 @@ qcomp gpu_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { #if COMPILE_CUQUANTUM || COMPILE_CUDA gpu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); - return toQcomp(prod); + return getQcomp(prod); #else error_gpuSimButGpuNotCompiled(); @@ -1662,7 +1662,7 @@ qcomp gpu_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { #if COMPILE_CUQUANTUM || COMPILE_CUDA gpu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); - return toQcomp(fid); + return getQcomp(fid); #else error_gpuSimButGpuNotCompiled(); @@ -1703,7 +1703,7 @@ qcomp 
gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { #if COMPILE_CUQUANTUM || COMPILE_CUDA gpu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); - return toQcomp(value); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1721,7 +1721,7 @@ qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector #if COMPILE_CUQUANTUM || COMPILE_CUDA gpu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); - return toQcomp(value); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1770,9 +1770,9 @@ qcomp gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - gpu_qcomp expo = toGpuQcomp(exponent); + gpu_qcomp expo = getGpuQcomp(exponent); gpu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); - return toQcomp(value); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1787,9 +1787,9 @@ qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - gpu_qcomp expo = toGpuQcomp(exponent); + gpu_qcomp expo = getGpuQcomp(exponent); gpu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); - return toQcomp(value); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1866,7 +1866,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector void gpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { #if COMPILE_CUDA || COMPILE_CUQUANTUM - thrust_statevec_initUniformState(qureg, toGpuQcomp(amp)); + thrust_statevec_initUniformState(qureg, getGpuQcomp(amp)); #else error_gpuSimButGpuNotCompiled(); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 8bcc5e58d..4f6a6b10f 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -110,7 +110,7 @@ gpu_qcomp* getPtr(devcomps& comps) { // devcomps -> qcomp -> 
gpu_qcomp qcomp* ptr = thrust::raw_pointer_cast(comps.data()); - return toGpuQcomps(ptr); + return getGpuQcompPtr(ptr); } @@ -138,7 +138,7 @@ thrust::device_ptr getStartPtr(gpu_qcomp* amps) { } auto getStartPtr(qcomp* amps) { - return getStartPtr(toGpuQcomps(amps)); + return getStartPtr(getGpuQcompPtr(amps)); } @@ -650,7 +650,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli thrust::device_vector devStrings(in.strings, in.strings + in.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - gpu_qcomp* devCoeffsPtr = toGpuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = getGpuQcompPtr(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); int rank = out.isDistributed? comm_getRank() : 0; @@ -659,7 +659,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli // indicates the PauliStrSum is diagonal (contains only I or Z) auto functor = functor_setAmpToPauliStrSumElem( rank, out.numElems, logNumElemsPerNode, - in.numTerms, toGpuQcomps(out.gpuElems), devCoeffsPtr, devStringsPtr); + in.numTerms, getGpuQcompPtr(out.gpuElems), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + out.numElemsPerNode; @@ -696,13 +696,13 @@ void thrust_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) { thrust::device_vector devStrings(sum.strings, sum.strings + sum.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - gpu_qcomp* devCoeffsPtr = toGpuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = getGpuQcompPtr(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); // indicates the PauliStrSum is not diagonal (contains X or Y) auto functor = functor_setAmpToPauliStrSumElem( qureg.rank, powerOf2(qureg.numQubits), 
qureg.logNumAmpsPerNode, - sum.numTerms, toGpuQcomps(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); + sum.numTerms, getGpuQcompPtr(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; @@ -858,7 +858,7 @@ gpu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { // functor accepts an index and produces a gpu_qcomp auto functor = functor_getFidelityTerm( rho.rank, rho.numQubits, rho.logNumAmpsPerNode, - psi.numAmps, toGpuQcomps(rho.gpuAmps), toGpuQcomps(psi.gpuAmps)); + psi.numAmps, getGpuQcompPtr(rho.gpuAmps), getGpuQcompPtr(psi.gpuAmps)); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); qindex numIts = rho.numAmpsPerNode; @@ -898,7 +898,7 @@ gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex mask = util_getBitMask(targs); - auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, toGpuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, getGpuQcompPtr(qureg.gpuAmps)); gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); @@ -912,7 +912,7 @@ gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vec qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - auto ampsPtr = toGpuQcomps(qureg.gpuAmps); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, ampsPtr); // amps=pairAmps gpu_qcomp init = getGpuQcomp(0, 0); @@ -921,7 +921,7 @@ gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vec gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toGpuQcomp(util_getPowerOfI(y.size())); + return value * 
getGpuQcomp(util_getPowerOfI(y.size())); } @@ -929,8 +929,8 @@ gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vec qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - auto ampsPtr = toGpuQcomps(qureg.gpuAmps); - auto buffPtr = toGpuQcomps(qureg.gpuCommBuffer); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto buffPtr = getGpuQcompPtr(qureg.gpuCommBuffer); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, buffPtr); gpu_qcomp init = getGpuQcomp(0, 0); @@ -939,7 +939,7 @@ gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vec gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toGpuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } @@ -949,7 +949,7 @@ gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vect qindex mYZ = util_getBitMask(util_getConcatenated(y, z)); qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, toGpuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, getGpuQcompPtr(qureg.gpuAmps)); gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); @@ -957,7 +957,7 @@ gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vect gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toGpuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } @@ -986,8 +986,8 @@ gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateD qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto ampsPtr = toGpuQcomps(qureg.gpuAmps); - auto 
elemsPtr = toGpuQcomps(matr.gpuElems); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto elemsPtr = getGpuQcompPtr(matr.gpuElems); auto functor = functor_getExpecDensMatrDiagMatrTerm(dim, ind, ampsPtr, elemsPtr, expo); gpu_qcomp init = getGpuQcomp(0, 0); From d86c126b833b1b7b33cc1e4d18e79005a7e7a1dc Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 17 Apr 2026 18:15:33 -0400 Subject: [PATCH 05/16] created gpu_qcomp but there's so much boilerplate overlap with cpu_qcomp, I wonder if we should unify the two! We can retain separate cpu_qcomp and gpu_qcomp types (for clarity) through a typedef --- quest/src/cpu/cpu_subroutines.cpp | 6 + quest/src/cpu/cpu_types.hpp | 7 +- quest/src/gpu/gpu_types.cuh | 342 +++++++++++++++--------------- 3 files changed, 180 insertions(+), 175 deletions(-) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 077895f94..8a1964e92 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -660,6 +660,9 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // I am paranoid the above casting from qcomp* to cpu_qcomp* is // more dangerous for the static matr.elems than the heap pointer. // Is it more vulnerable to misalignment? Check this! + // + // If this IS the case, we simply prepare a new cpu_qcomp + // instance for each element, like the GPU backend does auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -707,6 +710,9 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // I am paranoid the above casting from qcomp* to cpu_qcomp* is // more dangerous for the static matr.elems than the heap pointer. // Is it more vulnerable to misalignment? Check this! 
+ // + // If this IS the case, we simply prepare a new cpu_qcomp + // instance for each element, like the GPU backend does auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index bc37a36df..b35dc7ff4 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -22,7 +22,8 @@ * a std::complex) is not used by the CPU backend, since it creates * performance pitfalls (e.g. expensive NaN checks within arithmetic * operators) in some compilers. Instead, we use the below custom - * complex type and operator overloads. + * complex type and operator overloads, which must ergo crucially + * be POD and share the memory layout and alignment of qcomp */ @@ -107,13 +108,13 @@ INLINE cpu_qcomp* getCpuQcompPtr(qcomp* list) { } -// creator for cpu_qcomp literals +// get cpu_qcomp from components INLINE cpu_qcomp getCpuQcomp(qreal re, qreal im) { return { re, im }; } -// creator for qcomp conversion +// get cpu_qcomp from qcomp INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { return { a.real(), a.imag() }; } diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh index 9ece0bcc6..6639973dd 100644 --- a/quest/src/gpu/gpu_types.cuh +++ b/quest/src/gpu/gpu_types.cuh @@ -24,10 +24,8 @@ #error "A file being compiled somehow included gpu_types.hpp despite QuEST not being compiled in GPU-accelerated mode." #endif -#if defined(__NVCC__) - #include -#elif defined(__HIP__) - #include "quest/src/gpu/cuda_to_hip.hpp" +#if (FLOAT_PRECISION == 4) + #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." 
#endif #include @@ -36,239 +34,239 @@ /* - * CUDA-COMPATIBLE QCOMP ALIAS (gpu_qcomp) + * COMPLEX SCALAR * - * which we opt to use over a Thrust complex type to gaurantee - * compatibility with cuQuantum, though this irritatingly - * requires explicitly defining operator overloads below + * The user-facing qcomp (which in the QuEST middle-end, resolves to + * a std::complex) is not used by the GPU backend, since incompatible + * with CUDA kernels. We use our own custom gpu_qcomp type below, in + * lieu of cuComplex or Thrust types, to workaround compatibility issues + * with HIP, and for better symmetry with cpu_qcomp. */ -#if (FLOAT_PRECISION == 1) - typedef cuFloatComplex gpu_qcomp; - -#elif (FLOAT_PRECISION == 2) - typedef cuDoubleComplex gpu_qcomp; +struct gpu_qcomp { + + // memory layout + qreal re; + qreal im; + + // in-place complex arithmetic overloads + INLINE gpu_qcomp& operator += (const gpu_qcomp& a) noexcept { + re += a.re; + im += a.im; + return *this; + } + INLINE gpu_qcomp& operator -= (const gpu_qcomp& a) noexcept { + re -= a.re; + im -= a.im; + return *this; + } + INLINE gpu_qcomp& operator *= (const gpu_qcomp& a) noexcept { + qreal re_ = re; + qreal im_ = im; + re = (re_ * a.re) - (im_ * a.im); + im = (re_ * a.im) + (im_ * a.re); + return *this; + } + + // in-place mixed-type arithmetic overloads + INLINE gpu_qcomp& operator *= (const int& a) noexcept { + re *= a; + im *= a; + return *this; + } + INLINE gpu_qcomp& operator *= (const qreal& a) noexcept { + re *= a; + im *= a; + return *this; + } +}; + + +// out-of-place complex arithmetic overloads (optimised) +INLINE gpu_qcomp operator + (gpu_qcomp a, const gpu_qcomp& b) noexcept { + a += b; + return a; +} +INLINE gpu_qcomp operator - (gpu_qcomp a, const gpu_qcomp& b) noexcept { + a -= b; + return a; +} +INLINE gpu_qcomp operator * (gpu_qcomp a, const gpu_qcomp& b) noexcept { + a *= b; + return a; +} -#else - #error "Build bug; precision.h should have prevented non-float non-double qcomp precision 
on GPU." -#endif +// out-of-place mixed-type arithmetic overloads +INLINE gpu_qcomp operator * (gpu_qcomp a, const int& b) noexcept { + a *= b; + return a; +} +INLINE gpu_qcomp operator * (gpu_qcomp a, const qreal& b) noexcept { + a *= b; + return a; +} +// reverse order of out-of-place mixed-type arithmetic (via commutation) +INLINE gpu_qcomp operator * (const int& a, const gpu_qcomp& b) noexcept { + return b * a; +} +INLINE gpu_qcomp operator * (const qreal& a, const gpu_qcomp& b) noexcept { + return b * a; +} -/* - * TRANSFORMING qcomp AND gpu_qcomp - */ +// no-op cast of pointers +INLINE gpu_qcomp* getGpuQcompPtr(qcomp* list) { -INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { - -#if (FLOAT_PRECISION == 1) - return make_cuFloatComplex(re, im); -#else - return make_cuDoubleComplex(re, im); -#endif + return reinterpret_cast(list); } -__host__ inline gpu_qcomp toGpuQcomp(qcomp a) { - return getGpuQcomp(std::real(a), std::imag(a)); -} -__host__ inline qcomp toQcomp(gpu_qcomp a) { - return getQcomp(a.x, a.y); +// get gpu_qcomp from components +INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { + return { re, im }; } -__host__ inline gpu_qcomp* toGpuQcomps(qcomp* a) { +// get gpu_qcomp from qcomp +INLINE gpu_qcomp getGpuQcomp(const qcomp& a) { + return { a.real(), a.imag() }; +} - // reinterpret a qcomp ptr as a gpu_qcomp ptr, - // which is ONLY SAFE when comp and gpu_qcomp - // have identical memory layouts. Be very - // careful; HIP stack arrays (e.g. qcomp[]) - // seg-fault when passed here, so this funciton - // should only ever be used on malloc'd data! - // Stack objects should use the below unpacks. 
- return reinterpret_cast(a); +// get qcomp from gpu_qcomp +INLINE qcomp getQcomp(const gpu_qcomp& a) { + return qcomp( a.re, a.im ); } + // // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) + // template + // INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { -__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr1 in) { + // std::array,dim> out; - // it's crucial we explicitly copy over the elements, - // rather than just reinterpret the pointer, to avoid - // segmentation faults when memory misaligns (like on HIP) - - return {toGpuQcomp(in.elems[0]), toGpuQcomp(in.elems[1])}; -} + // for (int i=0; i unpackMatrixToGpuQcomps(DiagMatr2 in) { - return { - toGpuQcomp(in.elems[0]), toGpuQcomp(in.elems[1]), - toGpuQcomp(in.elems[2]), toGpuQcomp(in.elems[3])}; +// maths functions +INLINE qreal real(const gpu_qcomp& a) { + return a.re; } +INLINE qreal imag(const gpu_qcomp& a) { + return a.im; +} +INLINE gpu_qcomp conj(const gpu_qcomp& a) { + return {a.re, - a.im}; +} +INLINE qreal norm(const gpu_qcomp& a) noexcept { + return (a.re * a.re) + (a.im * a.im); +} +INLINE gpu_qcomp pow(gpu_qcomp base, gpu_qcomp exponent) { + // using https://mathworld.wolfram.com/ComplexExponentiation.html, + // and the principal argument of 'base' -__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr1 in) { + // base = a + b i, exponent = c + d i + qreal a = base.re; + qreal b = base.im; + qreal c = exponent.re; + qreal d = exponent.im; - std::array out{}; - for (int i=0; i<4; i++) - out[i] = toGpuQcomp(in.elems[i/2][i%2]); + // intermediate quantities (uses CUDA atan2,log,pow,exp,cos,sin) + qreal arg = atan2(b, a); + qreal mag = a*a + b*b; + qreal ln = log(mag); + qreal fac = pow(mag, c/2) * exp(-d * arg); + qreal ang = c*arg + d*ln/2; - return out; + // output scalar + qreal re = fac * cos(ang); + qreal im = fac * sin(ang); + return getGpuQcomp(re, im); } -__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr2 in) { +// check the memory layout of 
gpu_qcomp agrees with qcomp, since +// it is not formally guaranteed, unlike _Complex and std::complex +static_assert(sizeof (gpu_qcomp) == sizeof (qcomp)); +static_assert(alignof(gpu_qcomp) == alignof(qcomp)); +static_assert(std::is_standard_layout_v <gpu_qcomp>); +static_assert(std::is_trivially_copyable_v<gpu_qcomp>); - std::array out{}; - for (int i=0; i<16; i++) - out[i] = toGpuQcomp(in.elems[i/4][i%4]); - return out; -} +// TODO: +// the above checks are potentially inadequate to identify an +// insidious incompatibility between qcomp and gpu_qcomp - perhaps +// we should perform a compile-time duck-check, casting a small +// array between them and checking no data is corrupted? Perhaps +// a runtime check in initQuESTEnv() is also necessary, checking the +// casting is safe for all circumstances (e.g. heap mem, static lists) /* - * gpu_qcomp ARITHMETIC OVERLOADS + * TODO: + * OLD UNPACKERS * - * which are only needed by NVCC because - * HIP defines them for us. This good deed - * goes punished; a HIP bug disables our - * use of *= and += overloads, so kernels.cuh - * has disgusting (x = x * y) statements. Bah! + * which I am hesitant to switch to the CPU-style until I better + * understand why the explicit gpu_qcomp instantiation is necessary + * (iirc static HIP structs have a different alignment than qcomp?!) */ -/// @todo -/// - clean this up (with templates?) 
-/// - use getGpuQcomp() rather than struct creation, -/// to make the algebra implementation-agnostic - - -#if defined(__NVCC__) +__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr1 in) { -INLINE gpu_qcomp operator + (const gpu_qcomp& a, const gpu_qcomp& b) { - gpu_qcomp out = { - .x = a.x + b.x, - .y = a.y + b.y - }; - return out; -} + // it's crucial we explicitly copy over the elements, + // rather than just reinterpret the pointer, to avoid + // segmentation faults when memory misaligns (like on HIP) -INLINE gpu_qcomp operator - (const gpu_qcomp& a, const gpu_qcomp& b) { - gpu_qcomp out = { - .x = a.x - b.x, - .y = a.y - b.y - }; - return out; -} + // oh YES we must not cast statically created HIP arrays + // like within kernels?!?! -INLINE gpu_qcomp operator * (const gpu_qcomp& a, const gpu_qcomp& b) { - gpu_qcomp out = { - .x = a.x * b.x - a.y * b.y, - .y = a.x * b.y + a.y * b.x - }; - return out; -} -INLINE gpu_qcomp operator + (const gpu_qcomp& a, const qreal& b) { - gpu_qcomp out = { - .x = a.x + b, - .y = a.y + b - }; - return out; -} -INLINE gpu_qcomp operator + (const qreal& b, const gpu_qcomp& a) { - gpu_qcomp out = { - .x = a.x + b, - .y = a.y + b - }; - return out; -} + // UMMMMMM is the above true?!?! + // Wen did I witness misalignment between std::complex and gpu_qcomp?! 
-INLINE gpu_qcomp operator - (const gpu_qcomp& a, const qreal& b) { - gpu_qcomp out = { - .x = a.x - b, - .y = a.y - b - }; - return out; -} -INLINE gpu_qcomp operator - (const qreal& b, const gpu_qcomp& a) { - gpu_qcomp out = { - .x = a.x - b, - .y = a.y - b - }; - return out; + return {getGpuQcomp(in.elems[0]), getGpuQcomp(in.elems[1])}; } -INLINE gpu_qcomp operator * (const gpu_qcomp& a, const qreal& b) { - gpu_qcomp out = { - .x = a.x * b, - .y = a.y * b - }; - return out; -} -INLINE gpu_qcomp operator * (const qreal& b, const gpu_qcomp& a) { - gpu_qcomp out = { - .x = a.x * b, - .y = a.y * b - }; - return out; -} - -#endif - +__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr2 in) { -/* - * gpu_qcomp UNARY FUNCTIONS - */ + return { + getGpuQcomp(in.elems[0]), getGpuQcomp(in.elems[1]), + getGpuQcomp(in.elems[2]), getGpuQcomp(in.elems[3])}; +} -INLINE qreal getCompReal(gpu_qcomp num) { - return num.x; -} +__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr1 in) { -INLINE gpu_qcomp getCompConj(gpu_qcomp num) { - num.y *= -1; - return num; -} + std::array out{}; + for (int i=0; i<4; i++) + out[i] = getGpuQcomp(in.elems[i/2][i%2]); -INLINE qreal getCompNorm(gpu_qcomp num) { - return (num.x * num.x) + (num.y * num.y); + return out; } -INLINE gpu_qcomp getCompPower(gpu_qcomp base, gpu_qcomp exponent) { - - // using https://mathworld.wolfram.com/ComplexExponentiation.html, - // and the principal argument of 'base' - // base = a + b i, exponent = c + d i - qreal a = base.x; - qreal b = base.y; - qreal c = exponent.x; - qreal d = exponent.y; +__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr2 in) { - // intermediate quantities (uses CUDA atan2,log,pow,exp,cos,sin) - qreal arg = atan2(b, a); - qreal mag = a*a + b*b; - qreal ln = log(mag); - qreal fac = pow(mag, c/2) * exp(-d * arg); - qreal ang = c*arg + d*ln/2; + std::array out{}; + for (int i=0; i<16; i++) + out[i] = getGpuQcomp(in.elems[i/4][i%4]); - // output scalar - qreal re = fac * 
cos(ang); - qreal im = fac * sin(ang); - return getGpuQcomp(re, im); + return out; } - #endif // GPU_TYPES_HPP \ No newline at end of file From 1d62e75f8eaf5e1a473ca00a6234ea094b38c210 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 17 Apr 2026 18:37:18 -0400 Subject: [PATCH 06/16] patch CPU MSVC compiler pitfall --- quest/src/cpu/cpu_subroutines.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 8a1964e92..215d5ab90 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -412,7 +412,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps(matr.elems); + auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -495,7 +495,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps(matr.elems); + auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah! 
auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); From 4a3b5afdb1305e076f74e62bec228b0d15895acf Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 18 Apr 2026 15:14:49 -0400 Subject: [PATCH 07/16] woopsiedoodle --- quest/src/gpu/gpu_kernels.cuh | 4 ++-- quest/src/gpu/gpu_subroutines.cpp | 4 ++-- quest/src/gpu/gpu_thrust.cuh | 30 +++++++++++++++--------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index b7f8df424..35c9dcee5 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -1190,7 +1190,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( // use template param to compile-time unroll below loops SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); - qreal prob = getCompNorm(amps[n]); + qreal prob = norm(amps[n]); // i = global index corresponding to n qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -1216,7 +1216,7 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( // i = index of nth local diagonal elem qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); - qreal prob = getCompReal(amps[i]); + qreal prob = real(amps[i]); // j = global index of i qindex j = concatenateBits(rank, i, logNumAmpsPerNode); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 50a5b66a0..b3675713f 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -173,7 +173,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu qindex sendInd = getSubBufferSendInd(qureg); kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( - getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer + sendInd, numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, 
qubit1, qubit2, qubit3, bit2 ); @@ -1231,7 +1231,7 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, auto factors = util_getTwoQubitDepolarisingFactors(prob); kernel_densmatr_twoQubitDepolarising_subD <<>> ( - getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer + offset, numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 4f6a6b10f..ccdbb2130 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -174,21 +174,21 @@ auto getEndPtr(FullStateDiagMatr matr) { struct functor_getAmpConj { __host__ __device__ gpu_qcomp operator()(gpu_qcomp amp) { - return getCompConj(amp); + return conj(amp); } }; struct functor_getAmpNorm { __host__ __device__ qreal operator()(gpu_qcomp amp) { - return getCompNorm(amp); + return norm(amp); } }; struct functor_getAmpReal { __host__ __device__ qreal operator()(gpu_qcomp amp) { - return getCompReal(amp); + return real(amp); } }; @@ -196,14 +196,14 @@ struct functor_getAmpReal { struct functor_getAmpConjProd { __host__ __device__ gpu_qcomp operator()(gpu_qcomp braAmp, gpu_qcomp ketAmp) { - return getCompConj(braAmp) * ketAmp; + return conj(braAmp) * ketAmp; } }; struct functor_getNormOfAmpDif { __host__ __device__ qreal operator()(gpu_qcomp amp1, gpu_qcomp amp2) { - return getCompNorm(amp1 - amp2); + return norm(amp1 - amp2); } }; @@ -220,7 +220,7 @@ struct functor_getExpecStateVecZTerm { int par = cudaGetBitMaskParity(ind & targMask); // device-only int sign = fast_getPlusOrMinusOne(par); - return sign * getCompNorm(amp); + return sign * norm(amp); } }; @@ -267,7 +267,7 @@ struct functor_getExpecStateVecPauliTerm { int sign = fast_getPlusOrMinusOne(par); // sign excludes i^numY contribution - return sign * getCompConj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer + return 
sign * conj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer } }; @@ -316,9 +316,9 @@ struct functor_getExpecDensMatrDiagMatrTerm { gpu_qcomp elem = elems[n]; if constexpr (HasPower && ! UseRealPow) - elem = getCompPower(elem, expo); + elem = pow(elem, expo); if constexpr (HasPower && UseRealPow) - elem = getGpuQcomp(pow(getCompReal(elem), getCompReal(expo)),0); // CUDA pow(qreal,qreal) + elem = getGpuQcomp(pow(real(elem), real(expo)), 0); // CUDA pow(qreal,qreal) qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); @@ -399,12 +399,12 @@ struct functor_multiplyElemPowerWithAmpOrNorm { __host__ __device__ gpu_qcomp operator()(gpu_qcomp quregAmp, gpu_qcomp matrElem) { if constexpr (HasPower && ! UseRealPow) - matrElem = getCompPower(matrElem, exponent); + matrElem = pow(matrElem, exponent); if constexpr (HasPower && UseRealPow) - matrElem = getGpuQcomp(pow(getCompReal(matrElem), getCompReal(exponent)),0); // CUDA pow(qreal,qreal) + matrElem = getGpuQcomp(pow(real(matrElem), real(exponent)), 0); // CUDA pow(qreal,qreal) if constexpr (Norm) - quregAmp = getGpuQcomp(getCompNorm(quregAmp), 0); + quregAmp = getGpuQcomp(norm(quregAmp), 0); return matrElem * quregAmp; } @@ -486,10 +486,10 @@ struct functor_getFidelityTerm { // compute term of or if constexpr (Conj) { - rhoAmp = getCompConj(rhoAmp); - colAmp = getCompConj(colAmp); + rhoAmp = conj(rhoAmp); + colAmp = conj(colAmp); } else - rowAmp = getCompConj(rowAmp); + rowAmp = conj(rowAmp); gpu_qcomp fid = rhoAmp * rowAmp * colAmp; return fid; From 77076228ba601d45c63ee52c166f18c8922e7fbb Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 14:03:31 -0400 Subject: [PATCH 08/16] CUDA patch --- quest/src/gpu/gpu_types.cuh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh index 6639973dd..f4594b22e 100644 --- a/quest/src/gpu/gpu_types.cuh +++ b/quest/src/gpu/gpu_types.cuh @@ -107,6 +107,10 
@@ INLINE gpu_qcomp operator * (gpu_qcomp a, const qreal& b) noexcept { a *= b; return a; } +INLINE gpu_qcomp operator * (gpu_qcomp a, const size_t& b) noexcept { + a *= static_cast(b); + return a; +} // reverse order of out-of-place mixed-type arithmetic (via commutation) @@ -131,14 +135,14 @@ INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { } -// get gpu_qcomp from qcomp -INLINE gpu_qcomp getGpuQcomp(const qcomp& a) { +// get gpu_qcomp from qcomp (host only; qcomp forbiddin in device code) +__host__ gpu_qcomp getGpuQcomp(const qcomp& a) { return { a.real(), a.imag() }; } -// get qcomp from gpu_qcomp -INLINE qcomp getQcomp(const gpu_qcomp& a) { +// get qcomp from gpu_qcomp (host only; qcomp forbiddin in device code) +__host__ qcomp getQcomp(const gpu_qcomp& a) { return qcomp( a.re, a.im ); } From e1dc06c38679f4490f02de8d0149fc3232f12518 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 15:10:25 -0400 Subject: [PATCH 09/16] patch HIP --- quest/src/gpu/gpu_types.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh index f4594b22e..8b8d3dad9 100644 --- a/quest/src/gpu/gpu_types.cuh +++ b/quest/src/gpu/gpu_types.cuh @@ -28,6 +28,10 @@ #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." 
#endif +#if defined(__HIP__) + #include "quest/src/gpu/cuda_to_hip.hpp" +#endif + #include #include From fddc7f2cd54e53183386e3a25e5001fe998f38dc Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 15:10:41 -0400 Subject: [PATCH 10/16] patch Linux --- quest/src/cpu/cpu_types.hpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index b35dc7ff4..bdfddd3de 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -120,6 +120,12 @@ INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { } +// get qcomp from cpu_qcomp +INLINE qcomp getQcomp(const cpu_qcomp& a) { + return qcomp( a.re, a.im ); +} + + // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) template INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { @@ -148,13 +154,16 @@ INLINE qreal norm(const cpu_qcomp& a) noexcept { return (a.re * a.re) + (a.im * a.im); } INLINE cpu_qcomp pow(cpu_qcomp base, cpu_qcomp expo) noexcept { - - // TODO: here, we re-use std::pow(std::complex), accepting NaN-check - // performance penalties - find a speedup without compiler flags! - auto& base_ = reinterpret_cast(base); - auto& expo_ = reinterpret_cast(expo); - qcomp out = std::pow(base_, expo_); - return reinterpret_cast(out); + + // Here, we re-use std::pow(std::complex) to avoid a custom definition, + // and so accept NaN-check performance penalties. Notice too we also + // create new qcomp(), rather than just reinterpreting the given cpu_qcomp, + // just to avoid any insiduous issues alignment/aliasing issues (since the + // creation time iss occluded by std::pow time). 
+ qcomp base_ = getQcomp(base); + qcomp expo_ = getQcomp(expo); + qcomp out_ = std::pow(base_, expo_); + return getCpuQcomp(out_); } From b4fa4c453a26c7d866944835844eb7cb62e21e7c Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 15:17:42 -0400 Subject: [PATCH 11/16] silence unused warning --- quest/src/cpu/cpu_subroutines.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 215d5ab90..ad6bc3c4f 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -758,6 +758,7 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -825,6 +826,7 @@ void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused #pragma omp parallel for if(qureg.isMultithreaded||qureg.isMultithreaded) for (qindex n=0; n Date: Sun, 19 Apr 2026 15:31:34 -0400 Subject: [PATCH 12/16] attempt patch MSVC compiler stack-overflows when OpenMP is enabled - possibly due to thread-private instantiation of this 2D array? 
--- quest/src/cpu/cpu_types.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp index bdfddd3de..a426dfce3 100644 --- a/quest/src/cpu/cpu_types.hpp +++ b/quest/src/cpu/cpu_types.hpp @@ -126,9 +126,12 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) { } -// creator for fixed-size dense matrices (CompMatr1 and CompMatr2) +// creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!)) template -INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { +std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { + + // detect brain-dead compiler inferencing (looking at you MSVC...) + static_assert(dim == 2 || dim == 4, "getCpuQcomps called with unexpected dim"); std::array,dim> out; From 7fc91d89e973247b5f45906389b4f39d231aabf2 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 16:28:39 -0400 Subject: [PATCH 13/16] temp disabling test compilation to debug MSVC + OpenMP compilation failure in CI --- .github/workflows/compile.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 0950b7dbb..d2f7fe83d 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -235,12 +235,16 @@ jobs: sudo apt install rocm rocm-hip-sdk rocm-hip-runtime-dev echo "${{ env.rocm_path }}" >> $GITHUB_PATH + # DEBUG: + # temporarily disabling test-file compilation to debug MSVC with OpenMP failure + # (possibly just a Catch issue?!?!) 
+ # invoke cmake, disabling LTO (it duplicates symbols with CUDA + MPI) - name: Configure CMake run: > cmake -B ${{ env.build_dir }} -DBUILD_EXAMPLES=ON - -DENABLE_TESTING=ON + -DENABLE_TESTING=OFF -DFLOAT_PRECISION=${{ matrix.precision }} -DENABLE_DEPRECATED_API=${{ matrix.deprecated }} -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }} From 35dcd7ce4355a33f4bca2320ec6a713db323a62d Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 16:48:05 -0400 Subject: [PATCH 14/16] more msvc debug story of my life bruddah --- .github/workflows/compile.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index d2f7fe83d..76b8d5c48 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -235,16 +235,12 @@ jobs: sudo apt install rocm rocm-hip-sdk rocm-hip-runtime-dev echo "${{ env.rocm_path }}" >> $GITHUB_PATH - # DEBUG: - # temporarily disabling test-file compilation to debug MSVC with OpenMP failure - # (possibly just a Catch issue?!?!) 
- # invoke cmake, disabling LTO (it duplicates symbols with CUDA + MPI) - name: Configure CMake run: > cmake -B ${{ env.build_dir }} -DBUILD_EXAMPLES=ON - -DENABLE_TESTING=OFF + -DENABLE_TESTING=ON -DFLOAT_PRECISION=${{ matrix.precision }} -DENABLE_DEPRECATED_API=${{ matrix.deprecated }} -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }} @@ -256,7 +252,13 @@ jobs: -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }} -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} - -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }} + -DCMAKE_CXX_FLAGS=${{ + (matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto ' || '') + + (matrix.compiler == 'cl' && '/MP1' || '') + }} + + ### DEBUG: + ### above disables parallel compilation with MSVC # force 'Release' build (needed by MSVC to enable optimisations) - name: Compile From 9a4d11a2575f36da5339fd8337018166134e0460 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sun, 19 Apr 2026 16:51:11 -0400 Subject: [PATCH 15/16] syntax error nice one chatgpt --- .github/workflows/compile.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 76b8d5c48..d19015535 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -252,10 +252,9 @@ jobs: -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }} -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} - -DCMAKE_CXX_FLAGS=${{ - (matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto ' || '') - + (matrix.compiler == 'cl' && '/MP1' || '') - }} + -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }} + -DCMAKE_CXX_FLAGS_DEBUG=${{ matrix.compiler == 'cl' && '/MP1' || '' }} + -DCMAKE_CXX_FLAGS_RELEASE=${{ matrix.compiler == 'cl' && '/MP1' || '' }} ### DEBUG: ### above disables parallel compilation with MSVC From 799b9f57909d99755148b6d42d95893eab3b5bb5 Mon 
Sep 17 00:00:00 2001 From: Tyson Jones Date: Wed, 22 Apr 2026 02:06:49 -0400 Subject: [PATCH 16/16] Unify common cpu_qcomp and gpu_qcomp definitions into base_qcomp --- .github/workflows/compile.yml | 1 + quest/src/core/base_qcomp.hpp | 227 ++++++++++++++++++++++++ quest/src/core/fastmath.hpp | 26 +-- quest/src/cpu/cpu_qcomp.hpp | 94 ++++++++++ quest/src/cpu/cpu_subroutines.cpp | 6 +- quest/src/cpu/cpu_types.hpp | 190 -------------------- quest/src/gpu/gpu_cuquantum.cuh | 2 +- quest/src/gpu/gpu_kernels.cuh | 2 +- quest/src/gpu/gpu_qcomp.cuh | 131 ++++++++++++++ quest/src/gpu/gpu_subroutines.cpp | 14 +- quest/src/gpu/gpu_thrust.cuh | 2 +- quest/src/gpu/gpu_types.cuh | 280 ------------------------------ 12 files changed, 480 insertions(+), 495 deletions(-) create mode 100644 quest/src/core/base_qcomp.hpp create mode 100644 quest/src/cpu/cpu_qcomp.hpp delete mode 100644 quest/src/cpu/cpu_types.hpp create mode 100644 quest/src/gpu/gpu_qcomp.cuh delete mode 100644 quest/src/gpu/gpu_types.cuh diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index d19015535..fd74c2948 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,6 +28,7 @@ on: branches: - main - devel + - unifying-base-qcomp pull_request: branches: - main diff --git a/quest/src/core/base_qcomp.hpp b/quest/src/core/base_qcomp.hpp new file mode 100644 index 000000000..522983c14 --- /dev/null +++ b/quest/src/core/base_qcomp.hpp @@ -0,0 +1,227 @@ +/** @file + * Definition of base_qcomp, which is extended by the CPU and GPU + * backends (into cpu_qcomp and gpu_qcomp) and used in hot loops + * and kernels. + * + * The user-facing qcomp (which in the QuEST middle-end, resolves to + * std::complex) is not used by the CPU backend, since it creates + * performance pitfalls (e.g. expensive NaN checks within arithmetic + * operators) in some compilers, and is furthermore illegal in the + * GPU backend (i.e. within CUDA kernels). 
So the backends instead + * use custom complex types with identical memory layouts/alignment + * to qcomp. Those types extend base_qcomp defined in this file, + * since they otherwise share all the same arithmetic boilerplate. + * + * @author Tyson Jones + */ + +#ifndef BASE_QCOMP_HPP +#define BASE_QCOMP_HPP + +#include "quest/include/types.h" + +#include "quest/src/core/inliner.hpp" + + + +/* + * BASE DEFINITION + * + * which must remain POD (a simple {re,im}) and with an identical + * memory layout and alignment to qcomp (i.e. std::complex). Only + * the in-place arithmetic overloads are defined below which are + * reused by the subsequent out-of-place overloads, to avoid + * code duplication. + */ + +struct base_qcomp { + + qreal re; + qreal im; + + + /* + * IN-PLACE COMPLEX ARITHMETIC + */ + + INLINE base_qcomp& operator += (const base_qcomp& a) noexcept { + re += a.re; + im += a.im; + return *this; + } + + INLINE base_qcomp& operator -= (const base_qcomp& a) noexcept { + re -= a.re; + im -= a.im; + return *this; + } + + INLINE base_qcomp& operator *= (const base_qcomp& a) noexcept { + qreal re_ = re; + qreal im_ = im; + re = (re_ * a.re) - (im_ * a.im); + im = (re_ * a.im) + (im_ * a.re); + return *this; + } + + + /* + * IN-PLACE MIXED-TYPE ARITHMETIC + */ + + INLINE base_qcomp& operator *= (const int& a) noexcept { + re *= a; + im *= a; + return *this; + } + + INLINE base_qcomp& operator *= (const qreal& a) noexcept { + re *= a; + im *= a; + return *this; + } + + INLINE base_qcomp& operator *= (const size_t& a) noexcept { + re *= a; + im *= a; + return *this; + } + +}; // base_qcomp + + + +/* + * OUT-OF-PLACE COMPLEX ARITHMETIC + * + * which avoid code duplication by re-using the + * in-place arithmetic operator overloads above + */ + +INLINE base_qcomp operator + (base_qcomp a, const base_qcomp& b) noexcept { + a += b; + return a; +} + +INLINE base_qcomp operator - (base_qcomp a, const base_qcomp& b) noexcept { + a -= b; + return a; +} + +INLINE base_qcomp 
operator * (base_qcomp a, const base_qcomp& b) noexcept { + a *= b; + return a; +} + + + +/* + * OUT-OF-PLACE MIXED-TYPE ARITHMETIC + * + * which avoid code duplication by re-using the + * in-place arithmetic operator overloads above + */ + + +// base_qcomp * other + +INLINE base_qcomp operator * (base_qcomp a, const int& b) noexcept { + a *= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const qreal& b) noexcept { + a *= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const size_t& b) noexcept { + a *= b; + return a; +} + + +// other * base_qcomp (via commutation) + +INLINE base_qcomp operator * (const int& a, const base_qcomp& b) noexcept { + return b * a; +} + +INLINE base_qcomp operator * (const qreal& a, const base_qcomp& b) noexcept { + return b * a; +} + +INLINE base_qcomp operator * (const size_t& a, const base_qcomp& b) noexcept { + return b * a; +} + + + +/* + * BACKEND-AGNOSTIC MATHS + */ + +INLINE qreal real(const base_qcomp& a) { + return a.re; +} + +INLINE qreal imag(const base_qcomp& a) { + return a.im; +} + +INLINE base_qcomp conj(const base_qcomp& a) { + return {a.re, - a.im}; +} + +INLINE qreal norm(const base_qcomp& a) noexcept { + return (a.re * a.re) + (a.im * a.im); +} + + + +/* + * CONVERTERS + */ + +INLINE base_qcomp* getBaseQcompPtr(qcomp* list) { + return reinterpret_cast(list); +} + +INLINE base_qcomp getBaseQcomp(qreal re, qreal im) { + return { re, im }; +} + +INLINE base_qcomp getBaseQcomp(const qcomp& a) { + return { a.real(), a.imag() }; +} + +INLINE qcomp getQcomp(const base_qcomp& a) { + return qcomp( a.re, a.im ); +} + + + +/* + * CHECK COMPATIBILITY WITH QCOMP + */ + + +// check the memory layout of base_qcomp agrees with qcomp, since +// it is not formally gauranteed, unlike _Complex and std::complex +static_assert(sizeof (base_qcomp) == sizeof (qcomp)); +static_assert(alignof(base_qcomp) == alignof(qcomp)); +static_assert(std::is_standard_layout_v ); 
+static_assert(std::is_trivially_copyable_v); + + +// TODO: +// the above checks are potentially inadequate to identify an +// insidious incompatibility between qcomp and base_qcomp - perhaps +// we should perform a compile-time duck-check, casting a small +// array between them and checking no data is corrupted? Perhaps +// a runtime check in initQuESTEnv() is also necessary, checking the +// casting is safe for all circumstances (e.g. heap mem, static lists) + + + +#endif // BASE_QCOMP_HPP \ No newline at end of file diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 0dbcee9ee..79884659f 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -1,7 +1,12 @@ /** @file * Oerations used by all deployment modes for fast, * low-level maths, inlined and callable within hot - * loops (i.e OpenMP loops and CUDA kernels) + * loops (i.e OpenMP loops and CUDA kernels). + * + * Note this file uses the backend OpenMP/CUDA-agnostic + * base_qcomp, in lieu of the C++-user-facing qcomp (i.e. + * std::complex), to avoid its compiler-specific performance + * pitfalls. * * @author Tyson Jones */ @@ -15,6 +20,7 @@ #include "quest/src/core/inliner.hpp" #include "quest/src/core/bitwise.hpp" +#include "quest/src/core/base_qcomp.hpp" @@ -100,9 +106,7 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ */ -// T = qcomp, cpu_qcomp, gpu_qcomp -template -INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { +INLINE base_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() // and densmatr_setAmpsToPauliStrSum_sub(). 
The former's PauliStr can have @@ -120,7 +124,7 @@ INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; // T-agnostic complex literals - T p0, p1,n1, pI,nI; + base_qcomp p0, p1,n1, pI,nI; p0 = {0, 0}; // 0 p1 = {+1, 0}; // 1 n1 = {-1, 0}; // -1 @@ -133,13 +137,13 @@ INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // but this poses no real slowdown; this function, and its caller, are inlined // so these 16 amps are re-processed one for each full enumeration of the // PauliStrSum which is expected to have significantly more terms/coeffs - T matrices[][2][2] = { + base_qcomp matrices[][2][2] = { {{p1,p0},{p0,p1}}, // I {{p0,p1},{p1,p0}}, // X {{p0,nI},{pI,p0}}, // Y {{p1,p0},{p0,n1}}}; // Z - T elem = p1; // 1 + base_qcomp elem = p1; // 1 // could be compile-time unrolled into 32 iterations for (int t=0; t -INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { +INLINE base_qcomp fast_getPauliStrSumElem(base_qcomp* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) { // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot // be directly processed in CUDA kernels/thrust due to its 'qcomp' field. // it also assumes str.highPaulis==0 for all str in strings, as per above func. 
- T elem = {0, 0}; // type-agnostic complex literal + base_qcomp elem = {0, 0}; // this loop is expected exponentially smaller than caller's loop for (qindex n=0; n(strings[n], row, col); + elem += coeffs[n] * fast_getPauliStrElem(strings[n], row, col); return elem; } diff --git a/quest/src/cpu/cpu_qcomp.hpp b/quest/src/cpu/cpu_qcomp.hpp new file mode 100644 index 000000000..70de46947 --- /dev/null +++ b/quest/src/cpu/cpu_qcomp.hpp @@ -0,0 +1,94 @@ +/** @file + * Definition of cpu_qcomp, an extension of base_qcomp and a + * compatible alternative to the user-facing qcomp, used + * exclusively by the CPU backend. This custom complex type + * avoids performance pitfalls of qcomp (i.e. std::complex), + * e.g. due to NaN checks, without resorting to platform + * and compiler-specific flags. + * + * @author Tyson Jones + */ + +#ifndef CPU_QCOMP_HPP +#define CPU_QCOMP_HPP + +#include "quest/include/types.h" + +#include "quest/src/core/inliner.hpp" +#include "quest/src/core/base_qcomp.hpp" + +#include + + + +/* + * DEFINE CPU_QCOMP + * + * which is safe to typdef and define additional overloads + * below, since never witnessed outside the CPU Backend + */ + +typedef base_qcomp cpu_qcomp; + + + +/* + * CONVERTERS + * + * which merely wrap the base_qcomp functions for clarity + * in the CPU source code, disambiguating from gpu_qcomp + */ + +INLINE cpu_qcomp* getCpuQcompPtr(qcomp* ptr) { + return getBaseQcompPtr(ptr); +} + +INLINE cpu_qcomp getCpuQcomp(qreal re, qreal im) { + return getBaseQcomp(re, im); +} + +INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { + return getBaseQcomp(a); +} + +template +std::array,Dim> getCpuQcompsMatrix(qcomp matr[Dim][Dim]) { + + // Creator for fixed-size dense matrices CompMatr1 and CompMatr2, + // which are respectively 2x2 and 4x4 - deliberately not inlined! 
+ // We create new cpu_qcomp in lieu of reinterpreting a 2D pointer + // in fear of alignment and static array nightmares + static_assert(Dim == 2 || Dim == 4); + + std::array,Dim> out; + + for (int i=0; i ctrls, v // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah! + auto elems = getCpuQcompsMatrix<2>(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -495,7 +495,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // use cpu_qcomp arithmetic overloads (avoid qcomp's) cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); - auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah! + auto elems = getCpuQcompsMatrix<4>(matr.elems); // MSVC requires explicit template param, bah! auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); diff --git a/quest/src/cpu/cpu_types.hpp b/quest/src/cpu/cpu_types.hpp deleted file mode 100644 index a426dfce3..000000000 --- a/quest/src/cpu/cpu_types.hpp +++ /dev/null @@ -1,190 +0,0 @@ -/** @file - * Custom types used exclusively by the CPU backend. - * - * @author Tyson Jones - */ - -#ifndef CPU_TYPES_HPP -#define CPU_TYPES_HPP - -#include "quest/include/types.h" - -#include "quest/src/core/inliner.hpp" - -#include - - - -/* - * COMPLEX SCALAR - * - * The user-facing qcomp (which in the QuEST middle-end, resolves to - * a std::complex) is not used by the CPU backend, since it creates - * performance pitfalls (e.g. expensive NaN checks within arithmetic - * operators) in some compilers. 
Instead, we use the below custom - * complex type and operator overloads, which must ergo crucially - * be POD and share the memory layout and alignment of qcomp - */ - - -struct cpu_qcomp { - - // memory layout - qreal re; - qreal im; - - // in-place complex arithmetic overloads - INLINE cpu_qcomp& operator += (const cpu_qcomp& a) noexcept { - re += a.re; - im += a.im; - return *this; - } - INLINE cpu_qcomp& operator -= (const cpu_qcomp& a) noexcept { - re -= a.re; - im -= a.im; - return *this; - } - INLINE cpu_qcomp& operator *= (const cpu_qcomp& a) noexcept { - qreal re_ = re; - qreal im_ = im; - re = (re_ * a.re) - (im_ * a.im); - im = (re_ * a.im) + (im_ * a.re); - return *this; - } - - // in-place mixed-type arithmetic overloads - INLINE cpu_qcomp& operator *= (const int& a) noexcept { - re *= a; - im *= a; - return *this; - } - INLINE cpu_qcomp& operator *= (const qreal& a) noexcept { - re *= a; - im *= a; - return *this; - } -}; - - -// out-of-place complex arithmetic overloads (optimised) -INLINE cpu_qcomp operator + (cpu_qcomp a, const cpu_qcomp& b) noexcept { - a += b; - return a; -} -INLINE cpu_qcomp operator - (cpu_qcomp a, const cpu_qcomp& b) noexcept { - a -= b; - return a; -} -INLINE cpu_qcomp operator * (cpu_qcomp a, const cpu_qcomp& b) noexcept { - a *= b; - return a; -} - - -// out-of-place mixed-type arithmetic overloads -INLINE cpu_qcomp operator * (cpu_qcomp a, const int& b) noexcept { - a *= b; - return a; -} -INLINE cpu_qcomp operator * (cpu_qcomp a, const qreal& b) noexcept { - a *= b; - return a; -} - - -// reverse order of out-of-place mixed-type arithmetic (via commutation) -INLINE cpu_qcomp operator * (const int& a, const cpu_qcomp& b) noexcept { - return b * a; -} -INLINE cpu_qcomp operator * (const qreal& a, const cpu_qcomp& b) noexcept { - return b * a; -} - - -// no-op cast of pointers -INLINE cpu_qcomp* getCpuQcompPtr(qcomp* list) { - - return reinterpret_cast(list); -} - - -// get cpu_qcomp from components -INLINE cpu_qcomp 
getCpuQcomp(qreal re, qreal im) { - return { re, im }; -} - - -// get cpu_qcomp from qcomp -INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { - return { a.real(), a.imag() }; -} - - -// get qcomp from cpu_qcomp -INLINE qcomp getQcomp(const cpu_qcomp& a) { - return qcomp( a.re, a.im ); -} - - -// creator for fixed-size dense matrices (CompMatr1 and CompMatr2) ((not inlined!)) -template -std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { - - // detect brain-dead compiler inferencing (looking at you MSVC...) - static_assert(dim == 2 || dim == 4, "getCpuQcomps called with unexpected dim"); - - std::array,dim> out; - - for (int i=0; i); -static_assert(std::is_trivially_copyable_v); - - -// TODO: -// the above checks are potentially inadequate to identify an -// insidious incompatibility between qcomp and cpu_qcomp - perhaps -// we should perform a compile-time duck-check, casting a small -// array between them and checking no data is corrupted? Perhaps -// a runtime check in initQuESTEnv() is also necessary, checking the -// casting is safe for all circumstances (e.g. heap mem, static lists) - - -#endif // CPU_TYPES_HPP \ No newline at end of file diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh index 261b29875..0fec5ca57 100644 --- a/quest/src/gpu/gpu_cuquantum.cuh +++ b/quest/src/gpu/gpu_cuquantum.cuh @@ -46,7 +46,7 @@ #include "quest/src/core/utilities.hpp" #include "quest/src/gpu/gpu_config.hpp" -#include "quest/src/gpu/gpu_types.cuh" +#include "quest/src/gpu/gpu_qcomp.cuh" #include #include diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 35c9dcee5..30377850a 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -23,7 +23,7 @@ #include "quest/src/core/bitwise.hpp" #include "quest/src/core/fastmath.hpp" -#include "quest/src/gpu/gpu_types.cuh" +#include "quest/src/gpu/gpu_qcomp.cuh" #if ! 
COMPILE_CUDA #error "A file being compiled somehow included gpu_kernels.hpp despite QuEST not being compiled in GPU-accelerated mode." diff --git a/quest/src/gpu/gpu_qcomp.cuh b/quest/src/gpu/gpu_qcomp.cuh new file mode 100644 index 000000000..b7d48e13b --- /dev/null +++ b/quest/src/gpu/gpu_qcomp.cuh @@ -0,0 +1,131 @@ +/** @file + * Definition of gpu_qcomp, an extension of base_qcomp and a + * compatible alternative to the user-facing qcomp, used + * exclusively by the GPU backend and which is compatible with + * both CUDA and HIP. + * + * This header is safe to re-include by multiple files because typedef + * redefinition is legal in C++, and all functions herein are inline. + * Furthermore, since it is only ever parsed by nvcc, the __host__ symbols + * are safely processed by other nvcc-only GPU files, like the cuquantum backend. + * + * @author Tyson Jones + * @author Oliver Brown (patched former HIP arithmetic overloads) + * @author Erich Essmann (patched former ROCm build issues) + */ + +#ifndef GPU_QCOMP_CUH +#define GPU_QCOMP_CUH + +#include "quest/include/config.h" +#include "quest/include/types.h" +#include "quest/include/precision.h" + +#include "quest/src/core/inliner.hpp" +#include "quest/src/core/base_qcomp.hpp" + +#if ! COMPILE_CUDA + #error "A file being compiled somehow included gpu_qcomp.hpp despite QuEST not being compiled in GPU-accelerated mode." +#endif + +#if (FLOAT_PRECISION == 4) + #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." 
+#endif + +#if defined(__HIP__) + #include "quest/src/gpu/cuda_to_hip.hpp" +#endif + +#include + + + +/* + * DEFINE GPU_QCOMP + * + * which is safe to typdef and define additional overloads + * below, since never witnessed outside the GPU Backend + */ + +typedef base_qcomp gpu_qcomp; + + + +/* + * CONVERTERS + * + * which merely wrap the base_qcomp functions for clarity + * in the GPU source code, disambiguating from cpu_qcomp + */ + +INLINE gpu_qcomp* getGpuQcompPtr(qcomp* list) { + return getBaseQcompPtr(list); +} + +INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { + return getBaseQcomp(re, im); +} + +INLINE gpu_qcomp getGpuQcomp(const qcomp& a) { + return getBaseQcomp(a); +} + +template +__host__ inline std::array getGpuQcompArray(qcomp matr[Dim]) { + static_assert(Dim == 2 || Dim == 4); + + // it's crucial we explicitly copy over the elements, + // rather than just reinterpret the pointer, to avoid + // segmentation faults when memory misaligns (like on HIP) + + std::array out; + for (int i=0; i +__host__ inline std::array getFlattenedGpuQcompMatrix(qcomp matr[Dim][Dim]) { + static_assert(Dim == 2 || Dim == 4); + + std::array out; + for (int i=0; i ctrls, v #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToGpuQcomps(matr); + auto arr = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr! cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -304,7 +304,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); - auto [m00, m01, m10, m11] = unpackMatrixToGpuQcomps(matr); + auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr! 
kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, @@ -362,7 +362,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToGpuQcomps(matr); + auto arr = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -374,7 +374,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) - auto m = unpackMatrixToGpuQcomps(matr); + auto m = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, @@ -570,7 +570,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToGpuQcomps(matr); + auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, @@ -638,7 +638,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToGpuQcomps(matr); + auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr! 
kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index ccdbb2130..07a650547 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -33,7 +33,7 @@ #include "quest/include/paulis.h" #include "quest/include/matrices.h" -#include "quest/src/gpu/gpu_types.cuh" +#include "quest/src/gpu/gpu_qcomp.cuh" #include "quest/src/core/errors.hpp" #include "quest/src/core/bitwise.hpp" #include "quest/src/core/constants.hpp" diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh deleted file mode 100644 index 8b8d3dad9..000000000 --- a/quest/src/gpu/gpu_types.cuh +++ /dev/null @@ -1,280 +0,0 @@ -/** @file - * CUDA and HIP-compatible complex types. This file is only ever included - * when COMPILE_CUDA=1 so it can safely invoke CUDA signatures without guards. - * - * This header is safe to re-include by multiple files because typedef - * redefinition is legal in C++, and all functions herein are inline. - * Furthermore, since it is only ever parsed by nvcc, the __host__ symbols - * are safely processed by other nvcc-only GPU files, like the cuquantum backend. - * - * @author Tyson Jones - * @author Oliver Brown (patched HIP arithmetic overloads) - */ - -#ifndef GPU_TYPES_HPP -#define GPU_TYPES_HPP - -#include "quest/include/config.h" -#include "quest/include/types.h" -#include "quest/include/precision.h" - -#include "quest/src/core/inliner.hpp" - -#if ! COMPILE_CUDA - #error "A file being compiled somehow included gpu_types.hpp despite QuEST not being compiled in GPU-accelerated mode." -#endif - -#if (FLOAT_PRECISION == 4) - #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." 
-#endif - -#if defined(__HIP__) - #include "quest/src/gpu/cuda_to_hip.hpp" -#endif - -#include -#include - - - -/* - * COMPLEX SCALAR - * - * The user-facing qcomp (which in the QuEST middle-end, resolves to - * a std::complex) is not used by the GPU backend, since incompatible - * with CUDA kernels. We use our own custom gpu_qcomp type below, in - * lieu of cuComplex or Thrust types, to workaround compatibility issues - * with HIP, and for better symmetry with cpu_qcomp. - */ - - -struct gpu_qcomp { - - // memory layout - qreal re; - qreal im; - - // in-place complex arithmetic overloads - INLINE gpu_qcomp& operator += (const gpu_qcomp& a) noexcept { - re += a.re; - im += a.im; - return *this; - } - INLINE gpu_qcomp& operator -= (const gpu_qcomp& a) noexcept { - re -= a.re; - im -= a.im; - return *this; - } - INLINE gpu_qcomp& operator *= (const gpu_qcomp& a) noexcept { - qreal re_ = re; - qreal im_ = im; - re = (re_ * a.re) - (im_ * a.im); - im = (re_ * a.im) + (im_ * a.re); - return *this; - } - - // in-place mixed-type arithmetic overloads - INLINE gpu_qcomp& operator *= (const int& a) noexcept { - re *= a; - im *= a; - return *this; - } - INLINE gpu_qcomp& operator *= (const qreal& a) noexcept { - re *= a; - im *= a; - return *this; - } -}; - - -// out-of-place complex arithmetic overloads (optimised) -INLINE gpu_qcomp operator + (gpu_qcomp a, const gpu_qcomp& b) noexcept { - a += b; - return a; -} -INLINE gpu_qcomp operator - (gpu_qcomp a, const gpu_qcomp& b) noexcept { - a -= b; - return a; -} -INLINE gpu_qcomp operator * (gpu_qcomp a, const gpu_qcomp& b) noexcept { - a *= b; - return a; -} - - -// out-of-place mixed-type arithmetic overloads -INLINE gpu_qcomp operator * (gpu_qcomp a, const int& b) noexcept { - a *= b; - return a; -} -INLINE gpu_qcomp operator * (gpu_qcomp a, const qreal& b) noexcept { - a *= b; - return a; -} -INLINE gpu_qcomp operator * (gpu_qcomp a, const size_t& b) noexcept { - a *= static_cast(b); - return a; -} - - -// reverse order of 
out-of-place mixed-type arithmetic (via commutation) -INLINE gpu_qcomp operator * (const int& a, const gpu_qcomp& b) noexcept { - return b * a; -} -INLINE gpu_qcomp operator * (const qreal& a, const gpu_qcomp& b) noexcept { - return b * a; -} - - -// no-op cast of pointers -INLINE gpu_qcomp* getGpuQcompPtr(qcomp* list) { - - return reinterpret_cast(list); -} - - -// get gpu_qcomp from components -INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { - return { re, im }; -} - - -// get gpu_qcomp from qcomp (host only; qcomp forbiddin in device code) -__host__ gpu_qcomp getGpuQcomp(const qcomp& a) { - return { a.real(), a.imag() }; -} - - -// get qcomp from gpu_qcomp (host only; qcomp forbiddin in device code) -__host__ qcomp getQcomp(const gpu_qcomp& a) { - return qcomp( a.re, a.im ); -} - - // // creator for fixed-size dense matrices (CompMatr1 and CompMatr2) - // template - // INLINE std::array,dim> getCpuQcomps(qcomp matr[dim][dim]) { - - // std::array,dim> out; - - // for (int i=0; i); -static_assert(std::is_trivially_copyable_v); - - -// TODO: -// the above checks are potentially inadequate to identify an -// insidious incompatibility between qcomp and gpu_qcomp - perhaps -// we should perform a compile-time duck-check, casting a small -// array between them and checking no data is corrupted? Perhaps -// a runtime check in initQuESTEnv() is also necessary, checking the -// casting is safe for all circumstances (e.g. heap mem, static lists) - - - - -/* - * TODO: - * OLD UNPACKERS - * - * which I am hestitant to switch to the CPU-style until I better - * understand why the explicit gpu_qcomp instantiation is necessary - * (iirc static HIP structs have a different alignment than qcomp?!) 
- */ - - -__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr1 in) { - - // it's crucial we explicitly copy over the elements, - // rather than just reinterpret the pointer, to avoid - // segmentation faults when memory misaligns (like on HIP) - - // oh YES we must not cast statically created HIP arrays - // like within kernels?!?! - - - - // UMMMMMM is the above true?!?! - // Wen did I witness misalignment between std::complex and gpu_qcomp?! - - return {getGpuQcomp(in.elems[0]), getGpuQcomp(in.elems[1])}; -} - - -__host__ inline std::array unpackMatrixToGpuQcomps(DiagMatr2 in) { - - return { - getGpuQcomp(in.elems[0]), getGpuQcomp(in.elems[1]), - getGpuQcomp(in.elems[2]), getGpuQcomp(in.elems[3])}; -} - - -__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr1 in) { - - std::array out{}; - for (int i=0; i<4; i++) - out[i] = getGpuQcomp(in.elems[i/2][i%2]); - - return out; -} - - -__host__ inline std::array unpackMatrixToGpuQcomps(CompMatr2 in) { - - std::array out{}; - for (int i=0; i<16; i++) - out[i] = getGpuQcomp(in.elems[i/4][i%4]); - - return out; -} - - -#endif // GPU_TYPES_HPP \ No newline at end of file