diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 0950b7db..fd74c294 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,6 +28,7 @@ on: branches: - main - devel + - unifying-base-qcomp pull_request: branches: - main @@ -253,6 +254,11 @@ jobs: -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }} + -DCMAKE_CXX_FLAGS_DEBUG=${{ matrix.compiler == 'cl' && '/MP1' || '' }} + -DCMAKE_CXX_FLAGS_RELEASE=${{ matrix.compiler == 'cl' && '/MP1' || '' }} + + ### DEBUG: + ### above disables parallel compilation with MSVC # force 'Release' build (needed by MSVC to enable optimisations) - name: Compile diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d308795..dc209ca5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -469,71 +469,6 @@ set(COMPILE_HIP ${ENABLE_HIP}) -# ============================ -# Patch CPU performance -# ============================ - - -# Patch performance of CPU std::complex arithmetic operator overloads. -# The cpu_subroutines.cpp file makes extensive use of std::complex operator -# overloads, and alas these are significantly slower than hand-rolled -# arithmetic, due to their NaN and inf checks, and interference with SIMD. -# It is crucial to pass additional optimisation flags to this file to restore -# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory, -# we can achieve this with specific, relatively 'safe' flags such as LLVM's: -# -ffinite-math-only -fno-signed-zeros -ffp-contract=fast -# However, it is a nuisance to find equivalent flags for different compilers -# and monitor their performance vs accuracy trade-offs. So instead, we use the -# much more aggressive and ubiquitous -Ofast flag to guarantee performance. 
-# This introduces many potentially dangerous optimisations, such as asserting -# associativity of flops, which would break techniques like Kahan summation. -# The cpu_subroutines.cpp must ergo be very conscious of these optimisations. -# We here also explicitly inform the file cpu_subroutines.cpp whether or not -# we are passing the flags, so it can detect/error when flags are forgotten. - -if (CMAKE_BUILD_TYPE STREQUAL "Release") - - # Release build will pass -Ofast when known for the given compiler, and - # fallback to giving a performance warning and proceeding with compilation - - if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|HP|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang") - set(patch_flags "-Ofast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - elseif (CMAKE_CXX_COMPILER_ID MATCHES "HP") - set(patch_flags "+Ofast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - set(patch_flags "/fp:fast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - else() - message(WARNING - "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been " - "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, " - "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are " - "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help." - ) - set(patch_flags "") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") - endif() - -else() - - # Non-release builds (e.g. 
Debug) will pass no optimisation flags, and will - # communicate to cpu_subroutines.cpp that this is intentional via a macro - - set(patch_flags "") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") - -endif() - -set_source_files_properties( - quest/src/cpu/cpu_subroutines.cpp - PROPERTIES - COMPILE_FLAGS "${patch_flags} ${patch_macro}" -) - - - # ============================ # Pass files to library # ============================ diff --git a/quest/include/types.h b/quest/include/types.h index 066d35e9..f1f49315 100644 --- a/quest/include/types.h +++ b/quest/include/types.h @@ -158,9 +158,10 @@ static inline qcomp getQcomp(qreal re, qreal im) { // not the same precision as qcomp, so compilation will fail depending // on the setting of PRECISION. To avoid this, we'll define overloads // between all type/precision permutations, always returning qcomp. These - // overloads are also used by the QuEST source code. Via the unholy macros - // below, we create 312 overloads; no doubt this is going to break something - // in the future, for which I am already sorry :'( + // overloads are also used by the QuEST source code (though incidentally, + // not the high performance backend which uses custom complex overloads). + // Via the unholy macros below, we create 312 overloads; no doubt this is + // going to break something in the future, for which I am already sorry :'( /// @cond EXCLUDE_FROM_DOXYGEN diff --git a/quest/src/core/base_qcomp.hpp b/quest/src/core/base_qcomp.hpp new file mode 100644 index 00000000..522983c1 --- /dev/null +++ b/quest/src/core/base_qcomp.hpp @@ -0,0 +1,227 @@ +/** @file + * Definition of base_qcomp, which is extended by the CPU and GPU + * backends (into cpu_qcomp and gpu_qcomp) and used in hot loops + * and kernels. + * + * The user-facing qcomp (which in the QuEST middle-end, resolves to + * std::complex) is not used by the CPU backend, since it creates + * performance pitfalls (e.g. 
expensive NaN checks within arithmetic + * operators) in some compilers, and is furthermore illegal in the + * GPU backend (i.e. within CUDA kernels). So the backends instead + * use custom complex types with identical memory layouts/alignment + * to qcomp. Those types extend base_qcomp defined in this file, + * since they otherwise share all the same arithmetic boilerplate. + * + * @author Tyson Jones + */ + +#ifndef BASE_QCOMP_HPP +#define BASE_QCOMP_HPP + +#include "quest/include/types.h" + +#include "quest/src/core/inliner.hpp" + + + +/* + * BASE DEFINITION + * + * which must remain POD (a simple {re,im}) and with an identical + * memory layout and alignment to qcomp (i.e. std::complex). Only + * the in-place arithmetic overloads are defined below which are + * reused by the subsequent out-of-place overloads, to avoid + * code duplication. + */ + +struct base_qcomp { + + qreal re; + qreal im; + + + /* + * IN-PLACE COMPLEX ARITHMETIC + */ + + INLINE base_qcomp& operator += (const base_qcomp& a) noexcept { + re += a.re; + im += a.im; + return *this; + } + + INLINE base_qcomp& operator -= (const base_qcomp& a) noexcept { + re -= a.re; + im -= a.im; + return *this; + } + + INLINE base_qcomp& operator *= (const base_qcomp& a) noexcept { + qreal re_ = re; + qreal im_ = im; + re = (re_ * a.re) - (im_ * a.im); + im = (re_ * a.im) + (im_ * a.re); + return *this; + } + + + /* + * IN-PLACE MIXED-TYPE ARITHMETIC + */ + + INLINE base_qcomp& operator *= (const int& a) noexcept { + re *= a; + im *= a; + return *this; + } + + INLINE base_qcomp& operator *= (const qreal& a) noexcept { + re *= a; + im *= a; + return *this; + } + + INLINE base_qcomp& operator *= (const size_t& a) noexcept { + re *= a; + im *= a; + return *this; + } + +}; // base_qcomp + + + +/* + * OUT-OF-PLACE COMPLEX ARITHMETIC + * + * which avoid code duplication by re-using the + * in-place arithmetic operator overloads above + */ + +INLINE base_qcomp operator + (base_qcomp a, const base_qcomp& b) noexcept { 
+ a += b; + return a; +} + +INLINE base_qcomp operator - (base_qcomp a, const base_qcomp& b) noexcept { + a -= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const base_qcomp& b) noexcept { + a *= b; + return a; +} + + + +/* + * OUT-OF-PLACE MIXED-TYPE ARITHMETIC + * + * which avoid code duplication by re-using the + * in-place arithmetic operator overloads above + */ + + +// base_qcomp * other + +INLINE base_qcomp operator * (base_qcomp a, const int& b) noexcept { + a *= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const qreal& b) noexcept { + a *= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const size_t& b) noexcept { + a *= b; + return a; +} + + +// other * base_qcomp (via commutation) + +INLINE base_qcomp operator * (const int& a, const base_qcomp& b) noexcept { + return b * a; +} + +INLINE base_qcomp operator * (const qreal& a, const base_qcomp& b) noexcept { + return b * a; +} + +INLINE base_qcomp operator * (const size_t& a, const base_qcomp& b) noexcept { + return b * a; +} + + + +/* + * BACKEND-AGNOSTIC MATHS + */ + +INLINE qreal real(const base_qcomp& a) { + return a.re; +} + +INLINE qreal imag(const base_qcomp& a) { + return a.im; +} + +INLINE base_qcomp conj(const base_qcomp& a) { + return {a.re, - a.im}; +} + +INLINE qreal norm(const base_qcomp& a) noexcept { + return (a.re * a.re) + (a.im * a.im); +} + + + +/* + * CONVERTERS + */ + +INLINE base_qcomp* getBaseQcompPtr(qcomp* list) { + return reinterpret_cast<base_qcomp*>(list); +} + +INLINE base_qcomp getBaseQcomp(qreal re, qreal im) { + return { re, im }; +} + +INLINE base_qcomp getBaseQcomp(const qcomp& a) { + return { a.real(), a.imag() }; +} + +INLINE qcomp getQcomp(const base_qcomp& a) { + return qcomp( a.re, a.im ); +} + + + +/* + * CHECK COMPATIBILITY WITH QCOMP + */ + + +// check the memory layout of base_qcomp agrees with qcomp, since +// it is not formally guaranteed, unlike _Complex and std::complex +static_assert(sizeof (base_qcomp) == sizeof 
(qcomp)); +static_assert(alignof(base_qcomp) == alignof(qcomp)); +static_assert(std::is_standard_layout_v<base_qcomp>); +static_assert(std::is_trivially_copyable_v<base_qcomp>); + + +// TODO: +// the above checks are potentially inadequate to identify an +// insidious incompatibility between qcomp and base_qcomp - perhaps +// we should perform a compile-time duck-check, casting a small +// array between them and checking no data is corrupted? Perhaps +// a runtime check in initQuESTEnv() is also necessary, checking the +// casting is safe for all circumstances (e.g. heap mem, static lists) + + + +#endif // BASE_QCOMP_HPP \ No newline at end of file diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 6367e116..79884659 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -1,7 +1,12 @@ /** @file * Oerations used by all deployment modes for fast, * low-level maths, inlined and callable within hot - * loops (i.e OpenMP loops and CUDA kernels) + * loops (i.e OpenMP loops and CUDA kernels). + * + * Note this file uses the backend OpenMP/CUDA-agnostic + * base_qcomp, in lieu of the C++-user-facing qcomp (i.e. + * std::complex), to avoid its compiler-specific performance + * pitfalls. * * @author Tyson Jones */ @@ -15,27 +20,7 @@ #include "quest/src/core/inliner.hpp" #include "quest/src/core/bitwise.hpp" - - - -/* - * TYPE ALIASING - */ - - -// 'qcomp' cannot be used inside CUDA kernels/thrust, so must not appear in -// these inlined definitions. Instead, we create an alias which will resolve -// to 'qcomp' (defined in types.h) when parsed by the CPU backend, and 'cu_qcomp' -// (defined in gpu_types.cuh which is not explicitly resolved in this header) -// when parsed by the GPU backend, which will prior define USE_CU_QCOMP. It is -// essential this header is included after gpu_types.cuh is included by the -// GPU backend. Hacky, but avoids code duplication! 
- -#ifdef USE_CU_QCOMP - #define QCOMP_ALIAS cu_qcomp -#else - #define QCOMP_ALIAS qcomp -#endif +#include "quest/src/core/base_qcomp.hpp" @@ -121,7 +106,7 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ */ -INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { +INLINE base_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have @@ -133,13 +118,13 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // though opens the risk that the former caller erroneously has its upper // Paulis ignore. We forego this optimisation in defensive design, and // because this function is only invoked during data structure initilisation - // and ergo infrequently.s + // and ergo infrequently. // regrettably duplicated from paulis.cpp which is inaccessible here constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; - // QCOMP_ALIAS-agnostic literals - QCOMP_ALIAS p0, p1,n1, pI,nI; + // T-agnostic complex literals + base_qcomp p0, p1,n1, pI,nI; p0 = {0, 0}; // 0 p1 = {+1, 0}; // 1 n1 = {-1, 0}; // -1 @@ -152,20 +137,20 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // but this poses no real slowdown; this function, and its caller, are inlined // so these 16 amps are re-processed one for each full enumeration of the // PauliStrSum which is expected to have significantly more terms/coeffs - QCOMP_ALIAS matrices[][2][2] = { + base_qcomp matrices[][2][2] = { {{p1,p0},{p0,p1}}, // I {{p0,p1},{p1,p0}}, // X {{p0,nI},{pI,p0}}, // Y {{p1,p0},{p0,n1}}}; // Z - QCOMP_ALIAS elem = p1; // 1 + base_qcomp elem = p1; // 1 // could be compile-time unrolled into 32 iterations for (int t=0; t + + + +/* + * DEFINE CPU_QCOMP + * + * which is safe to typdef and define additional overloads + * below, since never 
witnessed outside the CPU Backend + */ + +typedef base_qcomp cpu_qcomp; + + + +/* + * CONVERTERS + * + * which merely wrap the base_qcomp functions for clarity + * in the CPU source code, disambiguating from gpu_qcomp + */ + +INLINE cpu_qcomp* getCpuQcompPtr(qcomp* ptr) { + return getBaseQcompPtr(ptr); +} + +INLINE cpu_qcomp getCpuQcomp(qreal re, qreal im) { + return getBaseQcomp(re, im); +} + +INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { + return getBaseQcomp(a); +} + +template +std::array,Dim> getCpuQcompsMatrix(qcomp matr[Dim][Dim]) { + + // Creator for fixed-size dense matrices CompMatr1 and CompMatr2, + // which are respectively 2x2 and 4x4 - deliberately not inlined! + // We create new cpu_qcomp in lieu of reinterpreting a 2D pointer + // in fear of alignment and static array nightmares + static_assert(Dim == 2 || Dim == 4); + + std::array,Dim> out; + + for (int i=0; i qubitInds, vecto // amplitudes are packed at an offset into the buffer qindex offset = getSubBufferSendInd(qureg); + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedQubitInds = util_getSorted(qubitInds); auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); @@ -240,7 +241,7 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubitInds, vecto qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask); // pack the potentially-strided amplitudes into a contiguous sub-buffer - qureg.cpuCommBuffer[offset + n] = qureg.cpuAmps[i]; + buffer[offset + n] = amps[i]; } // return the number of packed amps @@ -258,6 +259,10 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu // amplitudes are packed at an offset into the buffer qindex offset = getSubBufferSendInd(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n ctrls, vector c // each control qubit halves the number of iterations, each of which modifies 2 amplitudes, and skips 2 qindex numIts = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(ctrls, {targ2, targ1}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); @@ -305,7 +313,7 @@ void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex i01 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask); qindex i10 = flipTwoBits(i01, targ2, targ1); - std::swap(qureg.cpuAmps[i01], qureg.cpuAmps[i10]); + std::swap(amps[i01], amps[i10]); } } @@ -321,6 +329,10 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -337,7 +349,7 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex j = n + offset; // unpack the continuous sub-buffer among the strided local amplitudes - qureg.cpuAmps[i] = qureg.cpuCommBuffer[j]; + amps[i] = buffer[j]; } } @@ -353,6 +365,10 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); @@ -370,7 +386,7 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex j = n + offset; // unpack the continuous sub-buffer among the strided local amplitudes - qureg.cpuAmps[i] = qureg.cpuCommBuffer[j]; + amps[i] = buffer[j]; } } @@ 
-394,6 +410,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // each control qubit halves the needed iterations, and each iteration modifies two amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto elems = getCpuQcompsMatrix<2>(matr.elems); // MSVC requires explicit template param, bah! + auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -409,11 +429,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v qindex i1 = flipBit(i0, targ); // note the two amplitudes are likely strided and not adjacent (separated by 2^t) - qcomp amp0 = qureg.cpuAmps[i0]; - qcomp amp1 = qureg.cpuAmps[i1]; + cpu_qcomp amp0 = amps[i0]; + cpu_qcomp amp1 = amps[i1]; - qureg.cpuAmps[i0] = matr.elems[0][0]*amp0 + matr.elems[0][1]*amp1; - qureg.cpuAmps[i1] = matr.elems[1][0]*amp0 + matr.elems[1][1]*amp1; + amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; + amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; } } @@ -429,6 +449,12 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + cpu_qcomp f0 = getCpuQcomp(fac0); + cpu_qcomp f1 = getCpuQcomp(fac1); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -444,7 +470,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v // j = index of nth received amplitude from pair rank in buffer qindex j = n + offset; - qureg.cpuAmps[i] = fac0*qureg.cpuAmps[i] + fac1*qureg.cpuCommBuffer[j]; + amps[i] = f0*amps[i] + 
f1*buffer[j]; } } @@ -467,6 +493,10 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // each control qubit halves the needed iterations, and each iteration modifies four amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto elems = getCpuQcompsMatrix<4>(matr.elems); // MSVC requires explicit template param, bah! + auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); @@ -484,16 +514,16 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex i11 = flipBit(i01, targ2); // note amplitudes are not necessarily adjacent, nor uniformly spaced - qcomp amp00 = qureg.cpuAmps[i00]; - qcomp amp01 = qureg.cpuAmps[i01]; - qcomp amp10 = qureg.cpuAmps[i10]; - qcomp amp11 = qureg.cpuAmps[i11]; - - // amps[i_n] = sum_j elems[n][j] amp[i_n] - qureg.cpuAmps[i00] = matr.elems[0][0]*amp00 + matr.elems[0][1]*amp01 + matr.elems[0][2]*amp10 + matr.elems[0][3]*amp11; - qureg.cpuAmps[i01] = matr.elems[1][0]*amp00 + matr.elems[1][1]*amp01 + matr.elems[1][2]*amp10 + matr.elems[1][3]*amp11; - qureg.cpuAmps[i10] = matr.elems[2][0]*amp00 + matr.elems[2][1]*amp01 + matr.elems[2][2]*amp10 + matr.elems[2][3]*amp11; - qureg.cpuAmps[i11] = matr.elems[3][0]*amp00 + matr.elems[3][1]*amp01 + matr.elems[3][2]*amp10 + matr.elems[3][3]*amp11; + cpu_qcomp amp00 = amps[i00]; + cpu_qcomp amp01 = amps[i01]; + cpu_qcomp amp10 = amps[i10]; + cpu_qcomp amp11 = amps[i11]; + + // amps[i_n] = sum_j matr.elems[n][j] amp[i_n] + amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; + amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; + amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; + amps[i11] = elems[3][0]*amp00 + 
elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; } } @@ -534,6 +564,10 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // each control qubit halves iterations, each of which modifies 2^(targs.size()) amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElemsFlat); + // prepare a mask which yields ctrls in specified state, and targs in all-zero auto sortedQubits = util_getSorted(ctrls, targs); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); @@ -550,7 +584,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #pragma omp parallel if(qureg.isMultithreaded) { // create a private cache for every thread (might be compile-time sized, and in heap or stack) - vector cache(numTargAmps); + vector cache(numTargAmps); #pragma omp for for (qindex n=0; n ctrls, ve // i = nth local index where ctrls are active and targs form value j qindex i = setBits(i0, targs.data(), numTargBits, j); // loop may be unrolled - cache[j] = qureg.cpuAmps[i]; + cache[j] = amps[i]; } // modify each amplitude (loop might be unrolled) @@ -571,7 +605,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs.data(), numTargBits, k); // loop may be unrolled - qureg.cpuAmps[i] = 0; + amps[i] = getCpuQcomp(0, 0); // loop may be unrolled for (qindex j=0; j ctrls, ve l = fast_getMatrixFlatIndex(j, k, numTargAmps); else l = fast_getMatrixFlatIndex(k, j, numTargAmps); - - qcomp elem = matr.cpuElemsFlat[l]; + cpu_qcomp elem = elems[l]; // optionally conjugate matrix elems on the fly to avoid pre-modifying heap structure if constexpr (ApplyConj) - elem = std::conj(elem); + elem = conj(elem); - 
qureg.cpuAmps[i] += elem * cache[j]; + amps[i] += elem * cache[j]; /// @todo /// qureg.cpuAmps[i] is being serially updated by only this thread, /// so is a candidate for Kahan summation for improved numerical /// stability. Explore whether this is time-free and worthwhile! - /// - /// BEWARE that Kahan summation is incompatible with the optimisation - /// flags currently passed to this file } } } @@ -622,6 +652,18 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.elems); + + // TODO: + // I am paranoid the above casting from qcomp* to cpu_qcomp* is + // more dangerous for the static matr.elems than the heap pointer. + // Is it more vulnerable to misalignment? Check this! 
+ // + // If this IS the case, we simply prepare a new cpu_qcomp + // instance for each element, like the GPU backend does + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -638,7 +680,7 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode); int b = getBit(i, targ); - qureg.cpuAmps[j] *= matr.elems[b]; + amps[j] *= elems[b]; } } @@ -660,6 +702,18 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.elems); + + // TODO: + // I am paranoid the above casting from qcomp* to cpu_qcomp* is + // more dangerous for the static matr.elems than the heap pointer. + // Is it more vulnerable to misalignment? Check this! 
+ // + // If this IS the case, we simply prepare a new cpu_qcomp + // instance for each element, like the GPU backend does + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -676,7 +730,7 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode); int k = getTwoBits(i, targ2, targ1); - qureg.cpuAmps[j] *= matr.elems[k]; + amps[j] *= elems[k]; } } @@ -700,6 +754,12 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -718,7 +778,7 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // t = value of targeted bits, which may be in the prefix substate qindex t = getValueOfBits(i, targs.data(), numTargBits); - qcomp elem = matr.cpuElems[t]; + cpu_qcomp elem = elems[t]; // decide whether to power and conj at compile-time, to avoid branching in hot-loop. // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) @@ -726,13 +786,13 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // and negative, and the exponent is an integer. We tolerate this heightened error // because we have no reason to think matr is real (it's not constrained Hermitian). 
if constexpr (HasPower) - elem = std::pow(elem, exponent); + elem = pow(elem, expo); // cautiously conjugate AFTER exponentiation, else we must also conj exponent if constexpr (ApplyConj) - elem = std::conj(elem); + elem = conj(elem); - qureg.cpuAmps[j] *= elem; + amps[j] *= elem; } } @@ -762,10 +822,16 @@ void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // every iteration modifies one amp, using one element qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused + #pragma omp parallel for if(qureg.isMultithreaded||qureg.isMultithreaded) for (qindex n=0; n (matr * rho) or (matr^exponent * rho) if constexpr (ApplyLeft) { // i = global row of nth local amp qindex i = fast_getQuregGlobalRowFromFlatIndex(n, matr.numElems); - qcomp term = matr.cpuElems[i]; + cpu_qcomp term = elems[i]; // compile-time decide if applying power to avoid in-loop branching... // (beware that complex pow() is numerically unstable as detailed below) if constexpr (HasPower) - term = std::pow(term, exponent); + term = pow(term, expo); fac = term; } @@ -826,23 +898,23 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // j = global column corresponding to n qindex j = fast_getQuregGlobalColFromFlatIndex(m, matr.numElems); - qcomp term = matr.cpuElems[j]; + cpu_qcomp term = elems[j]; // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) // (by producing an unexpected non-zero imaginary component) when the base is real // and negative, and the exponent is an integer. We tolerate this heightened error // because we have no reason to think matr is real (it's not constrained Hermitian). 
if constexpr (HasPower) - term = std::pow(term, exponent); + term = pow(term, expo); // conj strictly after pow, to effect conj(matr^exponent) if constexpr (ConjRight) - term = std::conj(term); + term = conj(term); fac *= term; } - qureg.cpuAmps[n] *= fac; + amps[n] *= fac; } } @@ -866,8 +938,8 @@ template void cpu_densmatr_allTargDiagMatr_sub (Qure template INLINE void applyPauliUponAmpPair( - Qureg qureg, qindex v, qindex i0, int* indXY, int numXY, - qindex maskXY, qindex maskYZ, qcomp ampFac, qcomp pairAmpFac + cpu_qcomp* amps, qindex v, qindex i0, int* indXY, int numXY, + qindex maskXY, qindex maskYZ, cpu_qcomp ampFac, cpu_qcomp pairAmpFac ) { // this is a subroutine of cpu_statevector_anyCtrlPauliTensorOrGadget_subA() below // called in a hot-loop (hence it is here inlined) which exists because the caller @@ -885,14 +957,13 @@ INLINE void applyPauliUponAmpPair( int signB = fast_getPlusOrMinusMaskedBitParity(iB, maskYZ); // mix or swap scaled amp pair (where pairAmpFac includes Y's i factor) - qcomp ampA = qureg.cpuAmps[iA]; - qcomp ampB = qureg.cpuAmps[iB]; - qureg.cpuAmps[iA] = (ampFac * ampA) + (pairAmpFac * signB * ampB); - qureg.cpuAmps[iB] = (ampFac * ampB) + (pairAmpFac * signA * ampA); + cpu_qcomp ampA = amps[iA]; + cpu_qcomp ampB = amps[iB]; + amps[iA] = (ampFac * ampA) + (pairAmpFac * signB * ampB); + amps[iB] = (ampFac * ampB) + (pairAmpFac * signA * ampA); } - template void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( Qureg qureg, vector ctrls, vector ctrlStates, @@ -900,6 +971,14 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( ) { assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); assert_numTargsMatchesTemplateParam(x.size() + y.size(), NumTargs); + + // we will scale pairAmp below by i^numY, so that each amp need only choose the +-1 sign + pairAmpFac *= util_getPowerOfI(y.size()); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + 
cpu_qcomp f0 = getCpuQcomp(ampFac); + cpu_qcomp f1 = getCpuQcomp(pairAmpFac); // only X and Y count as targets vector sortedTargsXY = util_getSorted(util_getConcatenated(x, y)); @@ -912,9 +991,6 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( auto maskXY = util_getBitMask(sortedTargsXY); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - // we will scale pairAmp below by i^numY, so that each amp need only choose the +-1 sign - pairAmpFac *= util_getPowerOfI(y.size()); - // use template params to compile-time unroll loops in insertBits() and inner-loop below SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, sortedTargsXY.size()); @@ -950,7 +1026,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( // serial for (qindex v=0; v( - qureg, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, ampFac, pairAmpFac); + amps, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, f0, f1); } } else { @@ -965,7 +1041,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( #pragma omp parallel for for (qindex v=0; v( - qureg, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, ampFac, pairAmpFac); + amps, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, f0, f1); } } } @@ -978,8 +1054,17 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB( ) { assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); + // we will scale pairAmp by i^numY, so that each amp need only choose the +-1 sign + pairAmpFac *= util_getPowerOfI(y.size()); + // each control qubit halves the needed iterations qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + cpu_qcomp f0 = getCpuQcomp(ampFac); + cpu_qcomp f1 = getCpuQcomp(pairAmpFac); // received amplitudes may begin at 
an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); @@ -992,9 +1077,6 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB( // use template param to compile-time unroll loop in insertBits() SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); - // we will scale pairAmp by i^numY, so that each amp need only choose the +-1 sign - pairAmpFac *= util_getPowerOfI(y.size()); - #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n coeffs qindex numIts = outQureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + cpu_qcomp* inFacs = getCpuQcompPtr(coeffs.data()); + // use template param to compile-time unroll inner loop below SET_VAR_AT_COMPILE_TIME(int, numQuregs, NumQuregs, inQuregs.size()); @@ -1076,13 +1165,13 @@ void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs for (qindex n=0; n coeffs void cpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr) { qindex numIts = outQureg.numAmpsPerNode; - qcomp* out = outQureg.cpuAmps; - qcomp* in = inDensMatr.cpuAmps; + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + cpu_qcomp* inAmps = getCpuQcompPtr(inDensMatr.cpuAmps); #pragma omp parallel for if(outQureg.isMultithreaded) for (qindex n=0; n ta // each outer iteration sets one element of outQureg qindex numOuterIts = outQureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* inAmps = getCpuQcompPtr(inQureg.cpuAmps); + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + // targs and allTargs are sorted, but pairTargs is arbitrarily ordered (though corresponding targs) auto allTargsSorted = util_getSorted(targs, pairTargs); @@ -1750,7 +1895,7 @@ void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta qindex k = insertBits(n, allTargsSorted.data(), numAllTargs, 0); // loop 
may be unrolled // each outQureg amp results from summing 2^targs inQureg amps - qcomp outAmp = 0; + cpu_qcomp outAmp = getCpuQcomp(0,0); // loop may be unrolled for (qindex j=0; j ta i = setBits(i, targs .data(), numTargPairs, j); // loop may be unrolled i = setBits(i, pairTargs.data(), numTargPairs, j); // loop may be unrolled - outAmp += inQureg.cpuAmps[i]; + outAmp += inAmps[i]; } - outQureg.cpuAmps[n] = outAmp; + outAmps[n] = outAmp; } } @@ -1797,9 +1942,12 @@ qreal cpu_statevec_calcTotalProb_sub(Qureg qureg) { // every amp, iterated independently, contributes to the probability qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded) for (qindex n=0; n qubi // (>=1 since all qubits are in suffix, so qubits.size() <= suffix size) qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(qubits); // all in suffix auto qubitStateMask = util_getBitMask(qubits, outcomes); @@ -1862,7 +2016,7 @@ qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi // i = nth local index where qubits are in the specified outcome state qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask); - prob += std::norm(qureg.cpuAmps[i]); + prob += norm(amps[i]); } return prob; @@ -1884,6 +2038,9 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(qubits); // all in suffix, with corresponding bra's all in suffix auto qubitStateMask = 
util_getBitMask(qubits, outcomes); @@ -1899,7 +2056,7 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi // j = local, flat, density-matrix index of diagonal amp corresponding to state i qindex j = fast_getQuregLocalIndexOfDiagonalAmp(i, firstDiagInd, numAmpsPerCol); - prob += std::real(qureg.cpuAmps[j]); + prob += real(amps[j]); } return prob; @@ -1914,6 +2071,9 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu // every amp contributes to a statevector prob qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + // use template param to compile-time unroll loop in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); qindex numOutcomes = powerOf2(numBits); @@ -1930,7 +2090,7 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n or if constexpr (Conj) { - rhoAmp = std::conj(rhoAmp); - colAmp = std::conj(colAmp); + rhoAmp = conj(rhoAmp); + colAmp = conj(colAmp); } else - rowAmp = std::conj(rowAmp); + rowAmp = conj(rowAmp); - qcomp term = rhoAmp * rowAmp * colAmp; - fidRe += std::real(term); - fidIm += std::imag(term); + cpu_qcomp term = rhoAmp * rowAmp * colAmp; + fidRe += real(term); + fidIm += imag(term); } return qcomp(fidRe, fidIm); @@ -2095,11 +2270,14 @@ qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { qindex numIts = qureg.numAmpsPerNode; qindex targMask = util_getBitMask(targs); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + #pragma omp parallel for reduction(+:value) if(qureg.isMultithreaded) for (qindex n=0; n targs) { qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + 
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + qindex targMask = util_getBitMask(targs); #pragma omp parallel for reduction(+:valueRe,valueIm) if(qureg.isMultithreaded) @@ -2128,10 +2309,10 @@ qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { // r = global row of nth local diagonal, which determines amp sign qindex r = n + firstDiagInd; int sign = fast_getPlusOrMinusMaskedBitParity(r, targMask); - qcomp term = sign * qureg.cpuAmps[i]; + cpu_qcomp term = sign * amps[i]; - valueRe += std::real(term); - valueIm += std::imag(term); + valueRe += real(term); + valueIm += imag(term); } return qcomp(valueRe, valueIm); @@ -2147,6 +2328,9 @@ qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector x, vector x, vector qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + // these masks indicate global paulis (i.e. 
not just suffix) qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); @@ -2237,10 +2428,10 @@ qcomp cpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector // sign = +-1 induced by Y and Z (excludes Y's imaginary factors) int sign = fast_getPlusOrMinusMaskedBitParity(i, maskYZ); - qcomp term = sign * qureg.cpuAmps[m]; + cpu_qcomp term = sign * amps[m]; - valueRe += std::real(term); - valueIm += std::imag(term); + valueRe += real(term); + valueIm += imag(term); } // scale by i^numY (because sign above excluded i) @@ -2267,10 +2458,16 @@ qcomp cpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr // every amp, iterated independently, contributes to the expectation value qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused + #pragma omp parallel for reduction(+:valueRe,valueIm) if(qureg.isMultithreaded||matr.isMultithreaded) for (qindex n=0; n qubits, vecto // visit every amp, setting to zero or multiplying it by renorm qindex numIts = qureg.numAmpsPerNode; - qreal renorm = 1 / std::sqrt(prob); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // binary value of targeted qubits in basis states which are to be retained qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + qreal renorm = 1 / std::sqrt(prob); // use template param to compile-time unroll loop in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); @@ -2383,8 +2589,7 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto qindex val = getValueOfBits(n, qubits.data(), numBits); // multiply amp with renorm or zero, if qubit value matches or disagrees - qcomp
fac = renorm * (val == retainValue); - qureg.cpuAmps[n] *= fac; + amps[n] *= renorm * (val == retainValue); } } @@ -2401,10 +2606,13 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto // visit every amp, setting most to zero and multiplying the remainder by renorm qindex numIts = qureg.numAmpsPerNode; - qreal renorm = 1 / prob; + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // binary value of targeted qubits in basis states which are to be retained qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + qreal renorm = 1 / prob; // use template param to compile-time unroll loops in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); @@ -2423,8 +2631,7 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto qindex v2 = getValueOfBits(c, qubits.data(), numBits); // multiply amp with renorm or zero if values disagree with given outcomes - qcomp fac = renorm * (v1 == v2) * (retainValue == v1); - qureg.cpuAmps[n] *= fac; + amps[n] *= renorm * (v1 == v2) * (retainValue == v1); } } @@ -2453,7 +2660,7 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { qureg.numAmpsPerNode, numAmpsPerPage, cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); - std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); // no cpu_qcomp cast needed } } @@ -2468,7 +2675,7 @@ void cpu_statevec_initDebugState_sub(Qureg qureg) { // i = global index of nth local amp qindex i = concatenateBits(qureg.rank, n, qureg.logNumAmpsPerNode); - qureg.cpuAmps[n] = qcomp(2*i/10., (2*i+1)/10.); + qureg.cpuAmps[n] = qcomp(2*i/10., (2*i+1)/10.); // no cpu_qcomp cast needed } } @@ -2493,6 +2700,6 @@ void cpu_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qureg) #pragma omp for for (qindex i=0; i #include @@ -182,7 +182,7 @@ void 
cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< CUDA_CHECK( custatevecSwapIndexBits( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, targPairs, numTargPairs, // swap mask params seem to be in the reverse order to the remainder of the cuStateVec API @@ -199,7 +199,7 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< */ -void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, cu_qcomp* flatMatrElems, bool applyAdj) { +void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, gpu_qcomp* flatMatrElems, bool applyAdj) { // this function is called 'subA' instead of just 'sub', because it is also called in // the one-target case whereby it is strictly the embarrassingly parallel _subA scenario @@ -210,7 +210,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector CUDA_CHECK( custatevecApplyMatrix( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, flatMatrElems, CUQUANTUM_QCOMP, CUSTATEVEC_MATRIX_LAYOUT_ROW, applyAdj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -222,7 +222,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector // there is no bespoke cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subB() -void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, cu_qcomp* flatMatrElems, bool conj) { +void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, gpu_qcomp* flatMatrElems, bool conj) { // beware that despite diagonal matrices being embarrassingly parallel, // the target qubits must still all be
suffix-only to avoid a cuStateVec error @@ -239,7 +239,7 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl CUDA_CHECK( custatevecApplyGeneralizedPermutationMatrix( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, perm, flatMatrElems, CUQUANTUM_QCOMP, adj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -259,9 +259,9 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl void cuquantum_densmatr_oneQubitDephasing_subA(Qureg qureg, int qubit, qreal prob) { // effect the superoperator as a two-qubit diagonal upon a statevector suffix state - cu_qcomp a = {1, 0}; - cu_qcomp b = {1-2*prob, 0}; - cu_qcomp elems[] = {a, b, b, a}; + gpu_qcomp a = {1, 0}; + gpu_qcomp b = {1-2*prob, 0}; + gpu_qcomp elems[] = {a, b, b, a}; vector targs {qubit, util_getBraQubit(qubit,qureg)}; bool conj = false; @@ -275,8 +275,8 @@ void cuquantum_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal // equivalent to a state-controlled global phase upon a statevector, which is // itself a same-element one-qubit diagonal applied to any target int braBit = getBit(qureg.rank, ketQubit - qureg.logNumColsPerNode); - cu_qcomp fac = {1 - 2*prob, 0}; - cu_qcomp elems[] = {fac, fac}; + gpu_qcomp fac = {1 - 2*prob, 0}; + gpu_qcomp elems[] = {fac, fac}; // we choose to target the largest possible qubit, expecting best cuStateVec performance; // note it must still be a suffix qubit since cuQuantum does not know qureg is distributed @@ -296,9 +296,9 @@ void cuquantum_densmatr_twoQubitDephasing_subA(Qureg qureg, int qubitA, int qubi /// are the same, i.e. 
we skip |00><00|, |01><01|, |10><10|, |11><11| // effect the superoperator as a four-qubit diagonal upon a statevector suffix state - cu_qcomp a = {1, 0}; - cu_qcomp b = {1-4*prob/3, 0}; - cu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a}; + gpu_qcomp a = {1, 0}; + gpu_qcomp b = {1-4*prob/3, 0}; + gpu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a}; vector targs {qubitA, qubitB, util_getBraQubit(qubitA,qureg), util_getBraQubit(qubitB,qureg)}; bool conj = false; @@ -331,7 +331,7 @@ qreal cuquantum_statevec_calcTotalProb_sub(Qureg qureg) { CUDA_CHECK( custatevecAbs2SumOnZBasis( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &prob0, &prob1, &qubit, numQubits ) ); qreal total = prob0 + prob1; @@ -346,7 +346,7 @@ qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector(prob); @@ -367,7 +367,7 @@ void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qu CUDA_CHECK( custatevecAbs2SumArray( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outPtr, qubits.data(), qubits.size(), nullptr, nullptr, 0) ); // serially cast and copy output probabilities, if necessary @@ -409,7 +409,7 @@ qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vect CUDA_CHECK( custatevecComputeExpectationsOnPauliBasis( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &value, termPaulis, numTerms, termTargets, numPaulisPerTerm) ); return static_cast(value); @@ -432,7 +432,7 @@ void cuquantum_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, CUDA_CHECK( custatevecCollapseByBitString( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + 
getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outcomes.data(), qubits.data(), qubits.size(), prob) ); } diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e..30377850 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -22,12 +22,8 @@ #include "quest/include/types.h" #include "quest/src/core/bitwise.hpp" -#include "quest/src/gpu/gpu_types.cuh" - -// kernels/thrust must use cu_qcomp, never qcomp -#define USE_CU_QCOMP #include "quest/src/core/fastmath.hpp" -#undef USE_CU_QCOMP +#include "quest/src/gpu/gpu_qcomp.cuh" #if ! COMPILE_CUDA #error "A file being compiled somehow included gpu_kernels.hpp despite QuEST not being compiled in GPU-accelerated mode." @@ -93,7 +89,7 @@ __forceinline__ __device__ int cudaGetBitMaskParity(qindex mask) { template __global__ void kernel_statevec_packAmpsIntoBuffer( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* qubits, int numQubits, qindex qubitStateMask ) { GET_THREAD_IND(n, numThreads); @@ -110,7 +106,7 @@ __global__ void kernel_statevec_packAmpsIntoBuffer( __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int qubit1, int qubit2, int qubit3, int bit2 ) { GET_THREAD_IND(n, numThreads); @@ -132,7 +128,7 @@ __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( template __global__ void kernel_statevec_anyCtrlSwap_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int targ1, int targ2 ) { GET_THREAD_IND(n, numThreads); @@ -146,7 +142,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( qindex i10 = flipTwoBits(i01, targ2, targ1); // swap amps - cu_qcomp amp01 = amps[i01]; + gpu_qcomp amp01 = amps[i01]; amps[i01] = amps[i10]; amps[i10] = amp01; } @@ -154,7 
+150,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( template __global__ void kernel_statevec_anyCtrlSwap_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask ) { GET_THREAD_IND(n, numThreads); @@ -172,7 +168,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subB( template __global__ void kernel_statevec_anyCtrlSwap_subC( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlsAndTargMask ) { GET_THREAD_IND(n, numThreads); @@ -197,9 +193,9 @@ __global__ void kernel_statevec_anyCtrlSwap_subC( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ, - cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11 + gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m10, gpu_qcomp m11 ) { GET_THREAD_IND(n, numThreads); @@ -211,8 +207,8 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ - cu_qcomp amp0 = amps[i0]; - cu_qcomp amp1 = amps[i1]; + gpu_qcomp amp0 = amps[i0]; + gpu_qcomp amp1 = amps[i1]; amps[i0] = m00*amp0 + m01*amp1; amps[i1] = m10*amp0 + m11*amp1; @@ -221,9 +217,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, - cu_qcomp fac0, cu_qcomp fac1 + gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); @@ -246,12 +242,12 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( template __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( - cu_qcomp* amps, qindex 
numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, - cu_qcomp m00, cu_qcomp m01, cu_qcomp m02, cu_qcomp m03, - cu_qcomp m10, cu_qcomp m11, cu_qcomp m12, cu_qcomp m13, - cu_qcomp m20, cu_qcomp m21, cu_qcomp m22, cu_qcomp m23, - cu_qcomp m30, cu_qcomp m31, cu_qcomp m32, cu_qcomp m33 + gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m02, gpu_qcomp m03, + gpu_qcomp m10, gpu_qcomp m11, gpu_qcomp m12, gpu_qcomp m13, + gpu_qcomp m20, gpu_qcomp m21, gpu_qcomp m22, gpu_qcomp m23, + gpu_qcomp m30, gpu_qcomp m31, gpu_qcomp m32, gpu_qcomp m33 ) { GET_THREAD_IND(n, numThreads); @@ -265,10 +261,10 @@ __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( qindex i11 = flipBit(i01, targ2); // note amps00 and amps01 are strided by 2^targ1, and amps00 and amps10 are strided by 2^targ2 - cu_qcomp amp00 = amps[i00]; - cu_qcomp amp01 = amps[i01]; - cu_qcomp amp10 = amps[i10]; - cu_qcomp amp11 = amps[i11]; + gpu_qcomp amp00 = amps[i00]; + gpu_qcomp amp01 = amps[i01]; + gpu_qcomp amp10 = amps[i10]; + gpu_qcomp amp11 = amps[i11]; // amps[i_n] = sum_j elems[n][j] amp[i_n] amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11; @@ -295,9 +291,9 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr template __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, - cu_qcomp* flatMatrElems + gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(n, numThreads); @@ -310,7 +306,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // spill to local memory). 
Hence, this _subA() function is not a subroutine // despite some logic being common to non-compile-time _subB(), and hence // why the loops below are explicitly compile-time unrolled - REGISTER cu_qcomp privateCache[1 << NumTargs]; + REGISTER gpu_qcomp privateCache[1 << NumTargs]; // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1) SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); @@ -335,7 +331,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs, NumTargs, k); // loop will be unrolled - amps[i] = getCuQcomp(0, 0); + amps[i] = getGpuQcomp(0, 0); // force unroll to ensure compile-time cache indices #pragma unroll @@ -349,12 +345,12 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( h = fast_getMatrixFlatIndex(k, l, numTargAmps); // optionally conjugate matrix elem - cu_qcomp elem = flatMatrElems[h]; + gpu_qcomp elem = flatMatrElems[h]; if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); // thread-private cache is accessed with compile-time known index - amps[i] = amps[i] + (elem * privateCache[l]); + amps[i] += elem * privateCache[l]; } } } @@ -362,11 +358,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( template __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( - cu_qcomp* globalCache, - cu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, + gpu_qcomp* globalCache, + gpu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, int numTargBits, qindex numTargAmps, - cu_qcomp* flatMatrElems + gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(t, numThreads); @@ -398,7 +394,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs, numTargBits, k); // loop may be unrolled - amps[i] = getCuQcomp(0, 0); + 
amps[i] = getGpuQcomp(0, 0); for (qindex l=0; l __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int targ, - cu_qcomp m1, cu_qcomp m2 + gpu_qcomp m1, gpu_qcomp m2 ) { GET_THREAD_IND(n, numThreads); @@ -462,7 +458,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( qindex i = concatenateBits(rank, j, logNumAmpsPerNode); int b = getBit(i, targ); - amps[j] = amps[j] * (m1 + b * (m2 - m1)); + amps[j] *= m1 + b * (m2 - m1); } @@ -474,9 +470,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, - cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4 + gpu_qcomp m1, gpu_qcomp m2, gpu_qcomp m3, gpu_qcomp m4 ) { GET_THREAD_IND(n, numThreads); @@ -502,8 +498,8 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( // k = local elem index int k = getTwoBits(i, targ2, targ1); - cu_qcomp elems[] = {m1, m2, m3, m4}; - amps[j] = amps[j] * elems[k]; + gpu_qcomp elems[] = {m1, m2, m3, m4}; + amps[j] *= elems[k]; } @@ -515,9 +511,9 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, - cu_qcomp* elems, cu_qcomp exponent + gpu_qcomp* elems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -545,15 +541,15 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( // t 
= value of targeted bits, which may be in the prefix substate qindex t = getValueOfBits(i, targs, numTargBits); - cu_qcomp elem = elems[t]; + gpu_qcomp elem = elems[t]; if constexpr (HasPower) - elem = getCompPower(elem, exponent); + elem = pow(elem, exponent); if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); - amps[j] = amps[j] * elem; + amps[j] *= elem; } @@ -565,20 +561,20 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( template __global__ void kernel_densmatr_allTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - cu_qcomp* elems, qindex numElems, cu_qcomp exponent + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* elems, qindex numElems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); - cu_qcomp fac = getCuQcomp(1, 0); + gpu_qcomp fac = getGpuQcomp(1, 0); if constexpr (ApplyLeft) { qindex i = fast_getQuregGlobalRowFromFlatIndex(n, numElems); - cu_qcomp term = elems[i]; + gpu_qcomp term = elems[i]; if constexpr (HasPower) - term = getCompPower(term, exponent); + term = pow(term, exponent); fac = term; } @@ -587,18 +583,18 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( qindex m = concatenateBits(rank, n, logNumAmpsPerNode); qindex j = fast_getQuregGlobalColFromFlatIndex(m, numElems); - cu_qcomp term = elems[j]; + gpu_qcomp term = elems[j]; if constexpr (HasPower) - term = getCompPower(term, exponent); + term = pow(term, exponent); if constexpr (ConjRight) - term.y *= -1; + term = conj(term); - fac = fac * term; + fac *= term; } - amps[n] = amps[n] * fac; + amps[n] *= fac; } @@ -610,10 +606,10 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsStateMask, int* targsXY, int numXY, qindex maskXY, qindex maskYZ, - cu_qcomp powI, cu_qcomp 
ampFac, cu_qcomp pairAmpFac + gpu_qcomp powI, gpu_qcomp ampFac, gpu_qcomp pairAmpFac ) { GET_THREAD_IND(t, numThreads); @@ -636,11 +632,11 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( // determine whether to multiply amps by +-1 or +-i int parA = cudaGetBitMaskParity(iA & maskYZ); int parB = cudaGetBitMaskParity(iB & maskYZ); - cu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA); - cu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB); + gpu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA); + gpu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB); - cu_qcomp ampA = amps[iA]; - cu_qcomp ampB = amps[iB]; + gpu_qcomp ampA = amps[iA]; + gpu_qcomp ampB = amps[iB]; // mix or swap scaled amp pair amps[iA] = (ampFac * ampA) + (pairAmpFac * coeffB * ampB); @@ -650,10 +646,10 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, qindex maskXY, qindex maskYZ, qindex bufferMaskXY, - cu_qcomp powI, cu_qcomp thisAmpFac, cu_qcomp otherAmpFac + gpu_qcomp powI, gpu_qcomp thisAmpFac, gpu_qcomp otherAmpFac ) { GET_THREAD_IND(n, numThreads); @@ -671,7 +667,7 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( // determine whether to multiply buffer amp by +-1 or +-i int par = cudaGetBitMaskParity(k & maskYZ); - cu_qcomp coeff = powI * fast_getPlusOrMinusOne(par); + gpu_qcomp coeff = powI * fast_getPlusOrMinusOne(par); amps[i] = (thisAmpFac * amps[i]) + (otherAmpFac * coeff * buffer[j]); } @@ -685,9 +681,9 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( template __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, qindex 
targMask, - cu_qcomp fac0, cu_qcomp fac1 + gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); @@ -700,8 +696,8 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( // apply phase to amp depending on parity of targets in global index int p = cudaGetBitMaskParity(i & targMask); - cu_qcomp facs[] = {fac0, fac1}; - amps[i] = amps[i] * facs[p]; + gpu_qcomp facs[] = {fac0, fac1}; + amps[i] *= facs[p]; } @@ -713,18 +709,18 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( template __global__ void kernel_statevec_setQuregToWeightedSum_sub( - cu_qcomp* outAmps, qindex numThreads, - cu_qcomp* coeffs, cu_qcomp** inAmps, int numQuregs + gpu_qcomp* outAmps, qindex numThreads, + gpu_qcomp* coeffs, gpu_qcomp** inAmps, int numQuregs ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll below loop SET_VAR_AT_COMPILE_TIME(int, numInner, NumQuregs, numQuregs); - cu_qcomp amp = getCuQcomp(0, 0); + gpu_qcomp amp = getGpuQcomp(0, 0); for (int q=0; q __global__ void kernel_densmatr_partialTrace_sub( - cu_qcomp* ampsIn, cu_qcomp* ampsOut, qindex numThreads, + gpu_qcomp* ampsIn, gpu_qcomp* ampsOut, qindex numThreads, int* ketTargs, int* pairTargs, int* allTargs, int numKetTargs ) { GET_THREAD_IND(n, numThreads); @@ -1142,7 +1138,7 @@ __global__ void kernel_densmatr_partialTrace_sub( SET_VAR_AT_COMPILE_TIME(int, numTargPairs, NumTargs, numKetTargs); // may be inferred at compile-time - int numAllTargs = 2*numTargPairs; + int numAllTargs = 2 * numTargPairs; qindex numIts = powerOf2(numTargPairs); /// @todo @@ -1154,7 +1150,7 @@ __global__ void kernel_densmatr_partialTrace_sub( qindex k = insertBits(n, allTargs, numAllTargs, 0); // loop may be unrolled // each outQureg amp results from summing 2^targs inQureg amps - cu_qcomp outAmp = getCuQcomp(0, 0); + gpu_qcomp outAmp = getGpuQcomp(0, 0); // loop may be unrolled for (qindex j=0; j __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( - qreal* 
outProbs, cu_qcomp* amps, qindex numThreads, + qreal* outProbs, gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* qubits, int numQubits ) { @@ -1194,7 +1190,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( // use template param to compile-time unroll below loops SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); - qreal prob = getCompNorm(amps[n]); + qreal prob = norm(amps[n]); // i = global index corresponding to n qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -1208,7 +1204,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( template __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( - qreal* outProbs, cu_qcomp* amps, qindex numThreads, + qreal* outProbs, gpu_qcomp* amps, qindex numThreads, qindex firstDiagInd, qindex numAmpsPerCol, int rank, qindex logNumAmpsPerNode, int* qubits, int numQubits @@ -1220,7 +1216,7 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( // i = index of nth local diagonal elem qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); - qreal prob = getCompReal(amps[i]); + qreal prob = real(amps[i]); // j = global index of i qindex j = concatenateBits(rank, i, logNumAmpsPerNode); diff --git a/quest/src/gpu/gpu_qcomp.cuh b/quest/src/gpu/gpu_qcomp.cuh new file mode 100644 index 00000000..b7d48e13 --- /dev/null +++ b/quest/src/gpu/gpu_qcomp.cuh @@ -0,0 +1,131 @@ +/** @file + * Definition of gpu_qcomp, an extension of base_qcomp and a + * compatible alternative to the user-facing qcomp, used + * exclusively by the GPU backend and which is compatible with + * both CUDA and HIP. + * + * This header is safe to re-include by multiple files because typedef + * redefinition is legal in C++, and all functions herein are inline. + * Furthermore, since it is only ever parsed by nvcc, the __host__ symbols + * are safely processed by other nvcc-only GPU files, like the cuquantum backend. 
+ * + * @author Tyson Jones + * @author Oliver Brown (patched former HIP arithmetic overloads) + * @author Erich Essmann (patched former ROCm build issues) + */ + +#ifndef GPU_QCOMP_CUH +#define GPU_QCOMP_CUH + +#include "quest/include/config.h" +#include "quest/include/types.h" +#include "quest/include/precision.h" + +#include "quest/src/core/inliner.hpp" +#include "quest/src/core/base_qcomp.hpp" + +#if ! COMPILE_CUDA + #error "A file being compiled somehow included gpu_qcomp.hpp despite QuEST not being compiled in GPU-accelerated mode." +#endif + +#if (FLOAT_PRECISION == 4) + #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." +#endif + +#if defined(__HIP__) + #include "quest/src/gpu/cuda_to_hip.hpp" +#endif + +#include + + + +/* + * DEFINE GPU_QCOMP + * + * which is safe to typdef and define additional overloads + * below, since never witnessed outside the GPU Backend + */ + +typedef base_qcomp gpu_qcomp; + + + +/* + * CONVERTERS + * + * which merely wrap the base_qcomp functions for clarity + * in the GPU source code, disambiguating from cpu_qcomp + */ + +INLINE gpu_qcomp* getGpuQcompPtr(qcomp* list) { + return getBaseQcompPtr(list); +} + +INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { + return getBaseQcomp(re, im); +} + +INLINE gpu_qcomp getGpuQcomp(const qcomp& a) { + return getBaseQcomp(a); +} + +template +__host__ inline std::array getGpuQcompArray(qcomp matr[Dim]) { + static_assert(Dim == 2 || Dim == 4); + + // it's crucial we explicitly copy over the elements, + // rather than just reinterpret the pointer, to avoid + // segmentation faults when memory misaligns (like on HIP) + + std::array out; + for (int i=0; i +__host__ inline std::array getFlattenedGpuQcompMatrix(qcomp matr[Dim][Dim]) { + static_assert(Dim == 2 || Dim == 4); + + std::array out; + for (int i=0; i qubits, vector <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), 
getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask ); @@ -173,7 +173,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu qindex sendInd = getSubBufferSendInd(qureg); kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, qubit1, qubit2, qubit3, bit2 ); @@ -214,7 +214,7 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); kernel_statevec_anyCtrlSwap_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 ); @@ -239,7 +239,7 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlSwap_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask ); @@ -264,7 +264,7 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); kernel_statevec_anyCtrlSwap_subC <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask ); @@ -293,7 +293,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToCuQcomps(matr); + auto arr = getFlattenedGpuQcompMatrix<2>(matr.elems); // 
explicit template for MSVC, grr! cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -304,10 +304,10 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); - auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); + auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -333,9 +333,9 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, - toCuQcomp(fac0), toCuQcomp(fac1) + getGpuQcomp(fac0), getGpuQcomp(fac1) ); #else @@ -362,7 +362,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToCuQcomps(matr); + auto arr = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -374,10 +374,10 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) 
- auto m = unpackMatrixToCuQcomps(matr); + auto m = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15] @@ -405,7 +405,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM - auto matrElemsPtr = toCuQcomps(matr.gpuElemsFlat); + auto matrElemsPtr = getGpuQcompPtr(matr.gpuElemsFlat); auto matrElemsLen = matr.numRows * matr.numRows; // assert the pre-condition assumed below @@ -437,8 +437,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); // unpacking args (to better distinguish below signatures) - auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto matrPtr = toCuQcomps(matr.gpuElemsFlat); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto matrPtr = getGpuQcompPtr(matr.gpuElemsFlat); auto qubitsPtr = getPtr(deviceQubits); auto targsPtr = getPtr(deviceTargs); auto nCtrls = ctrls.size(); @@ -506,7 +506,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve kernel_statevec_anyCtrlManyTargDenseMatr <<>> ( - toCuQcomps(cache), + getGpuQcompPtr(cache), ampsPtr, numThreads, numBatchesPerThread, qubitsPtr, nCtrls, qubitStateMask, targsPtr, targs.size(), powerOf2(targs.size()), matrPtr @@ -550,7 +550,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU .elems array directly to cuQuantum which will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, toCuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, 
ctrls, ctrlStates, {targ}, getGpuQcompPtr(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -570,10 +570,10 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToCuQcomps(matr); + auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -618,7 +618,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU array directly to cuQuantum, and it will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, toCuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, getGpuQcompPtr(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -638,10 +638,10 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToCuQcomps(matr); + auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr! 
kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -686,7 +686,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // cuQuantum cannot handle HasPower, in which case we fall back to custom kernel if (!HasPower && util_areAllQubitsInSuffix(targs, qureg)) { - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, toCuQcomps(util_getGpuMemPtr(matr)), ApplyConj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, getGpuQcompPtr(util_getGpuMemPtr(matr)), ApplyConj); // must return to avoid re-simulation below return; @@ -709,9 +709,9 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), - toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) + getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent) ); // must return to avoid runtime error below @@ -743,7 +743,7 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // we always use Thrust because we are doubtful that cuQuantum's // diagonal-matrix facilities are optimised for the all-qubit case - thrust_statevec_allTargDiagMatr_sub(qureg, matr, toCuQcomp(exponent)); + thrust_statevec_allTargDiagMatr_sub(qureg, matr, getGpuQcomp(exponent)); #else error_gpuSimButGpuNotCompiled(); @@ -764,8 +764,8 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr 
matr, qcomp kernel_densmatr_allTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent) + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(util_getGpuMemPtr(matr)), matr.numElems, getGpuQcomp(exponent) ); #else @@ -823,10 +823,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly qindex numBlocks = getNumBlocks(numThreads); kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, getPtr(deviceTargs), deviceTargs.size(), - maskXY, maskYZ, toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) + maskXY, maskYZ, getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -854,10 +854,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, maskXY, maskYZ, bufferMaskXY, - toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) + getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -891,9 +891,9 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c qindex targMask = util_getBitMask(targs); kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, - toCuQcomp(fac0), toCuQcomp(fac1) 
+ getGpuQcomp(fac0), getGpuQcomp(fac1) ); #else @@ -920,17 +920,17 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs qindex numBlocks = getNumBlocks(numThreads); // extract amp ptrs from qureg list - vector ptrs; + vector ptrs; ptrs.reserve(inQuregs.size()); for (auto& qureg : inQuregs) - ptrs.push_back(toCuQcomps(qureg.gpuAmps)); + ptrs.push_back(getGpuQcompPtr(qureg.gpuAmps)); // copy coeff and qureg lists into GPU memory - devcuqcompptrs devQuregAmps = ptrs; + devgpuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; kernel_statevec_setQuregToWeightedSum_sub <<>> ( - toCuQcomps(outQureg.gpuAmps), numThreads, + getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size() ); @@ -960,7 +960,7 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subB <<>> ( - outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps), + outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(inQureg.gpuAmps), numThreads, inQureg.numAmps ); @@ -978,7 +978,7 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subC <<>> ( - outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer), + outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(outQureg.gpuCommBuffer), numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode ); @@ -1013,7 +1013,7 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) int braQubit = util_getBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac ); #else @@ -1039,7 +1039,7 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg 
qureg, int ketQubit, qreal prob) int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, fac ); #else @@ -1085,7 +1085,7 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi int braQubitB = util_getBraQubit(ketQubitB, qureg); kernel_densmatr_twoQubitDephasing_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols ketQubitA, ketQubitB, braQubitA, braQubitB, term ); @@ -1112,7 +1112,7 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 ); #else @@ -1133,7 +1133,7 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3 ); @@ -1161,7 +1161,7 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, c3 ); @@ -1186,7 +1186,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg 
qureg, int ketQb1, int ketQb2, qreal altc1 = factors.c1 - factors.c2; kernel_densmatr_twoQubitDepolarising_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2 ); @@ -1208,7 +1208,7 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subC <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3 ); @@ -1231,7 +1231,7 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, auto factors = util_getTwoQubitDepolarisingFactors(prob); kernel_densmatr_twoQubitDepolarising_subD <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); @@ -1256,7 +1256,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal fac1 = factors.c1 - fac0; kernel_densmatr_twoQubitDepolarising_subE <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1 ); @@ -1279,7 +1279,7 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2; kernel_densmatr_twoQubitDepolarising_subF <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braBit1, braBit2, c2 ); @@ -1306,7 +1306,7 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); kernel_densmatr_oneQubitPauliChannel_subA <<>> ( - 
toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1328,7 +1328,7 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); kernel_densmatr_oneQubitPauliChannel_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1355,7 +1355,7 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { auto factors = util_getOneQubitDampingFactors(prob); kernel_densmatr_oneQubitDamping_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, prob, factors.c1, factors.c2 ); @@ -1375,7 +1375,7 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { auto c2 = util_getOneQubitDampingFactors(prob).c2; kernel_densmatr_oneQubitDamping_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qubit, c2 + getGpuQcompPtr(qureg.gpuAmps), numThreads, qubit, c2 ); #else @@ -1395,7 +1395,7 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { auto c1 = util_getOneQubitDampingFactors(prob).c1; kernel_densmatr_oneQubitDamping_subC <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 ); #else @@ -1413,7 +1413,7 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { qindex recvInd = getBufferRecvInd(); kernel_densmatr_oneQubitDamping_subD <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, qubit, prob ); @@ -1444,7 
+1444,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta devints devAllTargs = util_getSorted(targs, pairTargs); kernel_densmatr_partialTrace_sub <<>> ( - toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads, + getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() ); @@ -1564,7 +1564,7 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() ); @@ -1601,7 +1601,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toCuQcomps(qureg.gpuAmps), + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() @@ -1633,8 +1633,8 @@ qcomp gpu_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); - return toQcomp(prod); + gpu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); + return getQcomp(prod); #else error_gpuSimButGpuNotCompiled(); @@ -1661,8 +1661,8 @@ qcomp gpu_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); - return toQcomp(fid); + gpu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); + return getQcomp(fid); #else 
error_gpuSimButGpuNotCompiled(); @@ -1702,8 +1702,8 @@ qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); - return toQcomp(value); + gpu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1720,8 +1720,8 @@ qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); - return toQcomp(value); + gpu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1770,9 +1770,9 @@ qcomp gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp expo = toCuQcomp(exponent); - cu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); - return toQcomp(value); + gpu_qcomp expo = getGpuQcomp(exponent); + gpu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1787,9 +1787,9 @@ qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp expo = toCuQcomp(exponent); - cu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); - return toQcomp(value); + gpu_qcomp expo = getGpuQcomp(exponent); + gpu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1866,7 +1866,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector void gpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { #if COMPILE_CUDA || COMPILE_CUQUANTUM - 
thrust_statevec_initUniformState(qureg, toCuQcomp(amp)); + thrust_statevec_initUniformState(qureg, getGpuQcomp(amp)); #else error_gpuSimButGpuNotCompiled(); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 3b653dfd..07a65054 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -33,18 +33,14 @@ #include "quest/include/paulis.h" #include "quest/include/matrices.h" -#include "quest/src/gpu/gpu_types.cuh" +#include "quest/src/gpu/gpu_qcomp.cuh" #include "quest/src/core/errors.hpp" #include "quest/src/core/bitwise.hpp" #include "quest/src/core/constants.hpp" #include "quest/src/core/utilities.hpp" #include "quest/src/core/randomiser.hpp" -#include "quest/src/comm/comm_config.hpp" - -// kernels/thrust must use cu_qcomp, never qcomp -#define USE_CU_QCOMP #include "quest/src/core/fastmath.hpp" -#undef USE_CU_QCOMP +#include "quest/src/comm/comm_config.hpp" #include #include @@ -68,7 +64,7 @@ * copy constructor (devicevec d_vec = hostvec). 
The pointer * to the data (d_vec.data()) can be cast into a raw pointer * and passed directly to CUDA kernels (though qcomp must be - * reinterpreted to cu_qcomp) + * reinterpreted to gpu_qcomp) */ @@ -110,18 +106,18 @@ devreals getDeviceRealsVec(qindex dim) { using devcomps = thrust::device_vector; -cu_qcomp* getPtr(devcomps& comps) { +gpu_qcomp* getPtr(devcomps& comps) { - // devcomps -> qcomp -> cu_qcomp + // devcomps -> qcomp -> gpu_qcomp qcomp* ptr = thrust::raw_pointer_cast(comps.data()); - return toCuQcomps(ptr); + return getGpuQcompPtr(ptr); } // father forgive me for I have sinned -using devcuqcompptrs = thrust::device_vector; +using devgpuqcompptrs = thrust::device_vector; -cu_qcomp** getPtr(devcuqcompptrs& ptrs) { +gpu_qcomp** getPtr(devgpuqcompptrs& ptrs) { return thrust::raw_pointer_cast(ptrs.data()); } @@ -136,13 +132,13 @@ cu_qcomp** getPtr(devcuqcompptrs& ptrs) { */ -thrust::device_ptr getStartPtr(cu_qcomp* amps) { +thrust::device_ptr getStartPtr(gpu_qcomp* amps) { return thrust::device_pointer_cast(amps); } auto getStartPtr(qcomp* amps) { - return getStartPtr(toCuQcomps(amps)); + return getStartPtr(getGpuQcompPtr(amps)); } @@ -177,37 +173,37 @@ auto getEndPtr(FullStateDiagMatr matr) { struct functor_getAmpConj { - __host__ __device__ cu_qcomp operator()(cu_qcomp amp) { - return getCompConj(amp); + __host__ __device__ gpu_qcomp operator()(gpu_qcomp amp) { + return conj(amp); } }; struct functor_getAmpNorm { - __host__ __device__ qreal operator()(cu_qcomp amp) { - return getCompNorm(amp); + __host__ __device__ qreal operator()(gpu_qcomp amp) { + return norm(amp); } }; struct functor_getAmpReal { - __host__ __device__ qreal operator()(cu_qcomp amp) { - return getCompReal(amp); + __host__ __device__ qreal operator()(gpu_qcomp amp) { + return real(amp); } }; struct functor_getAmpConjProd { - __host__ __device__ cu_qcomp operator()(cu_qcomp braAmp, cu_qcomp ketAmp) { - return getCompConj(braAmp) * ketAmp; + __host__ __device__ gpu_qcomp 
operator()(gpu_qcomp braAmp, gpu_qcomp ketAmp) { + return conj(braAmp) * ketAmp; } }; struct functor_getNormOfAmpDif { - __host__ __device__ qreal operator()(cu_qcomp amp1, cu_qcomp amp2) { - return getCompNorm(amp1 - amp2); + __host__ __device__ qreal operator()(gpu_qcomp amp1, gpu_qcomp amp2) { + return norm(amp1 - amp2); } }; @@ -220,11 +216,11 @@ struct functor_getExpecStateVecZTerm { qindex targMask; functor_getExpecStateVecZTerm(qindex mask) : targMask(mask) {} - __device__ qreal operator()(qindex ind, cu_qcomp amp) { + __device__ qreal operator()(qindex ind, gpu_qcomp amp) { int par = cudaGetBitMaskParity(ind & targMask); // device-only int sign = fast_getPlusOrMinusOne(par); - return sign * getCompNorm(amp); + return sign * norm(amp); } }; @@ -235,12 +231,12 @@ struct functor_getExpecDensMatrZTerm { // in the expectation value of Z of a density matrix qindex numAmpsPerCol, firstDiagInd, targMask; - cu_qcomp* amps; + gpu_qcomp* amps; - functor_getExpecDensMatrZTerm(qindex dim, qindex diagInd, qindex mask, cu_qcomp* _amps) : + functor_getExpecDensMatrZTerm(qindex dim, qindex diagInd, qindex mask, gpu_qcomp* _amps) : numAmpsPerCol(dim), firstDiagInd(diagInd), targMask(mask), amps(_amps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); qindex r = n + firstDiagInd; @@ -259,19 +255,19 @@ struct functor_getExpecStateVecPauliTerm { // at least one X or Y) of a statevector qindex maskXY, maskYZ; - cu_qcomp *amps, *pairAmps; + gpu_qcomp *amps, *pairAmps; - functor_getExpecStateVecPauliTerm(qindex _maskXY, qindex _maskYZ, cu_qcomp* _amps, cu_qcomp* _pairAmps) : + functor_getExpecStateVecPauliTerm(qindex _maskXY, qindex _maskYZ, gpu_qcomp* _amps, gpu_qcomp* _pairAmps) : maskXY(_maskXY), maskYZ(_maskYZ), amps(_amps), pairAmps(_pairAmps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex j = 
flipBits(n, maskXY); int par = cudaGetBitMaskParity(j & maskYZ); // device-only int sign = fast_getPlusOrMinusOne(par); // sign excludes i^numY contribution - return sign * getCompConj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer + return sign * conj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer } }; @@ -284,12 +280,12 @@ struct functor_getExpecDensMatrPauliTerm { qindex maskXY, maskYZ; qindex numAmpsPerCol, firstDiagInd; - cu_qcomp *amps; + gpu_qcomp *amps; - functor_getExpecDensMatrPauliTerm(qindex _maskXY, qindex _maskYZ, qindex _numAmpsPerCol, qindex _firstDiagInd, cu_qcomp* _amps) : + functor_getExpecDensMatrPauliTerm(qindex _maskXY, qindex _maskYZ, qindex _numAmpsPerCol, qindex _firstDiagInd, gpu_qcomp* _amps) : maskXY(_maskXY), maskYZ(_maskYZ), numAmpsPerCol(_numAmpsPerCol), firstDiagInd(_firstDiagInd), amps(_amps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex r = n + firstDiagInd; qindex i = flipBits(r, maskXY); @@ -310,19 +306,19 @@ struct functor_getExpecDensMatrDiagMatrTerm { // value of a FullStateDiagMatr upon a density matrix qindex numAmpsPerCol, firstDiagInd; - cu_qcomp *amps, *elems, expo; + gpu_qcomp *amps, *elems, expo; - functor_getExpecDensMatrDiagMatrTerm(qindex dim, qindex diagInd, cu_qcomp* _amps, cu_qcomp* _elems, cu_qcomp _expo) : + functor_getExpecDensMatrDiagMatrTerm(qindex dim, qindex diagInd, gpu_qcomp* _amps, gpu_qcomp* _elems, gpu_qcomp _expo) : numAmpsPerCol(dim), firstDiagInd(diagInd), amps(_amps), elems(_elems), expo(_expo) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { - cu_qcomp elem = elems[n]; + gpu_qcomp elem = elems[n]; if constexpr (HasPower && ! 
UseRealPow) - elem = getCompPower(elem, expo); + elem = pow(elem, expo); if constexpr (HasPower && UseRealPow) - elem = getCuQcomp(pow(getCompReal(elem), getCompReal(expo)),0); // CUDA pow(qreal,qreal) + elem = getGpuQcomp(pow(real(elem), real(expo)), 0); // CUDA pow(qreal,qreal) qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); @@ -339,13 +335,13 @@ struct functor_setAmpToPauliStrSumElem { qindex suffixLen; qindex numTerms; - cu_qcomp* amps; - cu_qcomp* coeffs; + gpu_qcomp* amps; + gpu_qcomp* coeffs; PauliStr* strings; functor_setAmpToPauliStrSumElem( int rank, qindex dim, qindex suffixLen, qindex numTerms, - cu_qcomp* amps, cu_qcomp* coeffs, PauliStr* strings + gpu_qcomp* amps, gpu_qcomp* coeffs, PauliStr* strings ) : rank(rank), dim(dim), suffixLen(suffixLen), numTerms(numTerms), amps(amps), coeffs(coeffs), strings(strings) @@ -381,7 +377,7 @@ struct functor_mixAmps { qreal outProb, inProb; functor_mixAmps(qreal out, qreal in) : outProb(out), inProb(in) {} - __host__ __device__ cu_qcomp operator()(cu_qcomp outAmp, cu_qcomp inAmp) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp outAmp, gpu_qcomp inAmp) { return (outProb * outAmp) + (inProb * inAmp); } @@ -397,18 +393,18 @@ struct functor_multiplyElemPowerWithAmpOrNorm { // a statevector amp (used when modifying the state) // or its norm (used when calculating expected values) - cu_qcomp exponent; - functor_multiplyElemPowerWithAmpOrNorm(cu_qcomp power) : exponent(power) {} + gpu_qcomp exponent; + functor_multiplyElemPowerWithAmpOrNorm(gpu_qcomp power) : exponent(power) {} - __host__ __device__ cu_qcomp operator()(cu_qcomp quregAmp, cu_qcomp matrElem) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp quregAmp, gpu_qcomp matrElem) { if constexpr (HasPower && ! 
UseRealPow) - matrElem = getCompPower(matrElem, exponent); + matrElem = pow(matrElem, exponent); if constexpr (HasPower && UseRealPow) - matrElem = getCuQcomp(pow(getCompReal(matrElem), getCompReal(exponent)),0); // CUDA pow(qreal,qreal) + matrElem = getGpuQcomp(pow(real(matrElem), real(exponent)), 0); // CUDA pow(qreal,qreal) if constexpr (Norm) - quregAmp = getCuQcomp(getCompNorm(quregAmp), 0); + quregAmp = getGpuQcomp(norm(quregAmp), 0); return matrElem * quregAmp; } @@ -466,15 +462,15 @@ template struct functor_getFidelityTerm { int rank, numQubits; qindex logNumAmpsPerNode, numAmpsPerCol; - cu_qcomp *rho, *psi; + gpu_qcomp *rho, *psi; functor_getFidelityTerm( - int _rank, int _numQubits, qindex _logNumAmpsPerNode, qindex _numAmpsPerCol, cu_qcomp* _rho, cu_qcomp* _psi + int _rank, int _numQubits, qindex _logNumAmpsPerNode, qindex _numAmpsPerCol, gpu_qcomp* _rho, gpu_qcomp* _psi ) : rank(_rank), numQubits(_numQubits), logNumAmpsPerNode(_logNumAmpsPerNode), numAmpsPerCol(_numAmpsPerCol), rho(_rho), psi(_psi) {} - __host__ __device__ cu_qcomp operator()(qindex n) { + __host__ __device__ gpu_qcomp operator()(qindex n) { // i = global index of nth local amp of rho qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -484,18 +480,18 @@ struct functor_getFidelityTerm { qindex c = getBitsLeftOfIndex(i, numQubits-1); // collect amps involved in this term - cu_qcomp rhoAmp = rho[n]; - cu_qcomp rowAmp = psi[r]; - cu_qcomp colAmp = psi[c]; + gpu_qcomp rhoAmp = rho[n]; + gpu_qcomp rowAmp = psi[r]; + gpu_qcomp colAmp = psi[c]; // compute term of or if constexpr (Conj) { - rhoAmp = getCompConj(rhoAmp); - colAmp = getCompConj(colAmp); + rhoAmp = conj(rhoAmp); + colAmp = conj(colAmp); } else - rowAmp = getCompConj(rowAmp); + rowAmp = conj(rowAmp); - cu_qcomp fid = rhoAmp * rowAmp * colAmp; + gpu_qcomp fid = rhoAmp * rowAmp * colAmp; return fid; } }; @@ -525,7 +521,7 @@ struct functor_projectStateVec { assert_numTargsMatchesTemplateParam(numTargets, NumTargets); } - 
__host__ __device__ cu_qcomp operator()(qindex n, cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); @@ -562,7 +558,7 @@ struct functor_projectDensMatr { assert_numTargsMatchesTemplateParam(numTargets, NumTargets); } - __host__ __device__ cu_qcomp operator()(qindex n, cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); @@ -603,7 +599,7 @@ struct functor_setRandomStateVecAmp { unsigned baseSeed; functor_setRandomStateVecAmp(unsigned seed) : baseSeed(seed) {} - __host__ __device__ cu_qcomp operator()(qindex ampInd) { + __host__ __device__ gpu_qcomp operator()(qindex ampInd) { // wastefully create new distributions for every amp thrust::random::normal_distribution normDist(0, 1); // mean=0, var=1 @@ -634,8 +630,8 @@ struct functor_setRandomStateVecAmp { auto iphase = thrust::complex(0, phase); auto amp = sqrt(prob) * thrust::exp(iphase); // CUDA sqrt - // cast thrust::complex to cu_qcomp - return getCuQcomp(amp.real(), amp.imag()); + // cast thrust::complex to gpu_qcomp + return getGpuQcomp(amp.real(), amp.imag()); } }; @@ -654,7 +650,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli thrust::device_vector devStrings(in.strings, in.strings + in.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - cu_qcomp* devCoeffsPtr = toCuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = getGpuQcompPtr(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); int rank = out.isDistributed? 
comm_getRank() : 0; @@ -663,7 +659,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli // indicates the PauliStrSum is diagonal (contains only I or Z) auto functor = functor_setAmpToPauliStrSumElem( rank, out.numElems, logNumElemsPerNode, - in.numTerms, toCuQcomps(out.gpuElems), devCoeffsPtr, devStringsPtr); + in.numTerms, getGpuQcompPtr(out.gpuElems), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + out.numElemsPerNode; @@ -677,7 +673,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli */ -void thrust_setElemsToConjugate(cu_qcomp* matrElemsPtr, qindex matrElemsLen) { +void thrust_setElemsToConjugate(gpu_qcomp* matrElemsPtr, qindex matrElemsLen) { auto ptr = getStartPtr(matrElemsPtr); thrust::transform(ptr, ptr + matrElemsLen, ptr, functor_getAmpConj()); @@ -700,13 +696,13 @@ void thrust_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) { thrust::device_vector devStrings(sum.strings, sum.strings + sum.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - cu_qcomp* devCoeffsPtr = toCuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = getGpuQcompPtr(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); // indicates the PauliStrSum is not diagonal (contains X or Y) auto functor = functor_setAmpToPauliStrSumElem( qureg.rank, powerOf2(qureg.numQubits), qureg.logNumAmpsPerNode, - sum.numTerms, toCuQcomps(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); + sum.numTerms, getGpuQcompPtr(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; @@ -724,7 +720,7 @@ void thrust_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, template -void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr 
matr, cu_qcomp exponent) { +void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp exponent) { thrust::transform( getStartPtr(qureg), getEndPtr(qureg), @@ -832,13 +828,13 @@ qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector q */ -cu_qcomp thrust_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { +gpu_qcomp thrust_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); - cu_qcomp prod = thrust::inner_product( + gpu_qcomp prod = thrust::inner_product( getStartPtr(quregA), getEndPtr(quregA), getStartPtr(quregB), - init, thrust::plus(), functor_getAmpConjProd()); + init, thrust::plus(), functor_getAmpConjProd()); return prod; } @@ -857,20 +853,20 @@ qreal thrust_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB) template -cu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { +gpu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { - // functor accepts an index and produces a cu_qcomp + // functor accepts an index and produces a gpu_qcomp auto functor = functor_getFidelityTerm( rho.rank, rho.numQubits, rho.logNumAmpsPerNode, - psi.numAmps, toCuQcomps(rho.gpuAmps), toCuQcomps(psi.gpuAmps)); + psi.numAmps, getGpuQcompPtr(rho.gpuAmps), getGpuQcompPtr(psi.gpuAmps)); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); qindex numIts = rho.numAmpsPerNode; - cu_qcomp init = getCuQcomp(0, 0); - cu_qcomp fid = thrust::transform_reduce( + gpu_qcomp init = getGpuQcomp(0, 0); + gpu_qcomp fid = thrust::transform_reduce( indIter, indIter + numIts, - functor, init, thrust::plus()); + functor, init, thrust::plus()); return fid; } @@ -897,71 +893,71 @@ qreal thrust_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { } -cu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { +gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector 
targs) { qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex mask = util_getBitMask(targs); - auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, toCuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, getGpuQcompPtr(qureg.gpuAmps)); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); } -cu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector y, vector z) { qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - auto ampsPtr = toCuQcomps(qureg.gpuAmps); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, ampsPtr); // amps=pairAmps - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } -cu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vector y, vector z) { qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - 
auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto buffPtr = toCuQcomps(qureg.gpuCommBuffer); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto buffPtr = getGpuQcompPtr(qureg.gpuCommBuffer); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, buffPtr); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } -cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector y, vector z) { qindex mXY = util_getBitMask(util_getConcatenated(x, y)); qindex mYZ = util_getBitMask(util_getConcatenated(y, z)); qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, toCuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, getGpuQcompPtr(qureg.gpuAmps)); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } @@ -972,33 +968,33 @@ cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vecto template -cu_qcomp 
thrust_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp expo) { +gpu_qcomp thrust_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp expo) { - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto functor = functor_multiplyElemPowerWithAmpOrNorm(expo); - cu_qcomp value = thrust::inner_product( + gpu_qcomp value = thrust::inner_product( getStartPtr(qureg), getEndPtr(qureg), getStartPtr(matr), - init, thrust::plus(), functor); + init, thrust::plus(), functor); return value; } template -cu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp expo) { +gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp expo) { qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto elemsPtr = toCuQcomps(matr.gpuElems); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto elemsPtr = getGpuQcompPtr(matr.gpuElems); auto functor = functor_getExpecDensMatrDiagMatrTerm(dim, ind, ampsPtr, elemsPtr, expo); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); } @@ -1047,7 +1043,7 @@ void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, ve */ -void thrust_statevec_initUniformState(Qureg qureg, cu_qcomp amp) { +void thrust_statevec_initUniformState(Qureg qureg, gpu_qcomp amp) { thrust::fill(getStartPtr(qureg), getEndPtr(qureg), amp); } @@ -1057,11 +1053,11 @@ void thrust_statevec_initDebugState_sub(Qureg qureg) { // globally, |n> gains coefficient 2n/10 + i(2n+1)/10, // which is a step-size 
of 2/10 + i(2/10)... - cu_qcomp step = getCuQcomp(2/10., 2/10.); + gpu_qcomp step = getGpuQcomp(2/10., 2/10.); // and each node begins from a unique n (if distributed) qindex n = util_getGlobalIndexOfFirstLocalAmp(qureg); - cu_qcomp init = getCuQcomp(2*n/10., (2*n+1)/10.); + gpu_qcomp init = getGpuQcomp(2*n/10., (2*n+1)/10.); thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step); } diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh deleted file mode 100644 index a934ecef..00000000 --- a/quest/src/gpu/gpu_types.cuh +++ /dev/null @@ -1,274 +0,0 @@ -/** @file - * CUDA and HIP-compatible complex types. This file is only ever included - * when COMPILE_CUDA=1 so it can safely invoke CUDA signatures without guards. - * - * This header is safe to re-include by multiple files because typedef - * redefinition is legal in C++, and all functions herein are inline. - * Furthermore, since it is only ever parsed by nvcc, the __host__ symbols - * are safely processed by other nvcc-only GPU files, like the cuquantum backend. - * - * @author Tyson Jones - * @author Oliver Brown (patched HIP arithmetic overloads) - */ - -#ifndef GPU_TYPES_HPP -#define GPU_TYPES_HPP - -#include "quest/include/config.h" -#include "quest/include/types.h" -#include "quest/include/precision.h" - -#include "quest/src/core/inliner.hpp" - -#if ! COMPILE_CUDA - #error "A file being compiled somehow included gpu_types.hpp despite QuEST not being compiled in GPU-accelerated mode." 
-#endif - -#if defined(__NVCC__) - #include -#elif defined(__HIP__) - #include "quest/src/gpu/cuda_to_hip.hpp" -#endif - -#include -#include - - - -/* - * CUDA-COMPATIBLE QCOMP ALIAS (cu_qcomp) - * - * which we opt to use over a Thrust complex type to gaurantee - * compatibility with cuQuantum, though this irritatingly - * requires explicitly defining operator overloads below - */ - - -#if (FLOAT_PRECISION == 1) - typedef cuFloatComplex cu_qcomp; - -#elif (FLOAT_PRECISION == 2) - typedef cuDoubleComplex cu_qcomp; - -#else - #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." - -#endif - - - -/* - * TRANSFORMING qcomp AND cu_qcomp - */ - - -INLINE cu_qcomp getCuQcomp(qreal re, qreal im) { - -#if (FLOAT_PRECISION == 1) - return make_cuFloatComplex(re, im); -#else - return make_cuDoubleComplex(re, im); -#endif -} - - -__host__ inline cu_qcomp toCuQcomp(qcomp a) { - return getCuQcomp(std::real(a), std::imag(a)); -} -__host__ inline qcomp toQcomp(cu_qcomp a) { - return getQcomp(a.x, a.y); -} - - -__host__ inline cu_qcomp* toCuQcomps(qcomp* a) { - - // reinterpret a qcomp ptr as a cu_qcomp ptr, - // which is ONLY SAFE when comp and cu_qcomp - // have identical memory layouts. Be very - // careful; HIP stack arrays (e.g. qcomp[]) - // seg-fault when passed here, so this funciton - // should only ever be used on malloc'd data! - // Stack objects should use the below unpacks. 
- - return reinterpret_cast(a); -} - - -__host__ inline std::array unpackMatrixToCuQcomps(DiagMatr1 in) { - - // it's crucial we explicitly copy over the elements, - // rather than just reinterpret the pointer, to avoid - // segmentation faults when memory misaligns (like on HIP) - - return {toCuQcomp(in.elems[0]), toCuQcomp(in.elems[1])}; -} - - -__host__ inline std::array unpackMatrixToCuQcomps(DiagMatr2 in) { - - return { - toCuQcomp(in.elems[0]), toCuQcomp(in.elems[1]), - toCuQcomp(in.elems[2]), toCuQcomp(in.elems[3])}; -} - - -__host__ inline std::array unpackMatrixToCuQcomps(CompMatr1 in) { - - std::array out{}; - for (int i=0; i<4; i++) - out[i] = toCuQcomp(in.elems[i/2][i%2]); - - return out; -} - - -__host__ inline std::array unpackMatrixToCuQcomps(CompMatr2 in) { - - std::array out{}; - for (int i=0; i<16; i++) - out[i] = toCuQcomp(in.elems[i/4][i%4]); - - return out; -} - - - - -/* - * cu_qcomp ARITHMETIC OVERLOADS - * - * which are only needed by NVCC because - * HIP defines them for us. This good deed - * goes punished; a HIP bug disables our - * use of *= and += overloads, so kernels.cuh - * has disgusting (x = x * y) statements. Bah! - */ - - -/// @todo -/// - clean this up (with templates?) 
-/// - use getCuQcomp() rather than struct creation, -/// to make the algebra implementation-agnostic - - -#if defined(__NVCC__) - -INLINE cu_qcomp operator + (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { - .x = a.x + b.x, - .y = a.y + b.y - }; - return out; -} - -INLINE cu_qcomp operator - (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { - .x = a.x - b.x, - .y = a.y - b.y - }; - return out; -} - -INLINE cu_qcomp operator * (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { - .x = a.x * b.x - a.y * b.y, - .y = a.x * b.y + a.y * b.x - }; - return out; -} - - -INLINE cu_qcomp operator + (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { - .x = a.x + b, - .y = a.y + b - }; - return out; -} -INLINE cu_qcomp operator + (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { - .x = a.x + b, - .y = a.y + b - }; - return out; -} - -INLINE cu_qcomp operator - (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { - .x = a.x - b, - .y = a.y - b - }; - return out; -} -INLINE cu_qcomp operator - (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { - .x = a.x - b, - .y = a.y - b - }; - return out; -} - -INLINE cu_qcomp operator * (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { - .x = a.x * b, - .y = a.y * b - }; - return out; -} -INLINE cu_qcomp operator * (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { - .x = a.x * b, - .y = a.y * b - }; - return out; -} - -#endif - - - -/* - * cu_qcomp UNARY FUNCTIONS - */ - - -INLINE qreal getCompReal(cu_qcomp num) { - return num.x; -} - -INLINE cu_qcomp getCompConj(cu_qcomp num) { - num.y *= -1; - return num; -} - -INLINE qreal getCompNorm(cu_qcomp num) { - return (num.x * num.x) + (num.y * num.y); -} - -INLINE cu_qcomp getCompPower(cu_qcomp base, cu_qcomp exponent) { - - // using https://mathworld.wolfram.com/ComplexExponentiation.html, - // and the principal argument of 'base' - - // base = a + b i, exponent = c + d i - qreal a = base.x; - qreal b = base.y; - qreal c = 
exponent.x; - qreal d = exponent.y; - - // intermediate quantities (uses CUDA atan2,log,pow,exp,cos,sin) - qreal arg = atan2(b, a); - qreal mag = a*a + b*b; - qreal ln = log(mag); - qreal fac = pow(mag, c/2) * exp(-d * arg); - qreal ang = c*arg + d*ln/2; - - // output scalar - qreal re = fac * cos(ang); - qreal im = fac * sin(ang); - return getCuQcomp(re, im); -} - - - -#endif // GPU_TYPES_HPP \ No newline at end of file diff --git a/utils/scripts/compile.sh b/utils/scripts/compile.sh index 8eca0ae2..609f85be 100755 --- a/utils/scripts/compile.sh +++ b/utils/scripts/compile.sh @@ -312,8 +312,6 @@ echo "deployment modes:" ALL_LINK_FLAGS="${USER_LINK_FLAGS}" # choose compiler and flags for CPU/OMP files -CPU_FILES_FLAGS='-Ofast -DCOMPLEX_OVERLOADS_PATCHED=1' - if [ $ENABLE_MULTITHREADING == 1 ] then echo "${INDENT}(multithreading enabled)"