diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index 0950b7db..fd74c294 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -28,6 +28,7 @@ on: branches: - main - devel + - unifying-base-qcomp pull_request: branches: - main @@ -253,6 +254,11 @@ jobs: -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }} + -DCMAKE_CXX_FLAGS_DEBUG=${{ matrix.compiler == 'cl' && '/MP1' || '' }} + -DCMAKE_CXX_FLAGS_RELEASE=${{ matrix.compiler == 'cl' && '/MP1' || '' }} + + ### DEBUG: + ### above disables parallel compilation with MSVC # force 'Release' build (needed by MSVC to enable optimisations) - name: Compile diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d308795..dc209ca5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -469,71 +469,6 @@ set(COMPILE_HIP ${ENABLE_HIP}) -# ============================ -# Patch CPU performance -# ============================ - - -# Patch performance of CPU std::complex arithmetic operator overloads. -# The cpu_subroutines.cpp file makes extensive use of std::complex operator -# overloads, and alas these are significantly slower than hand-rolled -# arithmetic, due to their NaN and inf checks, and interference with SIMD. -# It is crucial to pass additional optimisation flags to this file to restore -# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory, -# we can achieve this with specific, relatively 'safe' flags such as LLVM's: -# -ffinite-math-only -fno-signed-zeros -ffp-contract=fast -# However, it is a nuisance to find equivalent flags for different compilers -# and monitor their performance vs accuracy trade-offs. So instead, we use the -# much more aggressive and ubiquitous -Ofast flag to guarantee performance. 
-# This introduces many potentially dangerous optimisations, such as asserting -# associativity of flops, which would break techniques like Kahan summation. -# The cpu_subroutines.cpp must ergo be very conscious of these optimisations. -# We here also explicitly inform the file cpu_subroutines.cpp whether or not -# we are passing the flags, so it can detect/error when flags are forgotten. - -if (CMAKE_BUILD_TYPE STREQUAL "Release") - - # Release build will pass -Ofast when known for the given compiler, and - # fallback to giving a performance warning and proceeding with compilation - - if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|HP|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang") - set(patch_flags "-Ofast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - elseif (CMAKE_CXX_COMPILER_ID MATCHES "HP") - set(patch_flags "+Ofast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - set(patch_flags "/fp:fast") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1") - else() - message(WARNING - "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been " - "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, " - "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are " - "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help." - ) - set(patch_flags "") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") - endif() - -else() - - # Non-release builds (e.g. 
Debug) will pass no optimisation flags, and will - # communicate to cpu_subroutines.cpp that this is intentional via a macro - - set(patch_flags "") - set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0") - -endif() - -set_source_files_properties( - quest/src/cpu/cpu_subroutines.cpp - PROPERTIES - COMPILE_FLAGS "${patch_flags} ${patch_macro}" -) - - - # ============================ # Pass files to library # ============================ diff --git a/quest/include/types.h b/quest/include/types.h index 066d35e9..f1f49315 100644 --- a/quest/include/types.h +++ b/quest/include/types.h @@ -158,9 +158,10 @@ static inline qcomp getQcomp(qreal re, qreal im) { // not the same precision as qcomp, so compilation will fail depending // on the setting of PRECISION. To avoid this, we'll define overloads // between all type/precision permutations, always returning qcomp. These - // overloads are also used by the QuEST source code. Via the unholy macros - // below, we create 312 overloads; no doubt this is going to break something - // in the future, for which I am already sorry :'( + // overloads are also used by the QuEST source code (though incidentally, + // not the high performance backend which uses custom complex overloads). + // Via the unholy macros below, we create 312 overloads; no doubt this is + // going to break something in the future, for which I am already sorry :'( /// @cond EXCLUDE_FROM_DOXYGEN diff --git a/quest/src/core/base_qcomp.hpp b/quest/src/core/base_qcomp.hpp new file mode 100644 index 00000000..522983c1 --- /dev/null +++ b/quest/src/core/base_qcomp.hpp @@ -0,0 +1,227 @@ +/** @file + * Definition of base_qcomp, which is extended by the CPU and GPU + * backends (into cpu_qcomp and gpu_qcomp) and used in hot loops + * and kernels. + * + * The user-facing qcomp (which in the QuEST middle-end, resolves to + * std::complex) is not used by the CPU backend, since it creates + * performance pitfalls (e.g. 
expensive NaN checks within arithmetic + * operators) in some compilers, and is furthermore illegal in the + * GPU backend (i.e. within CUDA kernels). So the backends instead + * use custom complex types with identical memory layouts/alignment + * to qcomp. Those types extend base_qcomp defined in this file, + * since they otherwise share all the same arithmetic boilerplate. + * + * @author Tyson Jones + */ + +#ifndef BASE_QCOMP_HPP +#define BASE_QCOMP_HPP + +#include "quest/include/types.h" + +#include "quest/src/core/inliner.hpp" + + + +/* + * BASE DEFINITION + * + * which must remain POD (a simple {re,im}) and with an identical + * memory layout and alignment to qcomp (i.e. std::complex). Only + * the in-place arithmetic overloads are defined below which are + * reused by the subsequent out-of-place overloads, to avoid + * code duplication. + */ + +struct base_qcomp { + + qreal re; + qreal im; + + + /* + * IN-PLACE COMPLEX ARITHMETIC + */ + + INLINE base_qcomp& operator += (const base_qcomp& a) noexcept { + re += a.re; + im += a.im; + return *this; + } + + INLINE base_qcomp& operator -= (const base_qcomp& a) noexcept { + re -= a.re; + im -= a.im; + return *this; + } + + INLINE base_qcomp& operator *= (const base_qcomp& a) noexcept { + qreal re_ = re; + qreal im_ = im; + re = (re_ * a.re) - (im_ * a.im); + im = (re_ * a.im) + (im_ * a.re); + return *this; + } + + + /* + * IN-PLACE MIXED-TYPE ARITHMETIC + */ + + INLINE base_qcomp& operator *= (const int& a) noexcept { + re *= a; + im *= a; + return *this; + } + + INLINE base_qcomp& operator *= (const qreal& a) noexcept { + re *= a; + im *= a; + return *this; + } + + INLINE base_qcomp& operator *= (const size_t& a) noexcept { + re *= a; + im *= a; + return *this; + } + +}; // base_qcomp + + + +/* + * OUT-OF-PLACE COMPLEX ARITHMETIC + * + * which avoid code duplication by re-using the + * in-place arithmetic operator overloads above + */ + +INLINE base_qcomp operator + (base_qcomp a, const base_qcomp& b) noexcept { 
+ a += b; + return a; +} + +INLINE base_qcomp operator - (base_qcomp a, const base_qcomp& b) noexcept { + a -= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const base_qcomp& b) noexcept { + a *= b; + return a; +} + + + +/* + * OUT-OF-PLACE MIXED-TYPE ARITHMETIC + * + * which avoid code duplication by re-using the + * in-place arithmetic operator overloads above + */ + + +// base_qcomp * other + +INLINE base_qcomp operator * (base_qcomp a, const int& b) noexcept { + a *= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const qreal& b) noexcept { + a *= b; + return a; +} + +INLINE base_qcomp operator * (base_qcomp a, const size_t& b) noexcept { + a *= b; + return a; +} + + +// other * base_qcomp (via commutation) + +INLINE base_qcomp operator * (const int& a, const base_qcomp& b) noexcept { + return b * a; +} + +INLINE base_qcomp operator * (const qreal& a, const base_qcomp& b) noexcept { + return b * a; +} + +INLINE base_qcomp operator * (const size_t& a, const base_qcomp& b) noexcept { + return b * a; +} + + + +/* + * BACKEND-AGNOSTIC MATHS + */ + +INLINE qreal real(const base_qcomp& a) { + return a.re; +} + +INLINE qreal imag(const base_qcomp& a) { + return a.im; +} + +INLINE base_qcomp conj(const base_qcomp& a) { + return {a.re, - a.im}; +} + +INLINE qreal norm(const base_qcomp& a) noexcept { + return (a.re * a.re) + (a.im * a.im); +} + + + +/* + * CONVERTERS + */ + +INLINE base_qcomp* getBaseQcompPtr(qcomp* list) { + return reinterpret_cast<base_qcomp*>(list); +} + +INLINE base_qcomp getBaseQcomp(qreal re, qreal im) { + return { re, im }; +} + +INLINE base_qcomp getBaseQcomp(const qcomp& a) { + return { a.real(), a.imag() }; +} + +INLINE qcomp getQcomp(const base_qcomp& a) { + return qcomp( a.re, a.im ); +} + + + +/* + * CHECK COMPATIBILITY WITH QCOMP + */ + + +// check the memory layout of base_qcomp agrees with qcomp, since +// it is not formally guaranteed, unlike _Complex and std::complex +static_assert(sizeof (base_qcomp) == sizeof 
(qcomp)); +static_assert(alignof(base_qcomp) == alignof(qcomp)); +static_assert(std::is_standard_layout_v<base_qcomp>); +static_assert(std::is_trivially_copyable_v<base_qcomp>); + + +// TODO: +// the above checks are potentially inadequate to identify an +// insidious incompatibility between qcomp and base_qcomp - perhaps +// we should perform a compile-time duck-check, casting a small +// array between them and checking no data is corrupted? Perhaps +// a runtime check in initQuESTEnv() is also necessary, checking the +// casting is safe for all circumstances (e.g. heap mem, static lists) + + + +#endif // BASE_QCOMP_HPP \ No newline at end of file diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 6367e116..79884659 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -1,7 +1,12 @@ /** @file * Oerations used by all deployment modes for fast, * low-level maths, inlined and callable within hot - * loops (i.e OpenMP loops and CUDA kernels) + * loops (i.e OpenMP loops and CUDA kernels). + * + * Note this file uses the backend OpenMP/CUDA-agnostic + * base_qcomp, in lieu of the C++-user-facing qcomp (i.e. + * std::complex), to avoid its compiler-specific performance + * pitfalls. * * @author Tyson Jones */ @@ -15,27 +20,7 @@ #include "quest/src/core/inliner.hpp" #include "quest/src/core/bitwise.hpp" - - - -/* - * TYPE ALIASING - */ - - -// 'qcomp' cannot be used inside CUDA kernels/thrust, so must not appear in -// these inlined definitions. Instead, we create an alias which will resolve -// to 'qcomp' (defined in types.h) when parsed by the CPU backend, and 'cu_qcomp' -// (defined in gpu_types.cuh which is not explicitly resolved in this header) -// when parsed by the GPU backend, which will prior define USE_CU_QCOMP. It is -// essential this header is included after gpu_types.cuh is included by the -// GPU backend. Hacky, but avoids code duplication! 
- -#ifdef USE_CU_QCOMP - #define QCOMP_ALIAS cu_qcomp -#else - #define QCOMP_ALIAS qcomp -#endif +#include "quest/src/core/base_qcomp.hpp" @@ -121,7 +106,7 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ */ -INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { +INLINE base_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // this function is called by both fullstatediagmatr_setElemsToPauliStrSum() // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have @@ -133,13 +118,13 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // though opens the risk that the former caller erroneously has its upper // Paulis ignore. We forego this optimisation in defensive design, and // because this function is only invoked during data structure initilisation - // and ergo infrequently.s + // and ergo infrequently. // regrettably duplicated from paulis.cpp which is inaccessible here constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2; - // QCOMP_ALIAS-agnostic literals - QCOMP_ALIAS p0, p1,n1, pI,nI; + // T-agnostic complex literals + base_qcomp p0, p1,n1, pI,nI; p0 = {0, 0}; // 0 p1 = {+1, 0}; // 1 n1 = {-1, 0}; // -1 @@ -152,20 +137,20 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { // but this poses no real slowdown; this function, and its caller, are inlined // so these 16 amps are re-processed one for each full enumeration of the // PauliStrSum which is expected to have significantly more terms/coeffs - QCOMP_ALIAS matrices[][2][2] = { + base_qcomp matrices[][2][2] = { {{p1,p0},{p0,p1}}, // I {{p0,p1},{p1,p0}}, // X {{p0,nI},{pI,p0}}, // Y {{p1,p0},{p0,n1}}}; // Z - QCOMP_ALIAS elem = p1; // 1 + base_qcomp elem = p1; // 1 // could be compile-time unrolled into 32 iterations for (int t=0; t + + + +/* + * DEFINE CPU_QCOMP + * + * which is safe to typdef and define additional overloads + * below, since never 
witnessed outside the CPU Backend + */ + +typedef base_qcomp cpu_qcomp; + + + +/* + * CONVERTERS + * + * which merely wrap the base_qcomp functions for clarity + * in the CPU source code, disambiguating from gpu_qcomp + */ + +INLINE cpu_qcomp* getCpuQcompPtr(qcomp* ptr) { + return getBaseQcompPtr(ptr); +} + +INLINE cpu_qcomp getCpuQcomp(qreal re, qreal im) { + return getBaseQcomp(re, im); +} + +INLINE cpu_qcomp getCpuQcomp(const qcomp& a) { + return getBaseQcomp(a); +} + +template +std::array,Dim> getCpuQcompsMatrix(qcomp matr[Dim][Dim]) { + + // Creator for fixed-size dense matrices CompMatr1 and CompMatr2, + // which are respectively 2x2 and 4x4 - deliberately not inlined! + // We create new cpu_qcomp in lieu of reinterpreting a 2D pointer + // in fear of alignment and static array nightmares + static_assert(Dim == 2 || Dim == 4); + + std::array,Dim> out; + + for (int i=0; i qubitInds, vecto // amplitudes are packed at an offset into the buffer qindex offset = getSubBufferSendInd(qureg); + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedQubitInds = util_getSorted(qubitInds); auto qubitStateMask = util_getBitMask(qubitInds, qubitStates); @@ -240,7 +241,7 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubitInds, vecto qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask); // pack the potentially-strided amplitudes into a contiguous sub-buffer - qureg.cpuCommBuffer[offset + n] = qureg.cpuAmps[i]; + buffer[offset + n] = amps[i]; } // return the number of packed amps @@ -258,6 +259,10 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu // amplitudes are packed at an offset into the buffer qindex offset = getSubBufferSendInd(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n ctrls, vector c // each control qubit halves the number of iterations, each of which modifies 2 amplitudes, and skips 2 qindex numIts = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); + // use cpu_qcomp (in lieu of qcomp) even though no arithmetic happens below - just for consistency! 
+ cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(ctrls, {targ2, targ1}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); @@ -305,7 +313,7 @@ void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex i01 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask); qindex i10 = flipTwoBits(i01, targ2, targ1); - std::swap(qureg.cpuAmps[i01], qureg.cpuAmps[i10]); + std::swap(amps[i01], amps[i10]); } } @@ -321,6 +329,10 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -337,7 +349,7 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex j = n + offset; // unpack the continuous sub-buffer among the strided local amplitudes - qureg.cpuAmps[i] = qureg.cpuCommBuffer[j]; + amps[i] = buffer[j]; } } @@ -353,6 +365,10 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); @@ -370,7 +386,7 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex j = n + offset; // unpack the continuous sub-buffer among the strided local amplitudes - qureg.cpuAmps[i] = qureg.cpuCommBuffer[j]; + amps[i] = buffer[j]; } } @@ 
-394,6 +410,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v // each control qubit halves the needed iterations, and each iteration modifies two amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto elems = getCpuQcompsMatrix<2>(matr.elems); // MSVC requires explicit template param, bah! + auto sortedQubits = util_getSorted(ctrls, {targ}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -409,11 +429,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v qindex i1 = flipBit(i0, targ); // note the two amplitudes are likely strided and not adjacent (separated by 2^t) - qcomp amp0 = qureg.cpuAmps[i0]; - qcomp amp1 = qureg.cpuAmps[i1]; + cpu_qcomp amp0 = amps[i0]; + cpu_qcomp amp1 = amps[i1]; - qureg.cpuAmps[i0] = matr.elems[0][0]*amp0 + matr.elems[0][1]*amp1; - qureg.cpuAmps[i1] = matr.elems[1][0]*amp0 + matr.elems[1][1]*amp1; + amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1; + amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1; } } @@ -429,6 +449,12 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v // received amplitudes may begin at an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + cpu_qcomp f0 = getCpuQcomp(fac0); + cpu_qcomp f1 = getCpuQcomp(fac1); + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -444,7 +470,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v // j = index of nth received amplitude from pair rank in buffer qindex j = n + offset; - qureg.cpuAmps[i] = fac0*qureg.cpuAmps[i] + fac1*qureg.cpuCommBuffer[j]; + amps[i] = f0*amps[i] + 
f1*buffer[j]; } } @@ -467,6 +493,10 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // each control qubit halves the needed iterations, and each iteration modifies four amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto elems = getCpuQcompsMatrix<4>(matr.elems); // MSVC requires explicit template param, bah! + auto sortedQubits = util_getSorted(ctrls, {targ1, targ2}); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0}); @@ -484,16 +514,16 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex i11 = flipBit(i01, targ2); // note amplitudes are not necessarily adjacent, nor uniformly spaced - qcomp amp00 = qureg.cpuAmps[i00]; - qcomp amp01 = qureg.cpuAmps[i01]; - qcomp amp10 = qureg.cpuAmps[i10]; - qcomp amp11 = qureg.cpuAmps[i11]; - - // amps[i_n] = sum_j elems[n][j] amp[i_n] - qureg.cpuAmps[i00] = matr.elems[0][0]*amp00 + matr.elems[0][1]*amp01 + matr.elems[0][2]*amp10 + matr.elems[0][3]*amp11; - qureg.cpuAmps[i01] = matr.elems[1][0]*amp00 + matr.elems[1][1]*amp01 + matr.elems[1][2]*amp10 + matr.elems[1][3]*amp11; - qureg.cpuAmps[i10] = matr.elems[2][0]*amp00 + matr.elems[2][1]*amp01 + matr.elems[2][2]*amp10 + matr.elems[2][3]*amp11; - qureg.cpuAmps[i11] = matr.elems[3][0]*amp00 + matr.elems[3][1]*amp01 + matr.elems[3][2]*amp10 + matr.elems[3][3]*amp11; + cpu_qcomp amp00 = amps[i00]; + cpu_qcomp amp01 = amps[i01]; + cpu_qcomp amp10 = amps[i10]; + cpu_qcomp amp11 = amps[i11]; + + // amps[i_n] = sum_j matr.elems[n][j] amp[i_n] + amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11; + amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11; + amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11; + amps[i11] = elems[3][0]*amp00 + 
elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11; } } @@ -534,6 +564,10 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // each control qubit halves iterations, each of which modifies 2^(targs.size()) amplitudes qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElemsFlat); + // prepare a mask which yields ctrls in specified state, and targs in all-zero auto sortedQubits = util_getSorted(ctrls, targs); auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); @@ -550,7 +584,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #pragma omp parallel if(qureg.isMultithreaded) { // create a private cache for every thread (might be compile-time sized, and in heap or stack) - vector cache(numTargAmps); + vector cache(numTargAmps); #pragma omp for for (qindex n=0; n ctrls, ve // i = nth local index where ctrls are active and targs form value j qindex i = setBits(i0, targs.data(), numTargBits, j); // loop may be unrolled - cache[j] = qureg.cpuAmps[i]; + cache[j] = amps[i]; } // modify each amplitude (loop might be unrolled) @@ -571,7 +605,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs.data(), numTargBits, k); // loop may be unrolled - qureg.cpuAmps[i] = 0; + amps[i] = getCpuQcomp(0, 0); // loop may be unrolled for (qindex j=0; j ctrls, ve l = fast_getMatrixFlatIndex(j, k, numTargAmps); else l = fast_getMatrixFlatIndex(k, j, numTargAmps); - - qcomp elem = matr.cpuElemsFlat[l]; + cpu_qcomp elem = elems[l]; // optionally conjugate matrix elems on the fly to avoid pre-modifying heap structure if constexpr (ApplyConj) - elem = std::conj(elem); + elem = conj(elem); - 
qureg.cpuAmps[i] += elem * cache[j]; + amps[i] += elem * cache[j]; /// @todo /// qureg.cpuAmps[i] is being serially updated by only this thread, /// so is a candidate for Kahan summation for improved numerical /// stability. Explore whether this is time-free and worthwhile! - /// - /// BEWARE that Kahan summation is incompatible with the optimisation - /// flags currently passed to this file } } } @@ -622,6 +652,18 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.elems); + + // TODO: + // I am paranoid the above casting from qcomp* to cpu_qcomp* is + // more dangerous for the static matr.elems than the heap pointer. + // Is it more vulnerable to misalignment? Check this! 
+ // + // If this IS the case, we simply prepare a new cpu_qcomp + // instance for each element, like the GPU backend does + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -638,7 +680,7 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode); int b = getBit(i, targ); - qureg.cpuAmps[j] *= matr.elems[b]; + amps[j] *= elems[b]; } } @@ -660,6 +702,18 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.elems); + + // TODO: + // I am paranoid the above casting from qcomp* to cpu_qcomp* is + // more dangerous for the static matr.elems than the heap pointer. + // Is it more vulnerable to misalignment? Check this! 
+ // + // If this IS the case, we simply prepare a new cpu_qcomp + // instance for each element, like the GPU backend does + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -676,7 +730,7 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode); int k = getTwoBits(i, targ2, targ1); - qureg.cpuAmps[j] *= matr.elems[k]; + amps[j] *= elems[k]; } } @@ -700,6 +754,12 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // each control qubit halves the needed iterations, each of which will modify 1 amplitude qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused + auto sortedCtrls = util_getSorted(ctrls); auto ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -718,7 +778,7 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // t = value of targeted bits, which may be in the prefix substate qindex t = getValueOfBits(i, targs.data(), numTargBits); - qcomp elem = matr.cpuElems[t]; + cpu_qcomp elem = elems[t]; // decide whether to power and conj at compile-time, to avoid branching in hot-loop. // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) @@ -726,13 +786,13 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // and negative, and the exponent is an integer. We tolerate this heightened error // because we have no reason to think matr is real (it's not constrained Hermitian). 
if constexpr (HasPower) - elem = std::pow(elem, exponent); + elem = pow(elem, expo); // cautiously conjugate AFTER exponentiation, else we must also conj exponent if constexpr (ApplyConj) - elem = std::conj(elem); + elem = conj(elem); - qureg.cpuAmps[j] *= elem; + amps[j] *= elem; } } @@ -762,10 +822,16 @@ void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // every iteration modifies one amp, using one element qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused + #pragma omp parallel for if(qureg.isMultithreaded||qureg.isMultithreaded) for (qindex n=0; n (matr * rho) or (matr^exponent * rho) if constexpr (ApplyLeft) { // i = global row of nth local amp qindex i = fast_getQuregGlobalRowFromFlatIndex(n, matr.numElems); - qcomp term = matr.cpuElems[i]; + cpu_qcomp term = elems[i]; // compile-time decide if applying power to avoid in-loop branching... // (beware that complex pow() is numerically unstable as detailed below) if constexpr (HasPower) - term = std::pow(term, exponent); + term = pow(term, expo); fac = term; } @@ -826,23 +898,23 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // j = global column corresponding to n qindex j = fast_getQuregGlobalColFromFlatIndex(m, matr.numElems); - qcomp term = matr.cpuElems[j]; + cpu_qcomp term = elems[j]; // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) // (by producing an unexpected non-zero imaginary component) when the base is real // and negative, and the exponent is an integer. We tolerate this heightened error // because we have no reason to think matr is real (it's not constrained Hermitian). 
if constexpr (HasPower) - term = std::pow(term, exponent); + term = pow(term, expo); // conj strictly after pow, to effect conj(matr^exponent) if constexpr (ConjRight) - term = std::conj(term); + term = conj(term); fac *= term; } - qureg.cpuAmps[n] *= fac; + amps[n] *= fac; } } @@ -866,8 +938,8 @@ template void cpu_densmatr_allTargDiagMatr_sub (Qure template INLINE void applyPauliUponAmpPair( - Qureg qureg, qindex v, qindex i0, int* indXY, int numXY, - qindex maskXY, qindex maskYZ, qcomp ampFac, qcomp pairAmpFac + cpu_qcomp* amps, qindex v, qindex i0, int* indXY, int numXY, + qindex maskXY, qindex maskYZ, cpu_qcomp ampFac, cpu_qcomp pairAmpFac ) { // this is a subroutine of cpu_statevector_anyCtrlPauliTensorOrGadget_subA() below // called in a hot-loop (hence it is here inlined) which exists because the caller @@ -885,14 +957,13 @@ INLINE void applyPauliUponAmpPair( int signB = fast_getPlusOrMinusMaskedBitParity(iB, maskYZ); // mix or swap scaled amp pair (where pairAmpFac includes Y's i factor) - qcomp ampA = qureg.cpuAmps[iA]; - qcomp ampB = qureg.cpuAmps[iB]; - qureg.cpuAmps[iA] = (ampFac * ampA) + (pairAmpFac * signB * ampB); - qureg.cpuAmps[iB] = (ampFac * ampB) + (pairAmpFac * signA * ampA); + cpu_qcomp ampA = amps[iA]; + cpu_qcomp ampB = amps[iB]; + amps[iA] = (ampFac * ampA) + (pairAmpFac * signB * ampB); + amps[iB] = (ampFac * ampB) + (pairAmpFac * signA * ampA); } - template void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( Qureg qureg, vector ctrls, vector ctrlStates, @@ -900,6 +971,14 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( ) { assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); assert_numTargsMatchesTemplateParam(x.size() + y.size(), NumTargs); + + // we will scale pairAmp below by i^numY, so that each amp need only choose the +-1 sign + pairAmpFac *= util_getPowerOfI(y.size()); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + 
cpu_qcomp f0 = getCpuQcomp(ampFac); + cpu_qcomp f1 = getCpuQcomp(pairAmpFac); // only X and Y count as targets vector sortedTargsXY = util_getSorted(util_getConcatenated(x, y)); @@ -912,9 +991,6 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( auto maskXY = util_getBitMask(sortedTargsXY); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - // we will scale pairAmp below by i^numY, so that each amp need only choose the +-1 sign - pairAmpFac *= util_getPowerOfI(y.size()); - // use template params to compile-time unroll loops in insertBits() and inner-loop below SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, sortedTargsXY.size()); @@ -950,7 +1026,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( // serial for (qindex v=0; v( - qureg, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, ampFac, pairAmpFac); + amps, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, f0, f1); } } else { @@ -965,7 +1041,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA( #pragma omp parallel for for (qindex v=0; v( - qureg, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, ampFac, pairAmpFac); + amps, v, i0, sortedTargsXY.data(), numTargBits, maskXY, maskYZ, f0, f1); } } } @@ -978,8 +1054,17 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB( ) { assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls); + // we will scale pairAmp by i^numY, so that each amp need only choose the +-1 sign + pairAmpFac *= util_getPowerOfI(y.size()); + // each control qubit halves the needed iterations qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size()); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* buffer = getCpuQcompPtr(qureg.cpuCommBuffer); + cpu_qcomp f0 = getCpuQcomp(ampFac); + cpu_qcomp f1 = getCpuQcomp(pairAmpFac); // received amplitudes may begin at 
an arbitrary offset in the buffer qindex offset = getBufferRecvInd(); @@ -992,9 +1077,6 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB( // use template param to compile-time unroll loop in insertBits() SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); - // we will scale pairAmp by i^numY, so that each amp need only choose the +-1 sign - pairAmpFac *= util_getPowerOfI(y.size()); - #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n coeffs qindex numIts = outQureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + cpu_qcomp* inFacs = getCpuQcompPtr(coeffs.data()); + // use template param to compile-time unroll inner loop below SET_VAR_AT_COMPILE_TIME(int, numQuregs, NumQuregs, inQuregs.size()); @@ -1076,13 +1165,13 @@ void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs for (qindex n=0; n coeffs void cpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr) { qindex numIts = outQureg.numAmpsPerNode; - qcomp* out = outQureg.cpuAmps; - qcomp* in = inDensMatr.cpuAmps; + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + cpu_qcomp* inAmps = getCpuQcompPtr(inDensMatr.cpuAmps); #pragma omp parallel for if(outQureg.isMultithreaded) for (qindex n=0; n ta // each outer iteration sets one element of outQureg qindex numOuterIts = outQureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* inAmps = getCpuQcompPtr(inQureg.cpuAmps); + cpu_qcomp* outAmps = getCpuQcompPtr(outQureg.cpuAmps); + // targs and allTargs are sorted, but pairTargs is arbitrarily ordered (though corresponding targs) auto allTargsSorted = util_getSorted(targs, pairTargs); @@ -1750,7 +1895,7 @@ void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta qindex k = insertBits(n, allTargsSorted.data(), numAllTargs, 0); // loop 
may be unrolled // each outQureg amp results from summing 2^targs inQureg amps - qcomp outAmp = 0; + cpu_qcomp outAmp = getCpuQcomp(0,0); // loop may be unrolled for (qindex j=0; j ta i = setBits(i, targs .data(), numTargPairs, j); // loop may be unrolled i = setBits(i, pairTargs.data(), numTargPairs, j); // loop may be unrolled - outAmp += inQureg.cpuAmps[i]; + outAmp += inAmps[i]; } - outQureg.cpuAmps[n] = outAmp; + outAmps[n] = outAmp; } } @@ -1797,9 +1942,12 @@ qreal cpu_statevec_calcTotalProb_sub(Qureg qureg) { // every amp, iterated independently, contributes to the probability qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded) for (qindex n=0; n qubi // (>=1 since all qubits are in suffix, so qubits.size() <= suffix size) qindex numIts = qureg.numAmpsPerNode / powerOf2(qubits.size()); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(qubits); // all in suffix auto qubitStateMask = util_getBitMask(qubits, outcomes); @@ -1862,7 +2016,7 @@ qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi // i = nth local index where qubits are in the specified outcome state qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask); - prob += std::norm(qureg.cpuAmps[i]); + prob += norm(amps[i]); } return prob; @@ -1884,6 +2038,9 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + auto sortedQubits = util_getSorted(qubits); // all in suffix, with corresponding bra's all in suffix auto qubitStateMask = 
util_getBitMask(qubits, outcomes); @@ -1899,7 +2056,7 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector qubi // j = local, flat, density-matrix index of diagonal amp corresponding to state i qindex j = fast_getQuregLocalIndexOfDiagonalAmp(i, firstDiagInd, numAmpsPerCol); - prob += std::real(qureg.cpuAmps[j]); + prob += real(amps[j]); } return prob; @@ -1914,6 +2071,9 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu // every amp contributes to a statevector prob qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + // use template param to compile-time unroll loop in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); qindex numOutcomes = powerOf2(numBits); @@ -1930,7 +2090,7 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n or if constexpr (Conj) { - rhoAmp = std::conj(rhoAmp); - colAmp = std::conj(colAmp); + rhoAmp = conj(rhoAmp); + colAmp = conj(colAmp); } else - rowAmp = std::conj(rowAmp); + rowAmp = conj(rowAmp); - qcomp term = rhoAmp * rowAmp * colAmp; - fidRe += std::real(term); - fidIm += std::imag(term); + cpu_qcomp term = rhoAmp * rowAmp * colAmp; + fidRe += real(term); + fidIm += imag(term); } return qcomp(fidRe, fidIm); @@ -2095,11 +2270,14 @@ qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { qindex numIts = qureg.numAmpsPerNode; qindex targMask = util_getBitMask(targs); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + #pragma omp parallel for reduction(+:value) if(qureg.isMultithreaded) for (qindex n=0; n targs) { qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + 
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + qindex targMask = util_getBitMask(targs); #pragma omp parallel for reduction(+:valueRe,valueIm) if(qureg.isMultithreaded) @@ -2128,10 +2309,10 @@ qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { // r = global row of nth local diagonal, which determines amp sign qindex r = n + firstDiagInd; int sign = fast_getPlusOrMinusMaskedBitParity(r, targMask); - qcomp term = sign * qureg.cpuAmps[i]; + cpu_qcomp term = sign * amps[i]; - valueRe += std::real(term); - valueIm += std::imag(term); + valueRe += real(term); + valueIm += imag(term); } return qcomp(valueRe, valueIm); @@ -2147,6 +2328,9 @@ qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector x, vector x, vector qindex numAmpsPerCol = powerOf2(qureg.numQubits); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + // these masks indicate global paulis (i.e. 
not just suffix) qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); @@ -2237,10 +2428,10 @@ qcomp cpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector // sign = +-1 induced by Y and Z (excludes Y's imaginary factors) int sign = fast_getPlusOrMinusMaskedBitParity(i, maskYZ); - qcomp term = sign * qureg.cpuAmps[m]; + cpu_qcomp term = sign * amps[m]; - valueRe += std::real(term); - valueIm += std::imag(term); + valueRe += real(term); + valueIm += imag(term); } // scale by i^numY (because sign above excluded i) @@ -2267,10 +2458,16 @@ qcomp cpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr // every amp, iterated independently, contributes to the expectation value qindex numIts = qureg.numAmpsPerNode; + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); + cpu_qcomp* elems = getCpuQcompPtr(matr.cpuElems); + cpu_qcomp expo = getCpuQcomp(exponent); + (void) expo; // silence when unused + #pragma omp parallel for reduction(+:valueRe,valueIm) if(qureg.isMultithreaded||matr.isMultithreaded) for (qindex n=0; n qubits, vecto // visit every amp, setting to zero or multiplying it by renorm qindex numIts = qureg.numAmpsPerNode; - qreal renorm = 1 / std::sqrt(prob); + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // binary value of targeted qubits in basis states which are to be retained qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + qreal renorm = 1 / std::sqrt(prob); // use template param to compile-time unroll loop in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); @@ -2383,8 +2589,7 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto qindex val = getValueOfBits(n, qubits.data(), numBits); // multiply amp with renorm or zero, if qubit value matches or disagrees - qcomp
fac = renorm * (val == retainValue); - qureg.cpuAmps[n] *= fac; + amps[n] *= renorm * (val == retainValue); } } @@ -2401,10 +2606,13 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto // visit every amp, setting most to zero and multiplying the remainder by renorm qindex numIts = qureg.numAmpsPerNode; - qreal renorm = 1 / prob; + + // use cpu_qcomp arithmetic overloads (avoid qcomp's) + cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps); // binary value of targeted qubits in basis states which are to be retained qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size()); + qreal renorm = 1 / prob; // use template param to compile-time unroll loops in getValueOfBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); @@ -2423,8 +2631,7 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, vecto qindex v2 = getValueOfBits(c, qubits.data(), numBits); // multiply amp with renorm or zero if values disagree with given outcomes - qcomp fac = renorm * (v1 == v2) * (retainValue == v1); - qureg.cpuAmps[n] *= fac; + amps[n] *= renorm * (v1 == v2) * (retainValue == v1); } } @@ -2453,7 +2660,7 @@ void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { qureg.numAmpsPerNode, numAmpsPerPage, cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads()); - std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); + std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp); // no cpu_qcomp cast needed } } @@ -2468,7 +2675,7 @@ void cpu_statevec_initDebugState_sub(Qureg qureg) { // i = global index of nth local amp qindex i = concatenateBits(qureg.rank, n, qureg.logNumAmpsPerNode); - qureg.cpuAmps[n] = qcomp(2*i/10., (2*i+1)/10.); + qureg.cpuAmps[n] = qcomp(2*i/10., (2*i+1)/10.); // no cpu_qcomp cast needed } } @@ -2493,6 +2700,6 @@ void cpu_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qureg) #pragma omp for for (qindex i=0; i #include @@ -182,7 +182,7 @@ void 
cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< CUDA_CHECK( custatevecSwapIndexBits( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, targPairs, numTargPairs, // swap mask params seem to be in the reverse order to the remainder of the cuStateVec API @@ -199,7 +199,7 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector< */ -void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, cu_qcomp* flatMatrElems, bool applyAdj) { +void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, gpu_qcomp* flatMatrElems, bool applyAdj) { // this function is called 'subA' instead of just 'sub', because it is also called in // the one-target case whereby it is strictly the embarrassingly parallel _subA scenario @@ -210,7 +210,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector CUDA_CHECK( custatevecApplyMatrix( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, flatMatrElems, CUQUANTUM_QCOMP, CUSTATEVEC_MATRIX_LAYOUT_ROW, applyAdj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -222,7 +222,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector // there is no bespoke cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subB() -void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, cu_qcomp* flatMatrElems, bool conj) { +void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vector ctrlStates, vector targs, gpu_qcomp* flatMatrElems, bool conj) { // beware that despite diagonal matrices being embarrassingly parallel, // the target qubits must still all be
suffix-only to avoid a cuStateVec error @@ -239,7 +239,7 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl CUDA_CHECK( custatevecApplyGeneralizedPermutationMatrix( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, perm, flatMatrElems, CUQUANTUM_QCOMP, adj, targs.data(), targs.size(), ctrls.data(), ctrlStates.data(), ctrls.size(), @@ -259,9 +259,9 @@ void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrl void cuquantum_densmatr_oneQubitDephasing_subA(Qureg qureg, int qubit, qreal prob) { // effect the superoperator as a two-qubit diagonal upon a statevector suffix state - cu_qcomp a = {1, 0}; - cu_qcomp b = {1-2*prob, 0}; - cu_qcomp elems[] = {a, b, b, a}; + gpu_qcomp a = {1, 0}; + gpu_qcomp b = {1-2*prob, 0}; + gpu_qcomp elems[] = {a, b, b, a}; vector targs {qubit, util_getBraQubit(qubit,qureg)}; bool conj = false; @@ -275,8 +275,8 @@ void cuquantum_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal // equivalent to a state-controlled global phase upon a statevector, which is // itself a same-element one-qubit diagonal applied to any target int braBit = getBit(qureg.rank, ketQubit - qureg.logNumColsPerNode); - cu_qcomp fac = {1 - 2*prob, 0}; - cu_qcomp elems[] = {fac, fac}; + gpu_qcomp fac = {1 - 2*prob, 0}; + gpu_qcomp elems[] = {fac, fac}; // we choose to target the largest possible qubit, expecting best cuStateVec performance; // note it must still be a suffix qubit since cuQuantum does not know qureg is distributed @@ -296,9 +296,9 @@ void cuquantum_densmatr_twoQubitDephasing_subA(Qureg qureg, int qubitA, int qubi /// are the same, i.e. 
we skip |00><00|, |01><01|, |10><10|, |11><11| // effect the superoperator as a four-qubit diagonal upon a statevector suffix state - cu_qcomp a = {1, 0}; - cu_qcomp b = {1-4*prob/3, 0}; - cu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a}; + gpu_qcomp a = {1, 0}; + gpu_qcomp b = {1-4*prob/3, 0}; + gpu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a}; vector targs {qubitA, qubitB, util_getBraQubit(qubitA,qureg), util_getBraQubit(qubitB,qureg)}; bool conj = false; @@ -331,7 +331,7 @@ qreal cuquantum_statevec_calcTotalProb_sub(Qureg qureg) { CUDA_CHECK( custatevecAbs2SumOnZBasis( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &prob0, &prob1, &qubit, numQubits ) ); qreal total = prob0 + prob1; @@ -346,7 +346,7 @@ qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector(prob); @@ -367,7 +367,7 @@ void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qu CUDA_CHECK( custatevecAbs2SumArray( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outPtr, qubits.data(), qubits.size(), nullptr, nullptr, 0) ); // serially cast and copy output probabilities, if necessary @@ -409,7 +409,7 @@ qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vect CUDA_CHECK( custatevecComputeExpectationsOnPauliBasis( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, &value, termPaulis, numTerms, termTargets, numPaulisPerTerm) ); return static_cast(value); @@ -432,7 +432,7 @@ void cuquantum_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, CUDA_CHECK( custatevecCollapseByBitString( config.handle, - toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, + 
getGpuQcompPtr(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, outcomes.data(), qubits.data(), qubits.size(), prob) ); } diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e..30377850 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -22,12 +22,8 @@ #include "quest/include/types.h" #include "quest/src/core/bitwise.hpp" -#include "quest/src/gpu/gpu_types.cuh" - -// kernels/thrust must use cu_qcomp, never qcomp -#define USE_CU_QCOMP #include "quest/src/core/fastmath.hpp" -#undef USE_CU_QCOMP +#include "quest/src/gpu/gpu_qcomp.cuh" #if ! COMPILE_CUDA #error "A file being compiled somehow included gpu_kernels.hpp despite QuEST not being compiled in GPU-accelerated mode." @@ -93,7 +89,7 @@ __forceinline__ __device__ int cudaGetBitMaskParity(qindex mask) { template __global__ void kernel_statevec_packAmpsIntoBuffer( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* qubits, int numQubits, qindex qubitStateMask ) { GET_THREAD_IND(n, numThreads); @@ -110,7 +106,7 @@ __global__ void kernel_statevec_packAmpsIntoBuffer( __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int qubit1, int qubit2, int qubit3, int bit2 ) { GET_THREAD_IND(n, numThreads); @@ -132,7 +128,7 @@ __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( template __global__ void kernel_statevec_anyCtrlSwap_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int targ1, int targ2 ) { GET_THREAD_IND(n, numThreads); @@ -146,7 +142,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( qindex i10 = flipTwoBits(i01, targ2, targ1); // swap amps - cu_qcomp amp01 = amps[i01]; + gpu_qcomp amp01 = amps[i01]; amps[i01] = amps[i10]; amps[i10] = amp01; } @@ -154,7 
+150,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( template __global__ void kernel_statevec_anyCtrlSwap_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask ) { GET_THREAD_IND(n, numThreads); @@ -172,7 +168,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subB( template __global__ void kernel_statevec_anyCtrlSwap_subC( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlsAndTargMask ) { GET_THREAD_IND(n, numThreads); @@ -197,9 +193,9 @@ __global__ void kernel_statevec_anyCtrlSwap_subC( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ, - cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11 + gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m10, gpu_qcomp m11 ) { GET_THREAD_IND(n, numThreads); @@ -211,8 +207,8 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ - cu_qcomp amp0 = amps[i0]; - cu_qcomp amp1 = amps[i1]; + gpu_qcomp amp0 = amps[i0]; + gpu_qcomp amp1 = amps[i1]; amps[i0] = m00*amp0 + m01*amp1; amps[i1] = m10*amp0 + m11*amp1; @@ -221,9 +217,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, - cu_qcomp fac0, cu_qcomp fac1 + gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); @@ -246,12 +242,12 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( template __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( - cu_qcomp* amps, qindex 
numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, - cu_qcomp m00, cu_qcomp m01, cu_qcomp m02, cu_qcomp m03, - cu_qcomp m10, cu_qcomp m11, cu_qcomp m12, cu_qcomp m13, - cu_qcomp m20, cu_qcomp m21, cu_qcomp m22, cu_qcomp m23, - cu_qcomp m30, cu_qcomp m31, cu_qcomp m32, cu_qcomp m33 + gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m02, gpu_qcomp m03, + gpu_qcomp m10, gpu_qcomp m11, gpu_qcomp m12, gpu_qcomp m13, + gpu_qcomp m20, gpu_qcomp m21, gpu_qcomp m22, gpu_qcomp m23, + gpu_qcomp m30, gpu_qcomp m31, gpu_qcomp m32, gpu_qcomp m33 ) { GET_THREAD_IND(n, numThreads); @@ -265,10 +261,10 @@ __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( qindex i11 = flipBit(i01, targ2); // note amps00 and amps01 are strided by 2^targ1, and amps00 and amps10 are strided by 2^targ2 - cu_qcomp amp00 = amps[i00]; - cu_qcomp amp01 = amps[i01]; - cu_qcomp amp10 = amps[i10]; - cu_qcomp amp11 = amps[i11]; + gpu_qcomp amp00 = amps[i00]; + gpu_qcomp amp01 = amps[i01]; + gpu_qcomp amp10 = amps[i10]; + gpu_qcomp amp11 = amps[i11]; // amps[i_n] = sum_j elems[n][j] amp[i_n] amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11; @@ -295,9 +291,9 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr template __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, - cu_qcomp* flatMatrElems + gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(n, numThreads); @@ -310,7 +306,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // spill to local memory). 
Hence, this _subA() function is not a subroutine // despite some logic being common to non-compile-time _subB(), and hence // why the loops below are explicitly compile-time unrolled - REGISTER cu_qcomp privateCache[1 << NumTargs]; + REGISTER gpu_qcomp privateCache[1 << NumTargs]; // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1) SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); @@ -335,7 +331,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs, NumTargs, k); // loop will be unrolled - amps[i] = getCuQcomp(0, 0); + amps[i] = getGpuQcomp(0, 0); // force unroll to ensure compile-time cache indices #pragma unroll @@ -349,12 +345,12 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( h = fast_getMatrixFlatIndex(k, l, numTargAmps); // optionally conjugate matrix elem - cu_qcomp elem = flatMatrElems[h]; + gpu_qcomp elem = flatMatrElems[h]; if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); // thread-private cache is accessed with compile-time known index - amps[i] = amps[i] + (elem * privateCache[l]); + amps[i] += elem * privateCache[l]; } } } @@ -362,11 +358,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( template __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( - cu_qcomp* globalCache, - cu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, + gpu_qcomp* globalCache, + gpu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, int numTargBits, qindex numTargAmps, - cu_qcomp* flatMatrElems + gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(t, numThreads); @@ -398,7 +394,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( // i = nth local index where ctrls are active and targs form value k qindex i = setBits(i0, targs, numTargBits, k); // loop may be unrolled - amps[i] = getCuQcomp(0, 0); + 
amps[i] = getGpuQcomp(0, 0); for (qindex l=0; l __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int targ, - cu_qcomp m1, cu_qcomp m2 + gpu_qcomp m1, gpu_qcomp m2 ) { GET_THREAD_IND(n, numThreads); @@ -462,7 +458,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( qindex i = concatenateBits(rank, j, logNumAmpsPerNode); int b = getBit(i, targ); - amps[j] = amps[j] * (m1 + b * (m2 - m1)); + amps[j] *= m1 + b * (m2 - m1); } @@ -474,9 +470,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, - cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4 + gpu_qcomp m1, gpu_qcomp m2, gpu_qcomp m3, gpu_qcomp m4 ) { GET_THREAD_IND(n, numThreads); @@ -502,8 +498,8 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( // k = local elem index int k = getTwoBits(i, targ2, targ1); - cu_qcomp elems[] = {m1, m2, m3, m4}; - amps[j] = amps[j] * elems[k]; + gpu_qcomp elems[] = {m1, m2, m3, m4}; + amps[j] *= elems[k]; } @@ -515,9 +511,9 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, - cu_qcomp* elems, cu_qcomp exponent + gpu_qcomp* elems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -545,15 +541,15 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( // t 
= value of targeted bits, which may be in the prefix substate qindex t = getValueOfBits(i, targs, numTargBits); - cu_qcomp elem = elems[t]; + gpu_qcomp elem = elems[t]; if constexpr (HasPower) - elem = getCompPower(elem, exponent); + elem = pow(elem, exponent); if constexpr (ApplyConj) - elem.y *= -1; + elem = conj(elem); - amps[j] = amps[j] * elem; + amps[j] *= elem; } @@ -565,20 +561,20 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( template __global__ void kernel_densmatr_allTargDiagMatr_sub( - cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - cu_qcomp* elems, qindex numElems, cu_qcomp exponent + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + gpu_qcomp* elems, qindex numElems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); - cu_qcomp fac = getCuQcomp(1, 0); + gpu_qcomp fac = getGpuQcomp(1, 0); if constexpr (ApplyLeft) { qindex i = fast_getQuregGlobalRowFromFlatIndex(n, numElems); - cu_qcomp term = elems[i]; + gpu_qcomp term = elems[i]; if constexpr (HasPower) - term = getCompPower(term, exponent); + term = pow(term, exponent); fac = term; } @@ -587,18 +583,18 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( qindex m = concatenateBits(rank, n, logNumAmpsPerNode); qindex j = fast_getQuregGlobalColFromFlatIndex(m, numElems); - cu_qcomp term = elems[j]; + gpu_qcomp term = elems[j]; if constexpr (HasPower) - term = getCompPower(term, exponent); + term = pow(term, exponent); if constexpr (ConjRight) - term.y *= -1; + term = conj(term); - fac = fac * term; + fac *= term; } - amps[n] = amps[n] * fac; + amps[n] *= fac; } @@ -610,10 +606,10 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsStateMask, int* targsXY, int numXY, qindex maskXY, qindex maskYZ, - cu_qcomp powI, cu_qcomp 
ampFac, cu_qcomp pairAmpFac + gpu_qcomp powI, gpu_qcomp ampFac, gpu_qcomp pairAmpFac ) { GET_THREAD_IND(t, numThreads); @@ -636,11 +632,11 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( // determine whether to multiply amps by +-1 or +-i int parA = cudaGetBitMaskParity(iA & maskYZ); int parB = cudaGetBitMaskParity(iB & maskYZ); - cu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA); - cu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB); + gpu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA); + gpu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB); - cu_qcomp ampA = amps[iA]; - cu_qcomp ampB = amps[iB]; + gpu_qcomp ampA = amps[iA]; + gpu_qcomp ampB = amps[iB]; // mix or swap scaled amp pair amps[iA] = (ampFac * ampA) + (pairAmpFac * coeffB * ampB); @@ -650,10 +646,10 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( - cu_qcomp* amps, cu_qcomp* buffer, qindex numThreads, + gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, qindex maskXY, qindex maskYZ, qindex bufferMaskXY, - cu_qcomp powI, cu_qcomp thisAmpFac, cu_qcomp otherAmpFac + gpu_qcomp powI, gpu_qcomp thisAmpFac, gpu_qcomp otherAmpFac ) { GET_THREAD_IND(n, numThreads); @@ -671,7 +667,7 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( // determine whether to multiply buffer amp by +-1 or +-i int par = cudaGetBitMaskParity(k & maskYZ); - cu_qcomp coeff = powI * fast_getPlusOrMinusOne(par); + gpu_qcomp coeff = powI * fast_getPlusOrMinusOne(par); amps[i] = (thisAmpFac * amps[i]) + (otherAmpFac * coeff * buffer[j]); } @@ -685,9 +681,9 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( template __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( - cu_qcomp* amps, qindex numThreads, + gpu_qcomp* amps, qindex numThreads, int* ctrls, int numCtrls, qindex ctrlStateMask, qindex 
targMask, - cu_qcomp fac0, cu_qcomp fac1 + gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); @@ -700,8 +696,8 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( // apply phase to amp depending on parity of targets in global index int p = cudaGetBitMaskParity(i & targMask); - cu_qcomp facs[] = {fac0, fac1}; - amps[i] = amps[i] * facs[p]; + gpu_qcomp facs[] = {fac0, fac1}; + amps[i] *= facs[p]; } @@ -713,18 +709,18 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( template __global__ void kernel_statevec_setQuregToWeightedSum_sub( - cu_qcomp* outAmps, qindex numThreads, - cu_qcomp* coeffs, cu_qcomp** inAmps, int numQuregs + gpu_qcomp* outAmps, qindex numThreads, + gpu_qcomp* coeffs, gpu_qcomp** inAmps, int numQuregs ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll below loop SET_VAR_AT_COMPILE_TIME(int, numInner, NumQuregs, numQuregs); - cu_qcomp amp = getCuQcomp(0, 0); + gpu_qcomp amp = getGpuQcomp(0, 0); for (int q=0; q __global__ void kernel_densmatr_partialTrace_sub( - cu_qcomp* ampsIn, cu_qcomp* ampsOut, qindex numThreads, + gpu_qcomp* ampsIn, gpu_qcomp* ampsOut, qindex numThreads, int* ketTargs, int* pairTargs, int* allTargs, int numKetTargs ) { GET_THREAD_IND(n, numThreads); @@ -1142,7 +1138,7 @@ __global__ void kernel_densmatr_partialTrace_sub( SET_VAR_AT_COMPILE_TIME(int, numTargPairs, NumTargs, numKetTargs); // may be inferred at compile-time - int numAllTargs = 2*numTargPairs; + int numAllTargs = 2 * numTargPairs; qindex numIts = powerOf2(numTargPairs); /// @todo @@ -1154,7 +1150,7 @@ __global__ void kernel_densmatr_partialTrace_sub( qindex k = insertBits(n, allTargs, numAllTargs, 0); // loop may be unrolled // each outQureg amp results from summing 2^targs inQureg amps - cu_qcomp outAmp = getCuQcomp(0, 0); + gpu_qcomp outAmp = getGpuQcomp(0, 0); // loop may be unrolled for (qindex j=0; j __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( - qreal* 
outProbs, cu_qcomp* amps, qindex numThreads, + qreal* outProbs, gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, int* qubits, int numQubits ) { @@ -1194,7 +1190,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( // use template param to compile-time unroll below loops SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); - qreal prob = getCompNorm(amps[n]); + qreal prob = norm(amps[n]); // i = global index corresponding to n qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -1208,7 +1204,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( template __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( - qreal* outProbs, cu_qcomp* amps, qindex numThreads, + qreal* outProbs, gpu_qcomp* amps, qindex numThreads, qindex firstDiagInd, qindex numAmpsPerCol, int rank, qindex logNumAmpsPerNode, int* qubits, int numQubits @@ -1220,7 +1216,7 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( // i = index of nth local diagonal elem qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); - qreal prob = getCompReal(amps[i]); + qreal prob = real(amps[i]); // j = global index of i qindex j = concatenateBits(rank, i, logNumAmpsPerNode); diff --git a/quest/src/gpu/gpu_qcomp.cuh b/quest/src/gpu/gpu_qcomp.cuh new file mode 100644 index 00000000..b7d48e13 --- /dev/null +++ b/quest/src/gpu/gpu_qcomp.cuh @@ -0,0 +1,131 @@ +/** @file + * Definition of gpu_qcomp, an extension of base_qcomp and a + * compatible alternative to the user-facing qcomp, used + * exclusively by the GPU backend and which is compatible with + * both CUDA and HIP. + * + * This header is safe to re-include by multiple files because typedef + * redefinition is legal in C++, and all functions herein are inline. + * Furthermore, since it is only ever parsed by nvcc, the __host__ symbols + * are safely processed by other nvcc-only GPU files, like the cuquantum backend. 
+ * + * @author Tyson Jones + * @author Oliver Brown (patched former HIP arithmetic overloads) + * @author Erich Essmann (patched former ROCm build issues) + */ + +#ifndef GPU_QCOMP_CUH +#define GPU_QCOMP_CUH + +#include "quest/include/config.h" +#include "quest/include/types.h" +#include "quest/include/precision.h" + +#include "quest/src/core/inliner.hpp" +#include "quest/src/core/base_qcomp.hpp" + +#if ! COMPILE_CUDA + #error "A file being compiled somehow included gpu_qcomp.hpp despite QuEST not being compiled in GPU-accelerated mode." +#endif + +#if (FLOAT_PRECISION == 4) + #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." +#endif + +#if defined(__HIP__) + #include "quest/src/gpu/cuda_to_hip.hpp" +#endif + +#include + + + +/* + * DEFINE GPU_QCOMP + * + * which is safe to typdef and define additional overloads + * below, since never witnessed outside the GPU Backend + */ + +typedef base_qcomp gpu_qcomp; + + + +/* + * CONVERTERS + * + * which merely wrap the base_qcomp functions for clarity + * in the GPU source code, disambiguating from cpu_qcomp + */ + +INLINE gpu_qcomp* getGpuQcompPtr(qcomp* list) { + return getBaseQcompPtr(list); +} + +INLINE gpu_qcomp getGpuQcomp(qreal re, qreal im) { + return getBaseQcomp(re, im); +} + +INLINE gpu_qcomp getGpuQcomp(const qcomp& a) { + return getBaseQcomp(a); +} + +template +__host__ inline std::array getGpuQcompArray(qcomp matr[Dim]) { + static_assert(Dim == 2 || Dim == 4); + + // it's crucial we explicitly copy over the elements, + // rather than just reinterpret the pointer, to avoid + // segmentation faults when memory misaligns (like on HIP) + + std::array out; + for (int i=0; i +__host__ inline std::array getFlattenedGpuQcompMatrix(qcomp matr[Dim][Dim]) { + static_assert(Dim == 2 || Dim == 4); + + std::array out; + for (int i=0; i qubits, vector <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), 
getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask ); @@ -173,7 +173,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu qindex sendInd = getSubBufferSendInd(qureg); kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, qubit1, qubit2, qubit3, bit2 ); @@ -214,7 +214,7 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); kernel_statevec_anyCtrlSwap_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 ); @@ -239,7 +239,7 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlSwap_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask ); @@ -264,7 +264,7 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); kernel_statevec_anyCtrlSwap_subC <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask ); @@ -293,7 +293,7 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToCuQcomps(matr); + auto arr = getFlattenedGpuQcompMatrix<2>(matr.elems); // 
explicit template for MSVC, grr! cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -304,10 +304,10 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); - auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); + auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -333,9 +333,9 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, - toCuQcomp(fac0), toCuQcomp(fac1) + getGpuQcomp(fac0), getGpuQcomp(fac1) ); #else @@ -362,7 +362,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM bool applyAdj = false; - auto arr = unpackMatrixToCuQcomps(matr); + auto arr = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data(), applyAdj); #elif COMPILE_CUDA @@ -374,10 +374,10 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) 
- auto m = unpackMatrixToCuQcomps(matr); + auto m = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15] @@ -405,7 +405,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #if COMPILE_CUQUANTUM - auto matrElemsPtr = toCuQcomps(matr.gpuElemsFlat); + auto matrElemsPtr = getGpuQcompPtr(matr.gpuElemsFlat); auto matrElemsLen = matr.numRows * matr.numRows; // assert the pre-condition assumed below @@ -437,8 +437,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector(targs.size(),0)); // unpacking args (to better distinguish below signatures) - auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto matrPtr = toCuQcomps(matr.gpuElemsFlat); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto matrPtr = getGpuQcompPtr(matr.gpuElemsFlat); auto qubitsPtr = getPtr(deviceQubits); auto targsPtr = getPtr(deviceTargs); auto nCtrls = ctrls.size(); @@ -506,7 +506,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve kernel_statevec_anyCtrlManyTargDenseMatr <<>> ( - toCuQcomps(cache), + getGpuQcompPtr(cache), ampsPtr, numThreads, numBatchesPerThread, qubitsPtr, nCtrls, qubitStateMask, targsPtr, targs.size(), powerOf2(targs.size()), matrPtr @@ -550,7 +550,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU .elems array directly to cuQuantum which will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, toCuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, 
ctrls, ctrlStates, {targ}, getGpuQcompPtr(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -570,10 +570,10 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToCuQcomps(matr); + auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -618,7 +618,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec bool conj = false; // we can pass 1D CPU array directly to cuQuantum, and it will recognise host pointers - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, toCuQcomps(matr.elems), conj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, getGpuQcompPtr(matr.elems), conj); // explicitly return to avoid re-simulation below return; @@ -638,10 +638,10 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - auto elems = unpackMatrixToCuQcomps(matr); + auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr! 
kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -686,7 +686,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec // cuQuantum cannot handle HasPower, in which case we fall back to custom kernel if (!HasPower && util_areAllQubitsInSuffix(targs, qureg)) { - cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, toCuQcomps(util_getGpuMemPtr(matr)), ApplyConj); + cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, getGpuQcompPtr(util_getGpuMemPtr(matr)), ApplyConj); // must return to avoid re-simulation below return; @@ -709,9 +709,9 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), - toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) + getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent) ); // must return to avoid runtime error below @@ -743,7 +743,7 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp // we always use Thrust because we are doubtful that cuQuantum's // diagonal-matrix facilities are optimised for the all-qubit case - thrust_statevec_allTargDiagMatr_sub(qureg, matr, toCuQcomp(exponent)); + thrust_statevec_allTargDiagMatr_sub(qureg, matr, getGpuQcomp(exponent)); #else error_gpuSimButGpuNotCompiled(); @@ -764,8 +764,8 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr 
matr, qcomp kernel_densmatr_allTargDiagMatr_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent) + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, + getGpuQcompPtr(util_getGpuMemPtr(matr)), matr.numElems, getGpuQcomp(exponent) ); #else @@ -823,10 +823,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly qindex numBlocks = getNumBlocks(numThreads); kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, getPtr(deviceTargs), deviceTargs.size(), - maskXY, maskYZ, toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) + maskXY, maskYZ, getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -854,10 +854,10 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, maskXY, maskYZ, bufferMaskXY, - toCuQcomp(powI), toCuQcomp(ampFac), toCuQcomp(pairAmpFac) + getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -891,9 +891,9 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c qindex targMask = util_getBitMask(targs); kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, - toCuQcomp(fac0), toCuQcomp(fac1) 
+ getGpuQcomp(fac0), getGpuQcomp(fac1) ); #else @@ -920,17 +920,17 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs qindex numBlocks = getNumBlocks(numThreads); // extract amp ptrs from qureg list - vector ptrs; + vector ptrs; ptrs.reserve(inQuregs.size()); for (auto& qureg : inQuregs) - ptrs.push_back(toCuQcomps(qureg.gpuAmps)); + ptrs.push_back(getGpuQcompPtr(qureg.gpuAmps)); // copy coeff and qureg lists into GPU memory - devcuqcompptrs devQuregAmps = ptrs; + devgpuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; kernel_statevec_setQuregToWeightedSum_sub <<>> ( - toCuQcomps(outQureg.gpuAmps), numThreads, + getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size() ); @@ -960,7 +960,7 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subB <<>> ( - outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps), + outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(inQureg.gpuAmps), numThreads, inQureg.numAmps ); @@ -978,7 +978,7 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { qindex numBlocks = getNumBlocks(numThreads); kernel_densmatr_mixQureg_subC <<>> ( - outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer), + outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(outQureg.gpuCommBuffer), numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode ); @@ -1013,7 +1013,7 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) int braQubit = util_getBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac ); #else @@ -1039,7 +1039,7 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg 
qureg, int ketQubit, qreal prob) int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); kernel_densmatr_oneQubitDephasing_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, fac ); #else @@ -1085,7 +1085,7 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi int braQubitB = util_getBraQubit(ketQubitB, qureg); kernel_densmatr_twoQubitDephasing_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols ketQubitA, ketQubitB, braQubitA, braQubitB, term ); @@ -1112,7 +1112,7 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 ); #else @@ -1133,7 +1133,7 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro auto factors = util_getOneQubitDepolarisingFactors(prob); kernel_densmatr_oneQubitDepolarising_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3 ); @@ -1161,7 +1161,7 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, c3 ); @@ -1186,7 +1186,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg 
qureg, int ketQb1, int ketQb2, qreal altc1 = factors.c1 - factors.c2; kernel_densmatr_twoQubitDepolarising_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2 ); @@ -1208,7 +1208,7 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; kernel_densmatr_twoQubitDepolarising_subC <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3 ); @@ -1231,7 +1231,7 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, auto factors = util_getTwoQubitDepolarisingFactors(prob); kernel_densmatr_twoQubitDepolarising_subD <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); @@ -1256,7 +1256,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal fac1 = factors.c1 - fac0; kernel_densmatr_twoQubitDepolarising_subE <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1 ); @@ -1279,7 +1279,7 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2; kernel_densmatr_twoQubitDepolarising_subF <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads, ketQb1, ketQb2, braBit1, braBit2, c2 ); @@ -1306,7 +1306,7 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); kernel_densmatr_oneQubitPauliChannel_subA <<>> ( - 
toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1328,7 +1328,7 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); kernel_densmatr_oneQubitPauliChannel_subB <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1355,7 +1355,7 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { auto factors = util_getOneQubitDampingFactors(prob); kernel_densmatr_oneQubitDamping_subA <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, prob, factors.c1, factors.c2 ); @@ -1375,7 +1375,7 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { auto c2 = util_getOneQubitDampingFactors(prob).c2; kernel_densmatr_oneQubitDamping_subB <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, qubit, c2 + getGpuQcompPtr(qureg.gpuAmps), numThreads, qubit, c2 ); #else @@ -1395,7 +1395,7 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { auto c1 = util_getOneQubitDampingFactors(prob).c1; kernel_densmatr_oneQubitDamping_subC <<>> ( - toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 + getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 ); #else @@ -1413,7 +1413,7 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { qindex recvInd = getBufferRecvInd(); kernel_densmatr_oneQubitDamping_subD <<>> ( - toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, + getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, qubit, prob ); @@ -1444,7 
+1444,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta devints devAllTargs = util_getSorted(targs, pairTargs); kernel_densmatr_partialTrace_sub <<>> ( - toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads, + getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads, getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() ); @@ -1564,7 +1564,7 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() ); @@ -1601,7 +1601,7 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), toCuQcomps(qureg.gpuAmps), + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() @@ -1633,8 +1633,8 @@ qcomp gpu_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); - return toQcomp(prod); + gpu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB); + return getQcomp(prod); #else error_gpuSimButGpuNotCompiled(); @@ -1661,8 +1661,8 @@ qcomp gpu_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); - return toQcomp(fid); + gpu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub(rho, psi); + return getQcomp(fid); #else 
error_gpuSimButGpuNotCompiled(); @@ -1702,8 +1702,8 @@ qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); - return toQcomp(value); + gpu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1720,8 +1720,8 @@ qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector x, vector x, vector #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); - return toQcomp(value); + gpu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1770,9 +1770,9 @@ qcomp gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp expo = toCuQcomp(exponent); - cu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); - return toQcomp(value); + gpu_qcomp expo = getGpuQcomp(exponent); + gpu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1787,9 +1787,9 @@ qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr #if COMPILE_CUQUANTUM || COMPILE_CUDA - cu_qcomp expo = toCuQcomp(exponent); - cu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); - return toQcomp(value); + gpu_qcomp expo = getGpuQcomp(exponent); + gpu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub(qureg, matr, expo); + return getQcomp(value); #else error_gpuSimButGpuNotCompiled(); @@ -1866,7 +1866,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector void gpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) { #if COMPILE_CUDA || COMPILE_CUQUANTUM - 
thrust_statevec_initUniformState(qureg, toCuQcomp(amp)); + thrust_statevec_initUniformState(qureg, getGpuQcomp(amp)); #else error_gpuSimButGpuNotCompiled(); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 3b653dfd..07a65054 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -33,18 +33,14 @@ #include "quest/include/paulis.h" #include "quest/include/matrices.h" -#include "quest/src/gpu/gpu_types.cuh" +#include "quest/src/gpu/gpu_qcomp.cuh" #include "quest/src/core/errors.hpp" #include "quest/src/core/bitwise.hpp" #include "quest/src/core/constants.hpp" #include "quest/src/core/utilities.hpp" #include "quest/src/core/randomiser.hpp" -#include "quest/src/comm/comm_config.hpp" - -// kernels/thrust must use cu_qcomp, never qcomp -#define USE_CU_QCOMP #include "quest/src/core/fastmath.hpp" -#undef USE_CU_QCOMP +#include "quest/src/comm/comm_config.hpp" #include #include @@ -68,7 +64,7 @@ * copy constructor (devicevec d_vec = hostvec). 
The pointer * to the data (d_vec.data()) can be cast into a raw pointer * and passed directly to CUDA kernels (though qcomp must be - * reinterpreted to cu_qcomp) + * reinterpreted to gpu_qcomp) */ @@ -110,18 +106,18 @@ devreals getDeviceRealsVec(qindex dim) { using devcomps = thrust::device_vector; -cu_qcomp* getPtr(devcomps& comps) { +gpu_qcomp* getPtr(devcomps& comps) { - // devcomps -> qcomp -> cu_qcomp + // devcomps -> qcomp -> gpu_qcomp qcomp* ptr = thrust::raw_pointer_cast(comps.data()); - return toCuQcomps(ptr); + return getGpuQcompPtr(ptr); } // father forgive me for I have sinned -using devcuqcompptrs = thrust::device_vector; +using devgpuqcompptrs = thrust::device_vector; -cu_qcomp** getPtr(devcuqcompptrs& ptrs) { +gpu_qcomp** getPtr(devgpuqcompptrs& ptrs) { return thrust::raw_pointer_cast(ptrs.data()); } @@ -136,13 +132,13 @@ cu_qcomp** getPtr(devcuqcompptrs& ptrs) { */ -thrust::device_ptr getStartPtr(cu_qcomp* amps) { +thrust::device_ptr getStartPtr(gpu_qcomp* amps) { return thrust::device_pointer_cast(amps); } auto getStartPtr(qcomp* amps) { - return getStartPtr(toCuQcomps(amps)); + return getStartPtr(getGpuQcompPtr(amps)); } @@ -177,37 +173,37 @@ auto getEndPtr(FullStateDiagMatr matr) { struct functor_getAmpConj { - __host__ __device__ cu_qcomp operator()(cu_qcomp amp) { - return getCompConj(amp); + __host__ __device__ gpu_qcomp operator()(gpu_qcomp amp) { + return conj(amp); } }; struct functor_getAmpNorm { - __host__ __device__ qreal operator()(cu_qcomp amp) { - return getCompNorm(amp); + __host__ __device__ qreal operator()(gpu_qcomp amp) { + return norm(amp); } }; struct functor_getAmpReal { - __host__ __device__ qreal operator()(cu_qcomp amp) { - return getCompReal(amp); + __host__ __device__ qreal operator()(gpu_qcomp amp) { + return real(amp); } }; struct functor_getAmpConjProd { - __host__ __device__ cu_qcomp operator()(cu_qcomp braAmp, cu_qcomp ketAmp) { - return getCompConj(braAmp) * ketAmp; + __host__ __device__ gpu_qcomp 
operator()(gpu_qcomp braAmp, gpu_qcomp ketAmp) { + return conj(braAmp) * ketAmp; } }; struct functor_getNormOfAmpDif { - __host__ __device__ qreal operator()(cu_qcomp amp1, cu_qcomp amp2) { - return getCompNorm(amp1 - amp2); + __host__ __device__ qreal operator()(gpu_qcomp amp1, gpu_qcomp amp2) { + return norm(amp1 - amp2); } }; @@ -220,11 +216,11 @@ struct functor_getExpecStateVecZTerm { qindex targMask; functor_getExpecStateVecZTerm(qindex mask) : targMask(mask) {} - __device__ qreal operator()(qindex ind, cu_qcomp amp) { + __device__ qreal operator()(qindex ind, gpu_qcomp amp) { int par = cudaGetBitMaskParity(ind & targMask); // device-only int sign = fast_getPlusOrMinusOne(par); - return sign * getCompNorm(amp); + return sign * norm(amp); } }; @@ -235,12 +231,12 @@ struct functor_getExpecDensMatrZTerm { // in the expectation value of Z of a density matrix qindex numAmpsPerCol, firstDiagInd, targMask; - cu_qcomp* amps; + gpu_qcomp* amps; - functor_getExpecDensMatrZTerm(qindex dim, qindex diagInd, qindex mask, cu_qcomp* _amps) : + functor_getExpecDensMatrZTerm(qindex dim, qindex diagInd, qindex mask, gpu_qcomp* _amps) : numAmpsPerCol(dim), firstDiagInd(diagInd), targMask(mask), amps(_amps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); qindex r = n + firstDiagInd; @@ -259,19 +255,19 @@ struct functor_getExpecStateVecPauliTerm { // at least one X or Y) of a statevector qindex maskXY, maskYZ; - cu_qcomp *amps, *pairAmps; + gpu_qcomp *amps, *pairAmps; - functor_getExpecStateVecPauliTerm(qindex _maskXY, qindex _maskYZ, cu_qcomp* _amps, cu_qcomp* _pairAmps) : + functor_getExpecStateVecPauliTerm(qindex _maskXY, qindex _maskYZ, gpu_qcomp* _amps, gpu_qcomp* _pairAmps) : maskXY(_maskXY), maskYZ(_maskYZ), amps(_amps), pairAmps(_pairAmps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex j = 
flipBits(n, maskXY); int par = cudaGetBitMaskParity(j & maskYZ); // device-only int sign = fast_getPlusOrMinusOne(par); // sign excludes i^numY contribution - return sign * getCompConj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer + return sign * conj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer } }; @@ -284,12 +280,12 @@ struct functor_getExpecDensMatrPauliTerm { qindex maskXY, maskYZ; qindex numAmpsPerCol, firstDiagInd; - cu_qcomp *amps; + gpu_qcomp *amps; - functor_getExpecDensMatrPauliTerm(qindex _maskXY, qindex _maskYZ, qindex _numAmpsPerCol, qindex _firstDiagInd, cu_qcomp* _amps) : + functor_getExpecDensMatrPauliTerm(qindex _maskXY, qindex _maskYZ, qindex _numAmpsPerCol, qindex _firstDiagInd, gpu_qcomp* _amps) : maskXY(_maskXY), maskYZ(_maskYZ), numAmpsPerCol(_numAmpsPerCol), firstDiagInd(_firstDiagInd), amps(_amps) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { qindex r = n + firstDiagInd; qindex i = flipBits(r, maskXY); @@ -310,19 +306,19 @@ struct functor_getExpecDensMatrDiagMatrTerm { // value of a FullStateDiagMatr upon a density matrix qindex numAmpsPerCol, firstDiagInd; - cu_qcomp *amps, *elems, expo; + gpu_qcomp *amps, *elems, expo; - functor_getExpecDensMatrDiagMatrTerm(qindex dim, qindex diagInd, cu_qcomp* _amps, cu_qcomp* _elems, cu_qcomp _expo) : + functor_getExpecDensMatrDiagMatrTerm(qindex dim, qindex diagInd, gpu_qcomp* _amps, gpu_qcomp* _elems, gpu_qcomp _expo) : numAmpsPerCol(dim), firstDiagInd(diagInd), amps(_amps), elems(_elems), expo(_expo) {} - __device__ cu_qcomp operator()(qindex n) { + __device__ gpu_qcomp operator()(qindex n) { - cu_qcomp elem = elems[n]; + gpu_qcomp elem = elems[n]; if constexpr (HasPower && ! 
UseRealPow) - elem = getCompPower(elem, expo); + elem = pow(elem, expo); if constexpr (HasPower && UseRealPow) - elem = getCuQcomp(pow(getCompReal(elem), getCompReal(expo)),0); // CUDA pow(qreal,qreal) + elem = getGpuQcomp(pow(real(elem), real(expo)), 0); // CUDA pow(qreal,qreal) qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); @@ -339,13 +335,13 @@ struct functor_setAmpToPauliStrSumElem { qindex suffixLen; qindex numTerms; - cu_qcomp* amps; - cu_qcomp* coeffs; + gpu_qcomp* amps; + gpu_qcomp* coeffs; PauliStr* strings; functor_setAmpToPauliStrSumElem( int rank, qindex dim, qindex suffixLen, qindex numTerms, - cu_qcomp* amps, cu_qcomp* coeffs, PauliStr* strings + gpu_qcomp* amps, gpu_qcomp* coeffs, PauliStr* strings ) : rank(rank), dim(dim), suffixLen(suffixLen), numTerms(numTerms), amps(amps), coeffs(coeffs), strings(strings) @@ -381,7 +377,7 @@ struct functor_mixAmps { qreal outProb, inProb; functor_mixAmps(qreal out, qreal in) : outProb(out), inProb(in) {} - __host__ __device__ cu_qcomp operator()(cu_qcomp outAmp, cu_qcomp inAmp) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp outAmp, gpu_qcomp inAmp) { return (outProb * outAmp) + (inProb * inAmp); } @@ -397,18 +393,18 @@ struct functor_multiplyElemPowerWithAmpOrNorm { // a statevector amp (used when modifying the state) // or its norm (used when calculating expected values) - cu_qcomp exponent; - functor_multiplyElemPowerWithAmpOrNorm(cu_qcomp power) : exponent(power) {} + gpu_qcomp exponent; + functor_multiplyElemPowerWithAmpOrNorm(gpu_qcomp power) : exponent(power) {} - __host__ __device__ cu_qcomp operator()(cu_qcomp quregAmp, cu_qcomp matrElem) { + __host__ __device__ gpu_qcomp operator()(gpu_qcomp quregAmp, gpu_qcomp matrElem) { if constexpr (HasPower && ! 
UseRealPow) - matrElem = getCompPower(matrElem, exponent); + matrElem = pow(matrElem, exponent); if constexpr (HasPower && UseRealPow) - matrElem = getCuQcomp(pow(getCompReal(matrElem), getCompReal(exponent)),0); // CUDA pow(qreal,qreal) + matrElem = getGpuQcomp(pow(real(matrElem), real(exponent)), 0); // CUDA pow(qreal,qreal) if constexpr (Norm) - quregAmp = getCuQcomp(getCompNorm(quregAmp), 0); + quregAmp = getGpuQcomp(norm(quregAmp), 0); return matrElem * quregAmp; } @@ -466,15 +462,15 @@ template struct functor_getFidelityTerm { int rank, numQubits; qindex logNumAmpsPerNode, numAmpsPerCol; - cu_qcomp *rho, *psi; + gpu_qcomp *rho, *psi; functor_getFidelityTerm( - int _rank, int _numQubits, qindex _logNumAmpsPerNode, qindex _numAmpsPerCol, cu_qcomp* _rho, cu_qcomp* _psi + int _rank, int _numQubits, qindex _logNumAmpsPerNode, qindex _numAmpsPerCol, gpu_qcomp* _rho, gpu_qcomp* _psi ) : rank(_rank), numQubits(_numQubits), logNumAmpsPerNode(_logNumAmpsPerNode), numAmpsPerCol(_numAmpsPerCol), rho(_rho), psi(_psi) {} - __host__ __device__ cu_qcomp operator()(qindex n) { + __host__ __device__ gpu_qcomp operator()(qindex n) { // i = global index of nth local amp of rho qindex i = concatenateBits(rank, n, logNumAmpsPerNode); @@ -484,18 +480,18 @@ struct functor_getFidelityTerm { qindex c = getBitsLeftOfIndex(i, numQubits-1); // collect amps involved in this term - cu_qcomp rhoAmp = rho[n]; - cu_qcomp rowAmp = psi[r]; - cu_qcomp colAmp = psi[c]; + gpu_qcomp rhoAmp = rho[n]; + gpu_qcomp rowAmp = psi[r]; + gpu_qcomp colAmp = psi[c]; // compute term of or if constexpr (Conj) { - rhoAmp = getCompConj(rhoAmp); - colAmp = getCompConj(colAmp); + rhoAmp = conj(rhoAmp); + colAmp = conj(colAmp); } else - rowAmp = getCompConj(rowAmp); + rowAmp = conj(rowAmp); - cu_qcomp fid = rhoAmp * rowAmp * colAmp; + gpu_qcomp fid = rhoAmp * rowAmp * colAmp; return fid; } }; @@ -525,7 +521,7 @@ struct functor_projectStateVec { assert_numTargsMatchesTemplateParam(numTargets, NumTargets); } - 
__host__ __device__ cu_qcomp operator()(qindex n, cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); @@ -562,7 +558,7 @@ struct functor_projectDensMatr { assert_numTargsMatchesTemplateParam(numTargets, NumTargets); } - __host__ __device__ cu_qcomp operator()(qindex n, cu_qcomp amp) { + __host__ __device__ gpu_qcomp operator()(qindex n, gpu_qcomp amp) { // use the compile-time value if possible, to auto-unroll the getValueOfBits() loop below SET_VAR_AT_COMPILE_TIME(int, numBits, NumTargets, numTargets); @@ -603,7 +599,7 @@ struct functor_setRandomStateVecAmp { unsigned baseSeed; functor_setRandomStateVecAmp(unsigned seed) : baseSeed(seed) {} - __host__ __device__ cu_qcomp operator()(qindex ampInd) { + __host__ __device__ gpu_qcomp operator()(qindex ampInd) { // wastefully create new distributions for every amp thrust::random::normal_distribution normDist(0, 1); // mean=0, var=1 @@ -634,8 +630,8 @@ struct functor_setRandomStateVecAmp { auto iphase = thrust::complex(0, phase); auto amp = sqrt(prob) * thrust::exp(iphase); // CUDA sqrt - // cast thrust::complex to cu_qcomp - return getCuQcomp(amp.real(), amp.imag()); + // cast thrust::complex to gpu_qcomp + return getGpuQcomp(amp.real(), amp.imag()); } }; @@ -654,7 +650,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli thrust::device_vector devStrings(in.strings, in.strings + in.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - cu_qcomp* devCoeffsPtr = toCuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = getGpuQcompPtr(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); int rank = out.isDistributed? 
comm_getRank() : 0; @@ -663,7 +659,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli // indicates the PauliStrSum is diagonal (contains only I or Z) auto functor = functor_setAmpToPauliStrSumElem( rank, out.numElems, logNumElemsPerNode, - in.numTerms, toCuQcomps(out.gpuElems), devCoeffsPtr, devStringsPtr); + in.numTerms, getGpuQcompPtr(out.gpuElems), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + out.numElemsPerNode; @@ -677,7 +673,7 @@ void thrust_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pauli */ -void thrust_setElemsToConjugate(cu_qcomp* matrElemsPtr, qindex matrElemsLen) { +void thrust_setElemsToConjugate(gpu_qcomp* matrElemsPtr, qindex matrElemsLen) { auto ptr = getStartPtr(matrElemsPtr); thrust::transform(ptr, ptr + matrElemsLen, ptr, functor_getAmpConj()); @@ -700,13 +696,13 @@ void thrust_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) { thrust::device_vector devStrings(sum.strings, sum.strings + sum.numTerms); // obtain raw pointers which can be passed to fastmath.hpp routines - cu_qcomp* devCoeffsPtr = toCuQcomps(thrust::raw_pointer_cast(devCoeffs.data())); + gpu_qcomp* devCoeffsPtr = getGpuQcompPtr(thrust::raw_pointer_cast(devCoeffs.data())); PauliStr* devStringsPtr = thrust::raw_pointer_cast(devStrings.data()); // indicates the PauliStrSum is not diagonal (contains X or Y) auto functor = functor_setAmpToPauliStrSumElem( qureg.rank, powerOf2(qureg.numQubits), qureg.logNumAmpsPerNode, - sum.numTerms, toCuQcomps(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); + sum.numTerms, getGpuQcompPtr(qureg.gpuAmps), devCoeffsPtr, devStringsPtr); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; @@ -724,7 +720,7 @@ void thrust_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, template -void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr 
matr, cu_qcomp exponent) { +void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp exponent) { thrust::transform( getStartPtr(qureg), getEndPtr(qureg), @@ -832,13 +828,13 @@ qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector q */ -cu_qcomp thrust_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { +gpu_qcomp thrust_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) { - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); - cu_qcomp prod = thrust::inner_product( + gpu_qcomp prod = thrust::inner_product( getStartPtr(quregA), getEndPtr(quregA), getStartPtr(quregB), - init, thrust::plus(), functor_getAmpConjProd()); + init, thrust::plus(), functor_getAmpConjProd()); return prod; } @@ -857,20 +853,20 @@ qreal thrust_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB) template -cu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { +gpu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) { - // functor accepts an index and produces a cu_qcomp + // functor accepts an index and produces a gpu_qcomp auto functor = functor_getFidelityTerm( rho.rank, rho.numQubits, rho.logNumAmpsPerNode, - psi.numAmps, toCuQcomps(rho.gpuAmps), toCuQcomps(psi.gpuAmps)); + psi.numAmps, getGpuQcompPtr(rho.gpuAmps), getGpuQcompPtr(psi.gpuAmps)); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); qindex numIts = rho.numAmpsPerNode; - cu_qcomp init = getCuQcomp(0, 0); - cu_qcomp fid = thrust::transform_reduce( + gpu_qcomp init = getGpuQcomp(0, 0); + gpu_qcomp fid = thrust::transform_reduce( indIter, indIter + numIts, - functor, init, thrust::plus()); + functor, init, thrust::plus()); return fid; } @@ -897,71 +893,71 @@ qreal thrust_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { } -cu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector targs) { +gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector 
targs) { qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex mask = util_getBitMask(targs); - auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, toCuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrZTerm(dim, ind, mask, getGpuQcompPtr(qureg.gpuAmps)); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); } -cu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector x, vector y, vector z) { qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - auto ampsPtr = toCuQcomps(qureg.gpuAmps); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, ampsPtr); // amps=pairAmps - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } -cu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vector y, vector z) { qindex maskXY = util_getBitMask(util_getConcatenated(x, y)); qindex maskYZ = util_getBitMask(util_getConcatenated(y, z)); - 
auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto buffPtr = toCuQcomps(qureg.gpuCommBuffer); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto buffPtr = getGpuQcompPtr(qureg.gpuCommBuffer); auto functor = functor_getExpecStateVecPauliTerm(maskXY, maskYZ, ampsPtr, buffPtr); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + qureg.numAmpsPerNode; - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } -cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector y, vector z) { +gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vector y, vector z) { qindex mXY = util_getBitMask(util_getConcatenated(x, y)); qindex mYZ = util_getBitMask(util_getConcatenated(y, z)); qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, toCuQcomps(qureg.gpuAmps)); + auto functor = functor_getExpecDensMatrPauliTerm(mXY, mYZ, dim, ind, getGpuQcompPtr(qureg.gpuAmps)); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + gpu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return value * getGpuQcomp(util_getPowerOfI(y.size())); } @@ -972,33 +968,33 @@ cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vecto template -cu_qcomp 
thrust_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp expo) { +gpu_qcomp thrust_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp expo) { - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto functor = functor_multiplyElemPowerWithAmpOrNorm(expo); - cu_qcomp value = thrust::inner_product( + gpu_qcomp value = thrust::inner_product( getStartPtr(qureg), getEndPtr(qureg), getStartPtr(matr), - init, thrust::plus(), functor); + init, thrust::plus(), functor); return value; } template -cu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp expo) { +gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, gpu_qcomp expo) { qindex dim = powerOf2(qureg.numQubits); qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg); - auto ampsPtr = toCuQcomps(qureg.gpuAmps); - auto elemsPtr = toCuQcomps(matr.gpuElems); + auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); + auto elemsPtr = getGpuQcompPtr(matr.gpuElems); auto functor = functor_getExpecDensMatrDiagMatrTerm(dim, ind, ampsPtr, elemsPtr, expo); - cu_qcomp init = getCuQcomp(0, 0); + gpu_qcomp init = getGpuQcomp(0, 0); auto indIter = thrust::make_counting_iterator(QINDEX_ZERO); auto endIter = indIter + powerOf2(qureg.logNumColsPerNode); - return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); + return thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); } @@ -1047,7 +1043,7 @@ void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector qubits, ve */ -void thrust_statevec_initUniformState(Qureg qureg, cu_qcomp amp) { +void thrust_statevec_initUniformState(Qureg qureg, gpu_qcomp amp) { thrust::fill(getStartPtr(qureg), getEndPtr(qureg), amp); } @@ -1057,11 +1053,11 @@ void thrust_statevec_initDebugState_sub(Qureg qureg) { // globally, |n> gains coefficient 2n/10 + i(2n+1)/10, // which is a step-size 
of 2/10 + i(2/10)... - cu_qcomp step = getCuQcomp(2/10., 2/10.); + gpu_qcomp step = getGpuQcomp(2/10., 2/10.); // and each node begins from a unique n (if distributed) qindex n = util_getGlobalIndexOfFirstLocalAmp(qureg); - cu_qcomp init = getCuQcomp(2*n/10., (2*n+1)/10.); + gpu_qcomp init = getGpuQcomp(2*n/10., (2*n+1)/10.); thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step); } diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh deleted file mode 100644 index a934ecef..00000000 --- a/quest/src/gpu/gpu_types.cuh +++ /dev/null @@ -1,274 +0,0 @@ -/** @file - * CUDA and HIP-compatible complex types. This file is only ever included - * when COMPILE_CUDA=1 so it can safely invoke CUDA signatures without guards. - * - * This header is safe to re-include by multiple files because typedef - * redefinition is legal in C++, and all functions herein are inline. - * Furthermore, since it is only ever parsed by nvcc, the __host__ symbols - * are safely processed by other nvcc-only GPU files, like the cuquantum backend. - * - * @author Tyson Jones - * @author Oliver Brown (patched HIP arithmetic overloads) - */ - -#ifndef GPU_TYPES_HPP -#define GPU_TYPES_HPP - -#include "quest/include/config.h" -#include "quest/include/types.h" -#include "quest/include/precision.h" - -#include "quest/src/core/inliner.hpp" - -#if ! COMPILE_CUDA - #error "A file being compiled somehow included gpu_types.hpp despite QuEST not being compiled in GPU-accelerated mode." 
-#endif - -#if defined(__NVCC__) - #include -#elif defined(__HIP__) - #include "quest/src/gpu/cuda_to_hip.hpp" -#endif - -#include -#include - - - -/* - * CUDA-COMPATIBLE QCOMP ALIAS (cu_qcomp) - * - * which we opt to use over a Thrust complex type to gaurantee - * compatibility with cuQuantum, though this irritatingly - * requires explicitly defining operator overloads below - */ - - -#if (FLOAT_PRECISION == 1) - typedef cuFloatComplex cu_qcomp; - -#elif (FLOAT_PRECISION == 2) - typedef cuDoubleComplex cu_qcomp; - -#else - #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU." - -#endif - - - -/* - * TRANSFORMING qcomp AND cu_qcomp - */ - - -INLINE cu_qcomp getCuQcomp(qreal re, qreal im) { - -#if (FLOAT_PRECISION == 1) - return make_cuFloatComplex(re, im); -#else - return make_cuDoubleComplex(re, im); -#endif -} - - -__host__ inline cu_qcomp toCuQcomp(qcomp a) { - return getCuQcomp(std::real(a), std::imag(a)); -} -__host__ inline qcomp toQcomp(cu_qcomp a) { - return getQcomp(a.x, a.y); -} - - -__host__ inline cu_qcomp* toCuQcomps(qcomp* a) { - - // reinterpret a qcomp ptr as a cu_qcomp ptr, - // which is ONLY SAFE when comp and cu_qcomp - // have identical memory layouts. Be very - // careful; HIP stack arrays (e.g. qcomp[]) - // seg-fault when passed here, so this funciton - // should only ever be used on malloc'd data! - // Stack objects should use the below unpacks. 
- - return reinterpret_cast(a); -} - - -__host__ inline std::array unpackMatrixToCuQcomps(DiagMatr1 in) { - - // it's crucial we explicitly copy over the elements, - // rather than just reinterpret the pointer, to avoid - // segmentation faults when memory misaligns (like on HIP) - - return {toCuQcomp(in.elems[0]), toCuQcomp(in.elems[1])}; -} - - -__host__ inline std::array unpackMatrixToCuQcomps(DiagMatr2 in) { - - return { - toCuQcomp(in.elems[0]), toCuQcomp(in.elems[1]), - toCuQcomp(in.elems[2]), toCuQcomp(in.elems[3])}; -} - - -__host__ inline std::array unpackMatrixToCuQcomps(CompMatr1 in) { - - std::array out{}; - for (int i=0; i<4; i++) - out[i] = toCuQcomp(in.elems[i/2][i%2]); - - return out; -} - - -__host__ inline std::array unpackMatrixToCuQcomps(CompMatr2 in) { - - std::array out{}; - for (int i=0; i<16; i++) - out[i] = toCuQcomp(in.elems[i/4][i%4]); - - return out; -} - - - - -/* - * cu_qcomp ARITHMETIC OVERLOADS - * - * which are only needed by NVCC because - * HIP defines them for us. This good deed - * goes punished; a HIP bug disables our - * use of *= and += overloads, so kernels.cuh - * has disgusting (x = x * y) statements. Bah! - */ - - -/// @todo -/// - clean this up (with templates?) 
-/// - use getCuQcomp() rather than struct creation, -/// to make the algebra implementation-agnostic - - -#if defined(__NVCC__) - -INLINE cu_qcomp operator + (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { - .x = a.x + b.x, - .y = a.y + b.y - }; - return out; -} - -INLINE cu_qcomp operator - (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { - .x = a.x - b.x, - .y = a.y - b.y - }; - return out; -} - -INLINE cu_qcomp operator * (const cu_qcomp& a, const cu_qcomp& b) { - cu_qcomp out = { - .x = a.x * b.x - a.y * b.y, - .y = a.x * b.y + a.y * b.x - }; - return out; -} - - -INLINE cu_qcomp operator + (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { - .x = a.x + b, - .y = a.y + b - }; - return out; -} -INLINE cu_qcomp operator + (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { - .x = a.x + b, - .y = a.y + b - }; - return out; -} - -INLINE cu_qcomp operator - (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { - .x = a.x - b, - .y = a.y - b - }; - return out; -} -INLINE cu_qcomp operator - (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { - .x = a.x - b, - .y = a.y - b - }; - return out; -} - -INLINE cu_qcomp operator * (const cu_qcomp& a, const qreal& b) { - cu_qcomp out = { - .x = a.x * b, - .y = a.y * b - }; - return out; -} -INLINE cu_qcomp operator * (const qreal& b, const cu_qcomp& a) { - cu_qcomp out = { - .x = a.x * b, - .y = a.y * b - }; - return out; -} - -#endif - - - -/* - * cu_qcomp UNARY FUNCTIONS - */ - - -INLINE qreal getCompReal(cu_qcomp num) { - return num.x; -} - -INLINE cu_qcomp getCompConj(cu_qcomp num) { - num.y *= -1; - return num; -} - -INLINE qreal getCompNorm(cu_qcomp num) { - return (num.x * num.x) + (num.y * num.y); -} - -INLINE cu_qcomp getCompPower(cu_qcomp base, cu_qcomp exponent) { - - // using https://mathworld.wolfram.com/ComplexExponentiation.html, - // and the principal argument of 'base' - - // base = a + b i, exponent = c + d i - qreal a = base.x; - qreal b = base.y; - qreal c = 
exponent.x; - qreal d = exponent.y; - - // intermediate quantities (uses CUDA atan2,log,pow,exp,cos,sin) - qreal arg = atan2(b, a); - qreal mag = a*a + b*b; - qreal ln = log(mag); - qreal fac = pow(mag, c/2) * exp(-d * arg); - qreal ang = c*arg + d*ln/2; - - // output scalar - qreal re = fac * cos(ang); - qreal im = fac * sin(ang); - return getCuQcomp(re, im); -} - - - -#endif // GPU_TYPES_HPP \ No newline at end of file diff --git a/utils/scripts/compile.sh b/utils/scripts/compile.sh index 8eca0ae2..609f85be 100755 --- a/utils/scripts/compile.sh +++ b/utils/scripts/compile.sh @@ -312,8 +312,6 @@ echo "deployment modes:" ALL_LINK_FLAGS="${USER_LINK_FLAGS}" # choose compiler and flags for CPU/OMP files -CPU_FILES_FLAGS='-Ofast -DCOMPLEX_OVERLOADS_PATCHED=1' - if [ $ENABLE_MULTITHREADING == 1 ] then echo "${INDENT}(multithreading enabled)"