QuEST-Kit · TysonRayJones · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
@@ -28,6 +28,7 @@ on:
     branches:
       - main
       - devel
+      - unifying-base-qcomp
   pull_request:
     branches:
       - main
@@ -253,6 +254,11 @@ jobs:
           -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }}
           -DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
+          -DCMAKE_CXX_FLAGS_DEBUG=${{ matrix.compiler == 'cl' && '/MP1' || '' }}
+          -DCMAKE_CXX_FLAGS_RELEASE=${{ matrix.compiler == 'cl' && '/MP1' || '' }}
+
+      ### DEBUG:
+      ### above disables parallel compilation with MSVC
 
       # force 'Release' build (needed by MSVC to enable optimisations)
       - name: Compile

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -469,71 +469,6 @@ set(COMPILE_HIP ${ENABLE_HIP})
 
 
 
-# ============================
-# Patch CPU performance
-# ============================
-
-
-# Patch performance of CPU std::complex arithmetic operator overloads.
-# The cpu_subroutines.cpp file makes extensive use of std::complex operator
-# overloads, and alas these are significantly slower than hand-rolled 
-# arithmetic, due to their NaN and inf checks, and interference with SIMD.
-# It is crucial to pass additional optimisation flags to this file to restore
-# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory,
-# we can achieve this with specific, relatively 'safe' flags such as LLVM's:
-#     -ffinite-math-only -fno-signed-zeros -ffp-contract=fast
-# However, it is a nuisance to find equivalent flags for different compilers
-# and monitor their performance vs accuracy trade-offs. So instead, we use the
-# much more aggressive and ubiquitous -Ofast flag to guarantee performance. 
-# This introduces many potentially dangerous optimisations, such as asserting
-# associativity of flops, which would break techniques like Kahan summation.
-# The cpu_subroutines.cpp must ergo be very conscious of these optimisations.
-# We here also explicitly inform the file cpu_subroutines.cpp whether or not
-# we are passing the flags, so it can detect/error when flags are forgotten.
-
-if (CMAKE_BUILD_TYPE STREQUAL "Release")
-
-  # Release build will pass -Ofast when known for the given compiler, and
-  # fallback to giving a performance warning and proceeding with compilation
-
-  if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|HP|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang")
-    set(patch_flags "-Ofast")
-    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
-  elseif (CMAKE_CXX_COMPILER_ID MATCHES "HP")
-    set(patch_flags "+Ofast")
-    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
-  elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-    set(patch_flags "/fp:fast")
-    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
-  else()
-    message(WARNING 
-      "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been "
-      "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, "
-      "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are "
-      "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help."
-    )
-    set(patch_flags "")
-    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0")
-  endif()
-
-else()
-
-  # Non-release builds (e.g. Debug) will pass no optimisation flags, and will
-  # communicate to cpu_subroutines.cpp that this is intentional via a macro
-
-  set(patch_flags "")
-  set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0")
-
-endif()
-
-set_source_files_properties(
-  quest/src/cpu/cpu_subroutines.cpp
-  PROPERTIES
-  COMPILE_FLAGS "${patch_flags} ${patch_macro}"
-)
-
-
-
 # ============================
 # Pass files to library
 # ============================

diff --git a/quest/include/types.h b/quest/include/types.h
@@ -158,9 +158,10 @@ static inline qcomp getQcomp(qreal re, qreal im) {
     // not the same precision as qcomp, so compilation will fail depending
     // on the setting of PRECISION. To avoid this, we'll define overloads
     // between all type/precision permutations, always returning qcomp. These
-    // overloads are also used by the QuEST source code. Via the unholy macros 
-    // below, we create 312 overloads; no doubt this is going to break something
-    // in the future, for which I am already sorry :'(
+    // overloads are also used by the QuEST source code (though incidentally,
+    // not the high performance backend which uses custom complex overloads).
+    // Via the unholy macros below, we create 312 overloads; no doubt this is 
+    // going to break something in the future, for which I am already sorry :'(
 
     /// @cond EXCLUDE_FROM_DOXYGEN
 

diff --git a/quest/src/core/base_qcomp.hpp b/quest/src/core/base_qcomp.hpp
@@ -0,0 +1,227 @@
+/** @file
+ * Definition of base_qcomp, which is extended by the CPU and GPU
+ * backends (into cpu_qcomp and gpu_qcomp) and used in hot loops
+ * and kernels.
+ * 
+ * The user-facing qcomp (which in the QuEST middle-end, resolves to 
+ * std::complex) is not used by the CPU backend, since it creates
+ * performance pitfalls (e.g. expensive NaN checks within arithmetic
+ * operators) in some compilers, and is furthermore illegal in the
+ * GPU backend (i.e. within CUDA kernels). So the backends instead
+ * use custom complex types with identical memory layouts/alignment
+ * to qcomp. Those types extend base_qcomp defined in this file,
+ * since they otherwise share all the same arithmetic boilerplate.
+ * 
+ * @author Tyson Jones
+ */
+
+#ifndef BASE_QCOMP_HPP
+#define BASE_QCOMP_HPP
+
+#include "quest/include/types.h"
+
+#include "quest/src/core/inliner.hpp"
+
+#include <cstddef>
+#include <type_traits>
+
+
+
+/*
+ * BASE DEFINITION
+ *
+ * which must remain POD (a simple {re,im}) and with an identical
+ * memory layout and alignment to qcomp (i.e. std::complex). Only
+ * the in-place arithmetic overloads are defined below which are
+ * reused by the subsequent out-of-place overloads, to avoid
+ * code duplication.
+ */
+
+struct base_qcomp {
+
+    // real and imaginary components; these are the only data members,
+    // keeping the struct a plain {re,im} pair
+    qreal re;
+    qreal im;
+
+
+    /*
+     * IN-PLACE COMPLEX ARITHMETIC
+     */
+
+    // component-wise addition of a onto this value
+    INLINE base_qcomp& operator += (const base_qcomp& a) noexcept {
+        re += a.re;
+        im += a.im;
+        return *this;
+    }
+
+    // component-wise subtraction of a from this value
+    INLINE base_qcomp& operator -= (const base_qcomp& a) noexcept {
+        re -= a.re;
+        im -= a.im;
+        return *this;
+    }
+
+    // complex product (this * a), stored back into this value
+    INLINE base_qcomp& operator *= (const base_qcomp& a) noexcept {
+
+        // both components of the product are evaluated before either
+        // member is overwritten; otherwise, when 'a' aliases *this (as
+        // in x *= x), the imaginary part would read the updated 're'
+        const qreal prodRe = (re * a.re) - (im * a.im);
+        const qreal prodIm = (re * a.im) + (im * a.re);
+        re = prodRe;
+        im = prodIm;
+        return *this;
+    }
+
+
+    /*
+     * IN-PLACE MIXED-TYPE ARITHMETIC
+     *
+     * which scale both components by a real scalar
+     */
+
+    INLINE base_qcomp& operator *= (const int& a) noexcept {
+        re *= a;
+        im *= a;
+        return *this;
+    }
+
+    INLINE base_qcomp& operator *= (const qreal& a) noexcept {
+        re *= a;
+        im *= a;
+        return *this;
+    }
+
+    INLINE base_qcomp& operator *= (const size_t& a) noexcept {
+        re *= a;
+        im *= a;
+        return *this;
+    }
+
+}; // base_qcomp
+
+
+
+/*
+ * OUT-OF-PLACE COMPLEX ARITHMETIC
+ * 
+ * which avoid code duplication by re-using the
+ * in-place arithmetic operator overloads above
+ */
+
+// complex sum; 'a' arrives by value, so it doubles as the result buffer
+INLINE base_qcomp operator + (base_qcomp a, const base_qcomp& b) noexcept {
+    return a += b;
+}
+
+// complex difference (a - b); 'a' arrives by value, so it doubles as the result buffer
+INLINE base_qcomp operator - (base_qcomp a, const base_qcomp& b) noexcept {
+    return a -= b;
+}
+
+// complex product (a * b), delegating to the in-place complex operator *=
+INLINE base_qcomp operator * (base_qcomp a, const base_qcomp& b) noexcept {
+    return a *= b;
+}
+
+
+
+/*
+ * OUT-OF-PLACE MIXED-TYPE ARITHMETIC
+ * 
+ * which avoid code duplication by re-using the
+ * in-place arithmetic operator overloads above
+ */
+
+
+// base_qcomp * other
+
+// scales both components of 'a' (a by-value copy) by the integer b
+INLINE base_qcomp operator * (base_qcomp a, const int& b) noexcept {
+    return a *= b;
+}
+
+// scales both components of 'a' (a by-value copy) by the real b
+INLINE base_qcomp operator * (base_qcomp a, const qreal& b) noexcept {
+    return a *= b;
+}
+
+// scales both components of 'a' (a by-value copy) by the unsigned integer b
+INLINE base_qcomp operator * (base_qcomp a, const size_t& b) noexcept {
+    return a *= b;
+}
+
+
+// other * base_qcomp (via commutation)
+
+// integer * complex, evaluated as the complex operand scaled in-place by the integer
+INLINE base_qcomp operator * (const int& a, const base_qcomp& b) noexcept {
+    base_qcomp scaled = b;
+    scaled *= a;
+    return scaled;
+}
+
+// real * complex, evaluated as the complex operand scaled in-place by the real
+INLINE base_qcomp operator * (const qreal& a, const base_qcomp& b) noexcept {
+    base_qcomp scaled = b;
+    scaled *= a;
+    return scaled;
+}
+
+// unsigned integer * complex, evaluated as the complex operand scaled in-place
+INLINE base_qcomp operator * (const size_t& a, const base_qcomp& b) noexcept {
+    base_qcomp scaled = b;
+    scaled *= a;
+    return scaled;
+}
+
+
+
+/*
+ * BACKEND-AGNOSTIC MATHS
+ */
+
+// real component of a; noexcept for consistency with norm() and the
+// arithmetic operators in this file, since a plain member read cannot throw
+INLINE qreal real(const base_qcomp& a) noexcept {
+    return a.re;
+}
+
+// imaginary component of a; noexcept for consistency with norm() and the
+// arithmetic operators in this file, since a plain member read cannot throw
+INLINE qreal imag(const base_qcomp& a) noexcept {
+    return a.im;
+}
+
+// complex conjugate of a (same real part, negated imaginary part); noexcept
+// for consistency with norm() and the arithmetic operators in this file
+INLINE base_qcomp conj(const base_qcomp& a) noexcept {
+    return {a.re, - a.im};
+}
+
+// squared magnitude re^2 + im^2 of a (no square root is taken)
+INLINE qreal norm(const base_qcomp& a) noexcept {
+    const qreal magSquared = (a.re * a.re) + (a.im * a.im);
+    return magSquared;
+}
+
+
+
+/*
+ * CONVERTERS
+ */
+
+// views an existing qcomp list as a base_qcomp list without copying any
+// elements; the returned pointer addresses the same memory as 'list'
+INLINE base_qcomp* getBaseQcompPtr(qcomp* list) {
+    base_qcomp* reinterpreted = reinterpret_cast<base_qcomp*>(list);
+    return reinterpreted;
+}
+
+// builds a base_qcomp from separate real and imaginary components
+INLINE base_qcomp getBaseQcomp(qreal re, qreal im) {
+    base_qcomp out = { re, im };
+    return out;
+}
+
+// copies the components of a qcomp into a new base_qcomp
+INLINE base_qcomp getBaseQcomp(const qcomp& a) {
+    base_qcomp out = { a.real(), a.imag() };
+    return out;
+}
+
+// copies the components of a base_qcomp into a new qcomp
+INLINE qcomp getQcomp(const base_qcomp& a) {
+    qcomp out( a.re, a.im );
+    return out;
+}
+
+
+
+/*
+ * CHECK COMPATIBILITY WITH QCOMP
+ */
+
+
+// check the memory layout of base_qcomp agrees with qcomp, since
+// it is not formally guaranteed, unlike _Complex and std::complex
+// each assertion carries a message so that a failing build names
+// the specific layout requirement which base_qcomp has violated
+static_assert(sizeof (base_qcomp) == sizeof (qcomp), "base_qcomp must have the same size as qcomp");
+static_assert(alignof(base_qcomp) == alignof(qcomp), "base_qcomp must have the same alignment as qcomp");
+static_assert(std::is_standard_layout_v   <base_qcomp>, "base_qcomp must be a standard-layout type");
+static_assert(std::is_trivially_copyable_v<base_qcomp>, "base_qcomp must be trivially copyable");
+
+
+// TODO:
+// the above checks are potentially inadequate to identify an
+// insidious incompatibility between qcomp and base_qcomp - perhaps
+// we should perform a compile-time duck-check, casting a small
+// array between them and checking no data is corrupted? Perhaps
+// a runtime check in initQuESTEnv() is also necessary, checking the
+// casting is safe for all circumstances (e.g. heap mem, static lists)
+
+
+
+#endif // BASE_QCOMP_HPP