Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/compile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ on:
branches:
- main
- devel
- unifying-base-qcomp
pull_request:
branches:
- main
Expand Down Expand Up @@ -253,6 +254,11 @@ jobs:
-DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }}
-DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
-DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
-DCMAKE_CXX_FLAGS_DEBUG=${{ matrix.compiler == 'cl' && '/MP1' || '' }}
-DCMAKE_CXX_FLAGS_RELEASE=${{ matrix.compiler == 'cl' && '/MP1' || '' }}

### DEBUG:
### above disables parallel compilation with MSVC

# force 'Release' build (needed by MSVC to enable optimisations)
- name: Compile
Expand Down
65 changes: 0 additions & 65 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -469,71 +469,6 @@ set(COMPILE_HIP ${ENABLE_HIP})



# ============================
# Patch CPU performance
# ============================


# Patch performance of CPU std::complex arithmetic operator overloads.
# The cpu_subroutines.cpp file makes extensive use of std::complex operator
# overloads, and alas these are significantly slower than hand-rolled
# arithmetic, due to their NaN and inf checks, and interference with SIMD.
# It is crucial to pass additional optimisation flags to this file to restore
# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory,
# we can achieve this with specific, relatively 'safe' flags such as LLVM's:
# -ffinite-math-only -fno-signed-zeros -ffp-contract=fast
# However, it is a nuisance to find equivalent flags for different compilers
# and monitor their performance vs accuracy trade-offs. So instead, we use the
# much more aggressive and ubiquitous -Ofast flag to guarantee performance.
# This introduces many potentially dangerous optimisations, such as asserting
# associativity of flops, which would break techniques like Kahan summation.
# The cpu_subroutines.cpp must ergo be very conscious of these optimisations.
# We here also explicitly inform the file cpu_subroutines.cpp whether or not
# we are passing the flags, so it can detect/error when flags are forgotten.

# Select the extra optimisation flags (patch_flags) and the accompanying
# COMPLEX_OVERLOADS_PATCHED macro (patch_macro) which are attached below to
# cpu_subroutines.cpp only.
# NOTE(review): CMAKE_BUILD_TYPE is empty under multi-config generators
# (e.g. Visual Studio) unless passed explicitly, in which case the non-release
# branch below is taken - confirm this is intended.

if (CMAKE_BUILD_TYPE STREQUAL "Release")

    # Release build will pass -Ofast when known for the given compiler, and
    # fallback to giving a performance warning and proceeding with compilation.
    # HP is deliberately absent from the generic pattern (it previously shadowed
    # the dedicated HP branch, making "+Ofast" unreachable), and is compared
    # with STREQUAL because an unanchored MATCHES "HP" would also match "NVHPC".

    if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang")
        set(patch_flags "-Ofast")
        set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP")
        set(patch_flags "+Ofast")
        set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
        set(patch_flags "/fp:fast")
        set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
    else()
        message(WARNING
            "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been "
            "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, "
            "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are "
            "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help."
        )
        set(patch_flags "")
        set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0")
    endif()

else()

    # Non-release builds (e.g. Debug) will pass no optimisation flags, and will
    # communicate to cpu_subroutines.cpp that this is intentional via a macro

    set(patch_flags "")
    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0")

endif()

# apply the chosen flags and macro to the single performance-critical file
set_source_files_properties(
    quest/src/cpu/cpu_subroutines.cpp
    PROPERTIES
    COMPILE_FLAGS "${patch_flags} ${patch_macro}"
)



# ============================
# Pass files to library
# ============================
Expand Down
7 changes: 4 additions & 3 deletions quest/include/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,10 @@ static inline qcomp getQcomp(qreal re, qreal im) {
// not the same precision as qcomp, so compilation will fail depending
// on the setting of PRECISION. To avoid this, we'll define overloads
// between all type/precision permutations, always returning qcomp. These
// overloads are also used by the QuEST source code. Via the unholy macros
// below, we create 312 overloads; no doubt this is going to break something
// in the future, for which I am already sorry :'(
// overloads are also used by the QuEST source code (though incidentally,
// not the high performance backend which uses custom complex overloads).
// Via the unholy macros below, we create 312 overloads; no doubt this is
// going to break something in the future, for which I am already sorry :'(

/// @cond EXCLUDE_FROM_DOXYGEN

Expand Down
227 changes: 227 additions & 0 deletions quest/src/core/base_qcomp.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
/** @file
* Definition of base_qcomp, which is extended by the CPU and GPU
* backends (into cpu_qcomp and gpu_qcomp) and used in hot loops
* and kernels.
*
* The user-facing qcomp (which in the QuEST middle-end, resolves to
* std::complex) is not used by the CPU backend, since it creates
* performance pitfalls (e.g. expensive NaN checks within arithmetic
* operators) in some compilers, and is furthermore illegal in the
* GPU backend (i.e. within CUDA kernels). So the backends instead
* use custom complex types with identical memory layouts/alignment
* to qcomp. Those types extend base_qcomp defined in this file,
* since they otherwise share all the same arithmetic boilerplate.
*
* @author Tyson Jones
*/

#ifndef BASE_QCOMP_HPP
#define BASE_QCOMP_HPP

#include "quest/include/types.h"

#include "quest/src/core/inliner.hpp"

#include <cstddef>
#include <type_traits>



/*
* BASE DEFINITION
*
* which must remain POD (a simple {re,im}) and with an identical
* memory layout and alignment to qcomp (i.e. std::complex). Only
* the in-place arithmetic overloads are defined below which are
* reused by the subsequent out-of-place overloads, to avoid
* code duplication.
*/

struct base_qcomp {

    // the real and imaginary components; these must remain the only data
    // members so that the struct keeps a simple {re,im} layout matching qcomp
    qreal re;
    qreal im;


    /*
     * IN-PLACE COMPLEX ARITHMETIC
     */

    // component-wise addition of a onto this number
    INLINE base_qcomp& operator += (const base_qcomp& a) noexcept {
        re += a.re;
        im += a.im;
        return *this;
    }

    // component-wise subtraction of a from this number
    INLINE base_qcomp& operator -= (const base_qcomp& a) noexcept {
        re -= a.re;
        im -= a.im;
        return *this;
    }

    // complex multiplication: (re + i im)(a.re + i a.im)
    INLINE base_qcomp& operator *= (const base_qcomp& a) noexcept {

        // snapshot all four operands before overwriting any member, because
        // 'a' may alias *this (e.g. x *= x); reading a.re after 're' has been
        // assigned would otherwise use the updated value and corrupt 'im'
        const qreal lhsRe = re;
        const qreal lhsIm = im;
        const qreal rhsRe = a.re;
        const qreal rhsIm = a.im;

        re = (lhsRe * rhsRe) - (lhsIm * rhsIm);
        im = (lhsRe * rhsIm) + (lhsIm * rhsRe);
        return *this;
    }


    /*
     * IN-PLACE MIXED-TYPE ARITHMETIC
     *
     * which scale both components by a real factor
     */

    INLINE base_qcomp& operator *= (const int& a) noexcept {
        re *= a;
        im *= a;
        return *this;
    }

    INLINE base_qcomp& operator *= (const qreal& a) noexcept {
        re *= a;
        im *= a;
        return *this;
    }

    INLINE base_qcomp& operator *= (const size_t& a) noexcept {
        re *= a;
        im *= a;
        return *this;
    }

}; // base_qcomp



/*
* OUT-OF-PLACE COMPLEX ARITHMETIC
*
* which avoid code duplication by re-using the
* in-place arithmetic operator overloads above
*/

// a + b, computed by accumulating b onto the by-value copy of a
INLINE base_qcomp operator + (base_qcomp a, const base_qcomp& b) noexcept {
    return a += b;
}

// a - b, computed by subtracting b from the by-value copy of a
INLINE base_qcomp operator - (base_qcomp a, const base_qcomp& b) noexcept {
    return a -= b;
}

// a * b, computed by multiplying the by-value copy of a with b
INLINE base_qcomp operator * (base_qcomp a, const base_qcomp& b) noexcept {
    return a *= b;
}



/*
* OUT-OF-PLACE MIXED-TYPE ARITHMETIC
*
* which avoid code duplication by re-using the
* in-place arithmetic operator overloads above
*/


// base_qcomp * other

// scale the by-value copy of a by an int factor
INLINE base_qcomp operator * (base_qcomp a, const int& b) noexcept {
    return a *= b;
}

// scale the by-value copy of a by a qreal factor
INLINE base_qcomp operator * (base_qcomp a, const qreal& b) noexcept {
    return a *= b;
}

// scale the by-value copy of a by a size_t factor
INLINE base_qcomp operator * (base_qcomp a, const size_t& b) noexcept {
    return a *= b;
}


// other * base_qcomp (via commutation)

// int * base_qcomp, evaluated as a scaling of a copy of the complex operand
INLINE base_qcomp operator * (const int& a, const base_qcomp& b) noexcept {
    base_qcomp scaled = b;
    scaled *= a;
    return scaled;
}

// qreal * base_qcomp, evaluated as a scaling of a copy of the complex operand
INLINE base_qcomp operator * (const qreal& a, const base_qcomp& b) noexcept {
    base_qcomp scaled = b;
    scaled *= a;
    return scaled;
}

// size_t * base_qcomp, evaluated as a scaling of a copy of the complex operand
INLINE base_qcomp operator * (const size_t& a, const base_qcomp& b) noexcept {
    base_qcomp scaled = b;
    scaled *= a;
    return scaled;
}



/*
* BACKEND-AGNOSTIC MATHS
*/

// all of the below only read or negate plain qreal members, so they are
// marked noexcept for consistency with norm() and the arithmetic overloads

// the real component of a
INLINE qreal real(const base_qcomp& a) noexcept {
    return a.re;
}

// the imaginary component of a
INLINE qreal imag(const base_qcomp& a) noexcept {
    return a.im;
}

// the complex conjugate of a, i.e. with a negated imaginary component
INLINE base_qcomp conj(const base_qcomp& a) noexcept {
    return {a.re, - a.im};
}

// the squared magnitude re^2 + im^2 (not the absolute value)
INLINE qreal norm(const base_qcomp& a) noexcept {
    return (a.re * a.re) + (a.im * a.im);
}



/*
* CONVERTERS
*/

// reinterpret a qcomp list as a base_qcomp list without copying any
// elements; the returned pointer addresses the very same memory
INLINE base_qcomp* getBaseQcompPtr(qcomp* list) {
    base_qcomp* recast = reinterpret_cast<base_qcomp*>(list);
    return recast;
}

// build a base_qcomp from explicit real and imaginary components
INLINE base_qcomp getBaseQcomp(qreal re, qreal im) {
    base_qcomp out = { re, im };
    return out;
}

// copy the components of a qcomp into a new base_qcomp
INLINE base_qcomp getBaseQcomp(const qcomp& a) {
    base_qcomp out = { a.real(), a.imag() };
    return out;
}

// copy the components of a base_qcomp into a new qcomp
INLINE qcomp getQcomp(const base_qcomp& a) {
    qcomp out( a.re, a.im );
    return out;
}



/*
* CHECK COMPATIBILITY WITH QCOMP
*/


// check the memory layout of base_qcomp agrees with qcomp, since
// it is not formally gauranteed, unlike _Complex and std::complex
static_assert(sizeof (base_qcomp) == sizeof (qcomp));
static_assert(alignof(base_qcomp) == alignof(qcomp));
static_assert(std::is_standard_layout_v <base_qcomp>);
static_assert(std::is_trivially_copyable_v<base_qcomp>);


// TODO:
// the above checks are potentially inadequate to identify an
// insidious incompatibility between qcomp and base_qcomp - perhaps
// we should perform a compile-time duck-check, casting a small
// array between them and checking no data is corrupted? Perhaps
// a runtime check in initQuESTEnv() is also necessary, checking the
// casting is safe for all circumstances (e.g. heap mem, static lists)



#endif // BASE_QCOMP_HPP
Loading
Loading