Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/compile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ on:
branches:
- main
- devel
- msvc-so-debug
pull_request:
branches:
- main
Expand Down Expand Up @@ -61,7 +62,7 @@ jobs:
# compile QuEST with all combinations of below flags
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
precision: [1, 2, 4]
precision: [4, 2, 1] ################################### DEBUG! long double first for MSVC debug
omp: [ON, OFF]
mpi: [ON, OFF]
cuda: [ON, OFF]
Expand Down
154 changes: 77 additions & 77 deletions quest/src/core/fastmath.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,83 +100,83 @@ INLINE void fast_getSubQuregValues(qindex basisStateIndex, int* numQubitsPerSubQ
*/


// Returns the (row, col) element of the matrix represented by the given PauliStr,
// computed as the product of the corresponding single-qubit Pauli matrix elements
// over all sites (low and high masks).
// T = qcomp, cpu_qcomp, gpu_qcomp
template <typename T>
INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {

// this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
// and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
// Paulis on any of the 64 sites, but the latter's PauliStr is always
// constrained to the lower 32 sites (because a 32-qubit density matrix
// is already too large for the world's computers). As such, the latter
// scenario can be optimised since str.highPaulis == 0, making the second
// loop below redundant. Avoiding this loop can at most halve the runtime,
// though opens the risk that the former caller erroneously has its upper
// Paulis ignored. We forego this optimisation in defensive design, and
// because this function is only invoked during data structure initialisation
// and ergo infrequently.

// regrettably duplicated from paulis.cpp which is inaccessible here;
// each Pauli occupies two adjacent bits of a PAULI_MASK_TYPE mask
constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;

// T-agnostic complex literals
T p0, p1,n1, pI,nI;
p0 = {0, 0}; // 0
p1 = {+1, 0}; // 1
n1 = {-1, 0}; // -1
pI = {0, +1}; // i
nI = {0, -1}; // -i

// 'matrices' below is not declared constexpr or static const, even though
// it is fixed/known at compile-time, because this makes it incompatible
// with CUDA kernels/thrust. It is instead left as runtime initialisation
// but this poses no real slowdown; this function, and its caller, are inlined
// so these 16 amps are re-processed once for each full enumeration of the
// PauliStrSum which is expected to have significantly more terms/coeffs
T matrices[][2][2] = {
{{p1,p0},{p0,p1}}, // I
{{p0,p1},{p1,p0}}, // X
{{p0,nI},{pI,p0}}, // Y
{{p1,p0},{p0,n1}}}; // Z

T elem = p1; // 1

// accumulate the per-site elements of the lower-mask Paulis;
// could be compile-time unrolled into 32 iterations
for (int t=0; t<numPaulisPerMask; t++) {
int p = getTwoAdjacentBits(str.lowPaulis, 2*t); // Pauli code: 0=I, 1=X, 2=Y, 3=Z
int i = getBit(row, t);
int j = getBit(col, t);
elem *= matrices[p][i][j];
}

// accumulate the per-site elements of the upper-mask Paulis;
// could be compile-time unrolled into 32 iterations
for (int t=0; t<numPaulisPerMask; t++) {
int p = getTwoAdjacentBits(str.highPaulis, 2*t);
int i = getBit(row, t + numPaulisPerMask);
int j = getBit(col, t + numPaulisPerMask);
elem *= matrices[p][i][j];
}

return elem;
}


// Returns the (row, col) element of the weighted sum of Pauli-string matrices,
// i.e. sum_n coeffs[n] * elem_n(row, col).
// T = qcomp, cpu_qcomp, gpu_qcomp
template <typename T>
INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {

    // the PauliStrSum arrives as unpacked fields because its 'qcomp' member
    // prevents the struct being consumed directly by CUDA kernels/thrust.
    // every string is assumed to satisfy highPaulis==0, as per the above func.

    T total = {0, 0}; // type-agnostic complex zero

    // numTerms is expected exponentially smaller than the caller's enumeration
    for (qindex t=0; t<numTerms; t++)
        total += coeffs[t] * fast_getPauliStrElem<T>(strings[t], row, col);

    return total;
}
// // T = qcomp, cpu_qcomp, gpu_qcomp
// template <typename T>
// INLINE T fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {

// // this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
// // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
// // Paulis on any of the 64 sites, but the latter's PauliStr is always
// // constrainted to the lower 32 sites (because a 32-qubit density matrix
// // is already too large for the world's computers). As such, the latter
// // scenario can be optimised since str.highPaulis == 0, making the second
// // loop below redundant. Avoiding this loop can at most half the runtime,
// // though opens the risk that the former caller erroneously has its upper
// // Paulis ignore. We forego this optimisation in defensive design, and
// // because this function is only invoked during data structure initilisation
// // and ergo infrequently.

// // regrettably duplicated from paulis.cpp which is inaccessible here
// constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;

// // T-agnostic complex literals
// T p0, p1,n1, pI,nI;
// p0 = {0, 0}; // 0
// p1 = {+1, 0}; // 1
// n1 = {-1, 0}; // -1
// pI = {0, +1}; // i
// nI = {0, -1}; // -i

// // 'matrices' below is not declared constexpr or static const, even though
// // it is fixed/known at compile-time, because this makes it incompatible
// // with CUDA kernels/thrust. It is instead left as runtime innitialisation
// // but this poses no real slowdown; this function, and its caller, are inlined
// // so these 16 amps are re-processed one for each full enumeration of the
// // PauliStrSum which is expected to have significantly more terms/coeffs
// T matrices[][2][2] = {
// {{p1,p0},{p0,p1}}, // I
// {{p0,p1},{p1,p0}}, // X
// {{p0,nI},{pI,p0}}, // Y
// {{p1,p0},{p0,n1}}}; // Z

// T elem = p1; // 1

// // could be compile-time unrolled into 32 iterations
// for (int t=0; t<numPaulisPerMask; t++) {
// int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
// int i = getBit(row, t);
// int j = getBit(col, t);
// elem *= matrices[p][i][j];
// }

// // could be compile-time unrolled into 32 iterations
// for (int t=0; t<numPaulisPerMask; t++) {
// int p = getTwoAdjacentBits(str.highPaulis, 2*t);
// int i = getBit(row, t + numPaulisPerMask);
// int j = getBit(col, t + numPaulisPerMask);
// elem *= matrices[p][i][j];
// }

// return elem;
// }


// // T = qcomp, cpu_qcomp, gpu_qcomp
// template <typename T>
// INLINE T fast_getPauliStrSumElem(T* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {

// // this function accepts unpacked PauliStrSum fields since a PauliStrSum cannot
// // be directly processed in CUDA kernels/thrust due to its 'qcomp' field.
// // it also assumes str.highPaulis==0 for all str in strings, as per above func.

// T elem = {0, 0}; // type-agnostic complex literal

// // this loop is expected exponentially smaller than caller's loop
// for (qindex n=0; n<numTerms; n++)
// elem += coeffs[n] * fast_getPauliStrElem<T>(strings[n], row, col);

// return elem;
// }


#endif // FASTMATH_HPP
50 changes: 42 additions & 8 deletions quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,11 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v

// use cpu_qcomp arithmetic overloads (avoid qcomp's)
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
auto elems = getCpuQcomps<2>(matr.elems); // MSVC requires explicit template param, bah!
// auto elems = getCpuQcompsMatr1(matr.elems);
cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]); // MSVC cannot handle 2D cpu_qcomps
cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]);
cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]);
cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]);

auto sortedQubits = util_getSorted(ctrls, {targ});
auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
Expand All @@ -432,8 +436,10 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
cpu_qcomp amp0 = amps[i0];
cpu_qcomp amp1 = amps[i1];

amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1;
amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1;
// amps[i0] = elems[0][0]*amp0 + elems[0][1]*amp1;
// amps[i1] = elems[1][0]*amp0 + elems[1][1]*amp1;
amps[i0] = m00*amp0 + m01*amp1;
amps[i1] = m10*amp0 + m11*amp1;
}
}

Expand Down Expand Up @@ -495,7 +501,30 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve

// use cpu_qcomp arithmetic overloads (avoid qcomp's)
cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
auto elems = getCpuQcomps<4>(matr.elems); // MSVC requires explicit template param, bah!

// auto elems = getCpuQcompsMatr2(matr.elems); // MSVC requires explicit template param, bah!

cpu_qcomp m00 = getCpuQcomp(matr.elems[0][0]);
cpu_qcomp m01 = getCpuQcomp(matr.elems[0][1]);
cpu_qcomp m02 = getCpuQcomp(matr.elems[0][2]);
cpu_qcomp m03 = getCpuQcomp(matr.elems[0][3]);

cpu_qcomp m10 = getCpuQcomp(matr.elems[1][0]);
cpu_qcomp m11 = getCpuQcomp(matr.elems[1][1]);
cpu_qcomp m12 = getCpuQcomp(matr.elems[1][2]);
cpu_qcomp m13 = getCpuQcomp(matr.elems[1][3]);

cpu_qcomp m20 = getCpuQcomp(matr.elems[2][0]);
cpu_qcomp m21 = getCpuQcomp(matr.elems[2][1]);
cpu_qcomp m22 = getCpuQcomp(matr.elems[2][2]);
cpu_qcomp m23 = getCpuQcomp(matr.elems[2][3]);

cpu_qcomp m30 = getCpuQcomp(matr.elems[3][0]);
cpu_qcomp m31 = getCpuQcomp(matr.elems[3][1]);
cpu_qcomp m32 = getCpuQcomp(matr.elems[3][2]);
cpu_qcomp m33 = getCpuQcomp(matr.elems[3][3]);



auto sortedQubits = util_getSorted(ctrls, {targ1, targ2});
auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1, targ2}, {0, 0});
Expand All @@ -520,10 +549,15 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
cpu_qcomp amp11 = amps[i11];

// amps[i_n] = sum_j matr.elems[n][j] amp[i_n]
amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11;
amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11;
amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11;
amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11;
// amps[i00] = elems[0][0]*amp00 + elems[0][1]*amp01 + elems[0][2]*amp10 + elems[0][3]*amp11;
// amps[i01] = elems[1][0]*amp00 + elems[1][1]*amp01 + elems[1][2]*amp10 + elems[1][3]*amp11;
// amps[i10] = elems[2][0]*amp00 + elems[2][1]*amp01 + elems[2][2]*amp10 + elems[2][3]*amp11;
// amps[i11] = elems[3][0]*amp00 + elems[3][1]*amp01 + elems[3][2]*amp10 + elems[3][3]*amp11;

amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11;
amps[i01] = m10*amp00 + m11*amp01 + m12*amp10 + m13*amp11;
amps[i10] = m20*amp00 + m21*amp01 + m22*amp10 + m23*amp11;
amps[i11] = m30*amp00 + m31*amp01 + m32*amp10 + m33*amp11;
}
}

Expand Down
106 changes: 99 additions & 7 deletions quest/src/cpu/cpu_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,16 +127,27 @@ INLINE qcomp getQcomp(const cpu_qcomp& a) {


// creator for fixed-size 2x2 dense matrices (CompMatr1 elements).
// marked 'inline' because this non-template function is defined in a header:
// without it, every translation unit including this header emits its own
// external definition, violating the one-definition rule at link time.
// deliberately non-templated because MSVC + OpenMP breaks when templating
// this - not worth fixing here because we are considering a refactor which
// merges cpu_types.hpp with gpu_types.cuh anyway
inline std::array<std::array<cpu_qcomp,2>,2> getCpuQcompsMatr1(qcomp matr[2][2]) {

    std::array<std::array<cpu_qcomp,2>,2> out;

    // convert each qcomp element to the CPU-backend complex type
    for (int i=0; i<2; i++)
        for (int j=0; j<2; j++)
            out[i][j] = getCpuQcomp(matr[i][j]);

    return out;
}
std::array<std::array<cpu_qcomp,4>,4> getCpuQcompsMatr2(qcomp matr[4][4]) {

std::array<std::array<cpu_qcomp,4>,4> out;

for (int i=0; i<4; i++)
for (int j=0; j<4; j++)
out[i][j] = getCpuQcomp(matr[i][j]);

return out;
Expand Down Expand Up @@ -187,4 +198,85 @@ static_assert(std::is_trivially_copyable_v<cpu_qcomp>);
// casting is safe for all circumstances (e.g. heap mem, static lists)





// Returns the (row, col) element of the matrix represented by the given PauliStr,
// computed as the product of the corresponding single-qubit Pauli matrix elements
// over all sites.
INLINE cpu_qcomp fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {

    // this function is called by both fullstatediagmatr_setElemsToPauliStrSum()
    // and densmatr_setAmpsToPauliStrSum_sub(). The former's PauliStr can have
    // Paulis on any of the 64 sites, but the latter's is always constrained to
    // the lower 32 sites (a 32-qubit density matrix is already too large for
    // the world's computers), whereby str.highPaulis == 0 and the second loop
    // below becomes redundant. Skipping it could at most halve the runtime, but
    // would silently ignore upper Paulis for the former caller; we forego the
    // optimisation in defensive design, and because this function is invoked
    // only during infrequent data structure initialisation.

    // regrettably duplicated from paulis.cpp which is inaccessible here;
    // each Pauli occupies two adjacent bits of a PAULI_MASK_TYPE mask
    constexpr int numPaulisPerMask = sizeof(PAULI_MASK_TYPE) * 8 / 2;

    // complex literals spelled as aggregates
    const cpu_qcomp zero = {0, 0};
    const cpu_qcomp one  = {+1, 0};
    const cpu_qcomp mOne = {-1, 0};
    const cpu_qcomp posI = {0, +1};
    const cpu_qcomp negI = {0, -1};

    // 'paulis' is intentionally not constexpr nor static const (despite being
    // known at compile-time) since that is incompatible with CUDA kernels and
    // thrust. Runtime initialisation poses no real slowdown; this function and
    // its caller are inlined, so these 16 amps are rebuilt once per full
    // enumeration of a PauliStrSum, which has many more terms/coeffs
    cpu_qcomp paulis[][2][2] = {
        {{one, zero}, {zero, one }},   // I
        {{zero, one}, {one,  zero}},   // X
        {{zero, negI},{posI, zero}},   // Y
        {{one, zero}, {zero, mOne}}};  // Z

    cpu_qcomp prod = one;

    // lower-mask sites; could be compile-time unrolled into 32 iterations
    for (int q=0; q<numPaulisPerMask; q++) {
        int code = getTwoAdjacentBits(str.lowPaulis, 2*q); // 0=I, 1=X, 2=Y, 3=Z
        prod *= paulis[code][getBit(row, q)][getBit(col, q)];
    }

    // upper-mask sites; could be compile-time unrolled into 32 iterations
    for (int q=0; q<numPaulisPerMask; q++) {
        int code = getTwoAdjacentBits(str.highPaulis, 2*q);
        prod *= paulis[code][getBit(row, q + numPaulisPerMask)][getBit(col, q + numPaulisPerMask)];
    }

    return prod;
}



// Returns the (row, col) element of the weighted sum of Pauli-string matrices,
// i.e. sum_n coeffs[n] * elem_n(row, col).
INLINE cpu_qcomp fast_getPauliStrSumElem(cpu_qcomp* coeffs, PauliStr* strings, qindex numTerms, qindex row, qindex col) {

    // the PauliStrSum arrives as unpacked fields because its 'qcomp' member
    // prevents the struct being consumed directly by CUDA kernels/thrust.
    // every string is assumed to satisfy highPaulis==0, as per the above func.

    cpu_qcomp total = {0, 0};

    // numTerms is expected exponentially smaller than the caller's enumeration
    for (qindex t=0; t<numTerms; t++)
        total += coeffs[t] * fast_getPauliStrElem(strings[t], row, col);

    return total;
}




#endif // CPU_TYPES_HPP
Loading
Loading