-
Notifications
You must be signed in to change notification settings - Fork 85
Benchmark: Micro benchmark - Add float datatype support and other refinements to GPU Stream #769
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7f23c75
d63fe8c
242714e
e8d0282
5a18946
fddf56e
3c359a3
e445363
60b130c
f31933f
d8a91ab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -235,15 +235,15 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr | |
| cudaError_t cuda_err = cudaSuccess; | ||
|
|
||
| if (args->check_data) { | ||
| // Generate data to copy | ||
| args->sub.data_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id)); | ||
| // Generate data to copy - use local NUMA node for best CPU access | ||
| args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size)); | ||
|
|
||
| for (int j = 0; j < args->size / sizeof(T); j++) { | ||
| args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod); | ||
| } | ||
|
|
||
| // Allocate check buffer | ||
| args->sub.check_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id)); | ||
| // Allocate check buffer on local NUMA node | ||
| args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size)); | ||
WenqingLan1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // Allocate buffers | ||
|
|
@@ -257,7 +257,7 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr | |
| // Allocate buffers | ||
| for (auto &buf_ptr : args->sub.gpu_buf_ptrs) { | ||
| T *raw_ptr = nullptr; | ||
| cuda_err = GpuMallocDataBuf(&raw_ptr, args->size * sizeof(T)); | ||
| cuda_err = GpuMallocDataBuf(&raw_ptr, args->size); | ||
| if (cuda_err != cudaSuccess) { | ||
| std::cerr << "PrepareBufAndStream::cudaMalloc error: " << cuda_err << std::endl; | ||
| return -1; | ||
|
|
@@ -420,10 +420,12 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne | |
| int size_factor = 2; | ||
|
|
||
| // Validate data size | ||
| uint64_t num_elements_in_thread_block = kNumLoopUnroll * num_threads_per_block; | ||
| uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(T); | ||
| // Each thread processes 128 bits (16 bytes) for optimal memory bandwidth. | ||
| // For double: uses double2 (16 bytes). For float: would use float4 (16 bytes). | ||
| constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access | ||
| uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread; | ||
| if (args->size % num_bytes_in_thread_block) { | ||
| std::cerr << "RunCopy: Data size should be multiple of " << num_bytes_in_thread_block << std::endl; | ||
| std::cerr << "RunStreamKernel: Data size should be multiple of " << num_bytes_in_thread_block << std::endl; | ||
| return -1; | ||
| } | ||
| num_thread_blocks = args->size / num_bytes_in_thread_block; | ||
|
|
@@ -448,30 +450,30 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne | |
|
|
||
| switch (kernel) { | ||
| case Kernel::kCopy: | ||
| CopyKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get())); | ||
| CopyKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get())); | ||
| args->sub.kernel_name = "COPY"; | ||
| break; | ||
| case Kernel::kScale: | ||
| ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar)); | ||
| ScaleKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar)); | ||
| args->sub.kernel_name = "SCALE"; | ||
| break; | ||
| case Kernel::kAdd: | ||
| AddKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get())); | ||
| AddKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), | ||
| reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get())); | ||
| size_factor = 3; | ||
| args->sub.kernel_name = "ADD"; | ||
| break; | ||
| case Kernel::kTriad: | ||
| TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), | ||
| reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar)); | ||
| TriadKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>( | ||
| reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()), | ||
| reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), | ||
| reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar)); | ||
| size_factor = 3; | ||
| args->sub.kernel_name = "TRIAD"; | ||
| break; | ||
|
|
@@ -583,10 +585,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string | |
|
|
||
| // output formatted results to stdout | ||
| // Tags are of format: | ||
| // STREAM_<Kernelname>_datatype_gpu_<gpu_id>_buffer_<buffer_size>_block_<block_size> | ||
| // STREAM_<Kernelname>_datatype_buffer_<buffer_size>_block_<block_size> | ||
| for (int i = 0; i < args->sub.times_in_ms.size(); i++) { | ||
| std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_gpu_" + std::to_string(args->gpu_id) + | ||
| "_buffer_" + std::to_string(args->size); | ||
| std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size); | ||
| for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) { | ||
| // Calculate and display bandwidth | ||
| double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6; | ||
|
|
@@ -608,9 +609,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string | |
| /** | ||
| * @brief Runs the Stream benchmark. | ||
| * | ||
| * @details This function processes the input args, validates and composes the BenchArgs structure for the | ||
| availavble | ||
| * GPUs, and runs the benchmark. | ||
| * @details This function processes the input args, validates and composes the BenchArgs structure for | ||
| * the first visible GPU (CUDA device 0). When running under Superbench's default_local_mode, | ||
| * CUDA_VISIBLE_DEVICES is set per process, so device 0 maps to the assigned physical GPU. | ||
| * | ||
| * @return int The status code indicating success or failure of the benchmark execution. | ||
| * */ | ||
|
|
@@ -631,21 +632,29 @@ int GpuStream::Run() { | |
| return ret; | ||
| } | ||
|
|
||
| // find all GPUs and compose the Benchmarking data structure | ||
| for (int j = 0; j < gpu_count; j++) { | ||
| auto args = std::make_unique<BenchArgs<double>>(); | ||
| args->numa_id = 0; | ||
| args->gpu_id = j; | ||
| cudaGetDeviceProperties(&args->gpu_device_prop, j); | ||
| if (gpu_count < 1) { | ||
| std::cerr << "Run::No GPU available" << std::endl; | ||
| return -1; | ||
| } | ||
|
|
||
| // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES). | ||
| if (opts_.data_type == "float") { | ||
| auto args = std::make_unique<BenchArgs<float>>(); | ||
| args->gpu_id = 0; | ||
| cudaGetDeviceProperties(&args->gpu_device_prop, 0); | ||
| args->num_warm_up = opts_.num_warm_up; | ||
| args->num_loops = opts_.num_loops; | ||
| args->size = opts_.size; | ||
| args->check_data = opts_.check_data; | ||
| bench_args_.emplace_back(std::move(args)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we make it a template function? |
||
| } else { | ||
| auto args = std::make_unique<BenchArgs<double>>(); | ||
| args->gpu_id = 0; | ||
| cudaGetDeviceProperties(&args->gpu_device_prop, 0); | ||
| args->num_warm_up = opts_.num_warm_up; | ||
| args->num_loops = opts_.num_loops; | ||
| args->size = opts_.size; | ||
| args->check_data = opts_.check_data; | ||
| args->numa_id = 0; | ||
| args->gpu_id = j; | ||
|
|
||
| // add data to vector | ||
| bench_args_.emplace_back(std::move(args)); | ||
| } | ||
|
|
||
|
|
@@ -668,14 +677,6 @@ int GpuStream::Run() { | |
| // Print device info with both the memory clock and peak bandwidth | ||
| PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw); | ||
|
|
||
| // Set the NUMA node | ||
| ret = numa_run_on_node(curr_args->numa_id); | ||
| if (ret != 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. numa_alloc_local allocates on the current NUMA node of the calling thread at allocation time, which is correct. But if the OS later migrates the thread to a remote NUMA node, remote memory accesses introduce latency. Should we pin the thread to a fixed NUMA node? |
||
| std::cerr << "Run::numa_run_on_node error: " << errno << std::endl; | ||
| has_error = true; | ||
| return; | ||
| } | ||
|
|
||
| // Run the stream benchmark for the configured data, passing the peak bandwidth | ||
| if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) { | ||
| ret = RunStream<float>(curr_args, "float", peak_bw); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.