public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed
From: Tom Rix <Tom.Rix@amd.com>
To: git-commits@fedoraproject.org
Subject: [rpms/rocwmma] epel10: A smoke test from rocm-examples
Date: Sun, 14 Jun 2026 12:46:00 GMT [thread overview]
Message-ID: <178144116073.1.332152274305352524.rpms-rocwmma-3545b47a0713@fedoraproject.org> (raw)
A new commit has been pushed.
Repo : rpms/rocwmma
Branch : epel10
Commit : 3545b47a07132ae1cdb4e5a83c163ab0ea6ad572
Author : Tom Rix <Tom.Rix@amd.com>
Date : 2026-03-04T07:07:46-08:00
Stats : +630/-0 in 2 file(s)
URL : https://src.fedoraproject.org/rpms/rocwmma/c/3545b47a07132ae1cdb4e5a83c163ab0ea6ad572?branch=epel10
Log:
A smoke test from rocm-examples
Signed-off-by: Tom Rix <Tom.Rix@amd.com>
---
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..28c6c9e
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,621 @@
+// MIT License
+//
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// This file is a modified version of the original rocWMMA simple_sgemm example
+// from the rocm-examples project: https://github.com/ROCm/rocm-examples
+// The original file was main.hip and has been converted to C++ for compatibility
+//
+// rocWMMA Simple Single-Precision General Matrix Multiplication (SGEMM)
+//
+// This example demonstrates a basic single-precision General Matrix Multiplication (GEMM) using rocWMMA,
+// showcasing the library's support for FP32 computations on AMD GPUs.
+//
+// The rocWMMA utilities (rocwmma_utils.hpp) have been inlined from the rocm-examples Common/ directory:
+// https://github.com/ROCm/rocm-examples/tree/main/Common
+
+// MIT License
+//
+// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <hip/hip_runtime.h>
+#include <rocwmma/rocwmma.hpp>
+
+#include <iostream>
+#include <limits>
+#include <random>
+#include <vector>
+#include <string>
+#include <ctime>
+#include <iomanip>
+#include <sstream>
+#include <algorithm>
+#include <type_traits>
+#include <chrono>
+#include <iomanip>
+#include <iterator>
+#include <fstream>
+
+constexpr int error_exit_code = -1;
+
+/// \brief Checks if the provided error code is \p hipSuccess and if not,
+/// prints an error message to the standard error output and terminates the program
+/// with an error code.
+#define HIP_CHECK(condition) \
+ { \
+ const hipError_t error = condition; \
+ if(error != hipSuccess) \
+ { \
+ std::cerr << "An error encountered: \"" << hipGetErrorString(error) << "\" at " \
+ << __FILE__ << ':' << __LINE__ << std::endl; \
+ std::exit(error_exit_code); \
+ } \
+ }
+
+/// \brief Get current device warp size
+inline uint32_t get_warp_size()
+{
+ hipDeviceProp_t device_prop;
+ int device_id;
+ HIP_CHECK(hipGetDevice(&device_id));
+ HIP_CHECK(hipGetDeviceProperties(&device_prop, device_id));
+ return device_prop.warpSize;
+}
+
+/// \brief Check if current device supports F64 operations
+inline bool is_f64_supported()
+{
+ hipDevice_t handle;
+ hipDeviceProp_t props;
+
+ HIP_CHECK(hipGetDevice(&handle));
+ HIP_CHECK(hipGetDeviceProperties(&props, handle));
+
+ std::string device_name(props.gcnArchName);
+
+ return ((device_name.find("gfx90a") != std::string::npos)
+ || (device_name.find("gfx942") != std::string::npos)
+ || (device_name.find("gfx950") != std::string::npos));
+}
+
+/// \brief Check if current device supports F32 operations
+inline bool is_f32_supported()
+{
+ hipDevice_t handle;
+ hipDeviceProp_t props;
+
+ HIP_CHECK(hipGetDevice(&handle));
+ HIP_CHECK(hipGetDeviceProperties(&props, handle));
+
+ std::string device_name(props.gcnArchName);
+
+ return ((device_name.find("gfx908") != std::string::npos)
+ || (device_name.find("gfx90a") != std::string::npos)
+ || (device_name.find("gfx942") != std::string::npos)
+ || (device_name.find("gfx950") != std::string::npos));
+}
+
+/// \brief Calculate GFlops for GEMM operation
+inline double calculate_gflops(uint32_t m, uint32_t n, uint32_t k)
+{
+ return 2.0 * static_cast<double>(m) * static_cast<double>(n) * static_cast<double>(k) * 1.0e-9;
+}
+
+/// \brief Calculate TFlops per second
+inline double calculate_tflops_per_sec(
+ uint32_t m, uint32_t n, uint32_t k, double elapsed_time_ms, uint32_t repeats = 1u)
+{
+ return calculate_gflops(m, n, k) / elapsed_time_ms * static_cast<double>(repeats);
+}
+
+/// \brief Matrix initialization with random values
+template<typename data_t>
+__host__ static inline void fill_rand(data_t* mat, uint32_t m, uint32_t n)
+{
+ auto rand_init = []()
+ {
+ srand(time(0));
+ return 0u;
+ };
+ static auto init = rand_init();
+ (void) init;
+
+ for(uint32_t i = 0; i < m; ++i)
+ {
+ auto rando = rand() % 5u;
+ for(uint32_t j = 0; j < n; j++)
+ {
+ auto value = (rando + j) % 5u;
+ mat[i * n + j] = ((value % 3u == 0u) && std::is_signed<data_t>::value)
+ ? -static_cast<data_t>(value)
+ : static_cast<data_t>(value);
+ }
+ }
+}
+
+/// \brief CPU GEMM reference implementation
+template<typename input_t,
+ typename output_t,
+ typename compute_t,
+ typename layout_a,
+ typename layout_b,
+ typename layout_c,
+ typename layout_d = layout_c>
+__host__ void gemm_cpu_h(uint32_t m,
+ uint32_t n,
+ uint32_t k,
+ input_t const* a,
+ input_t const* b,
+ output_t const* c,
+ output_t* d,
+ uint32_t lda,
+ uint32_t ldb,
+ uint32_t ldc,
+ uint32_t ldd,
+ compute_t alpha,
+ compute_t beta)
+{
+ auto row_mjr = [](uint32_t row, uint32_t col, uint32_t ld) { return row * ld + col; };
+ auto col_mjr = [](uint32_t row, uint32_t col, uint32_t ld) { return col * ld + row; };
+
+ auto a_index = std::is_same<layout_a, rocwmma::row_major>::value ? row_mjr : col_mjr;
+ auto b_index = std::is_same<layout_b, rocwmma::row_major>::value ? row_mjr : col_mjr;
+ auto c_index = std::is_same<layout_c, rocwmma::row_major>::value ? row_mjr : col_mjr;
+ auto d_index = std::is_same<layout_d, rocwmma::row_major>::value ? row_mjr : col_mjr;
+
+ for(uint32_t i = 0; i < m; ++i)
+ {
+ for(uint32_t j = 0; j < n; ++j)
+ {
+ compute_t accum = static_cast<compute_t>(0);
+ for(uint32_t h = 0; h < k; ++h)
+ {
+ accum += static_cast<compute_t>(a[a_index(i, h, lda)])
+ * static_cast<compute_t>(b[b_index(h, j, ldb)]);
+ }
+ d[d_index(i, j, ldd)] = static_cast<output_t>(
+ alpha * accum + beta * static_cast<compute_t>(c[c_index(i, j, ldc)]));
+ }
+ }
+}
+
+/// \brief Element-wise comparison
+template<typename data_t>
+__host__ std::pair<bool, double>
+ compare_equal(data_t const* a, data_t const* b, uint32_t size, double tolerance = 10.0)
+{
+ bool retval = true;
+ double max_relative_error = 0.0;
+
+ auto to_double = [](data_t const& val) { return static_cast<double>(static_cast<float>(val)); };
+
+ for(uint32_t i = 0; i < size; ++i)
+ {
+ auto val_a = a[i];
+ auto val_b = b[i];
+
+ auto numerator = fabs(to_double(val_a) - to_double(val_b));
+ auto divisor = fabs(to_double(val_a)) + fabs(to_double(val_b)) + 1.0;
+
+ if(std::isinf(numerator) || std::isinf(divisor))
+ {
+ retval = false;
+ max_relative_error = std::numeric_limits<data_t>::infinity();
+ break;
+ }
+ else
+ {
+ auto relative_error = numerator / divisor;
+ if(std::isnan(relative_error))
+ {
+ retval = false;
+ max_relative_error = std::numeric_limits<data_t>::signaling_NaN();
+ break;
+ }
+ else if(relative_error > max_relative_error)
+ {
+ max_relative_error = relative_error;
+ }
+ }
+ }
+
+ auto eps = to_double(std::numeric_limits<data_t>::epsilon());
+ if(max_relative_error > (eps * tolerance))
+ {
+ retval = false;
+ }
+
+ return std::make_pair(retval, max_relative_error);
+}
+
+/// \brief Matrix initialization with batch support
+template<typename data_t>
+__host__ static inline void
+ fill(data_t* mat, uint32_t m, uint32_t k, uint32_t b, uint32_t normalization = 1)
+{
+ auto batch_offset = m * k;
+ for(uint32_t t = 0; t < b; ++t)
+ {
+ for(uint32_t i = 0; i < m; ++i)
+ {
+ for(uint32_t j = 0; j < k; ++j)
+ {
+ auto value
+ = static_cast<float>(rand() / normalization) / static_cast<float>(RAND_MAX);
+ mat[t * batch_offset + i * k + j] = static_cast<data_t>(value);
+ }
+ }
+ }
+}
+
+/// \brief Check if current device is GFX9 architecture
+inline bool is_gfx9()
+{
+ hipDevice_t handle;
+ hipDeviceProp_t props;
+
+ HIP_CHECK(hipGetDevice(&handle));
+ HIP_CHECK(hipGetDeviceProperties(&props, handle));
+
+ std::string device_name(props.gcnArchName);
+
+ return ((device_name.find("gfx908") != std::string::npos)
+ || (device_name.find("gfx90a") != std::string::npos)
+ || (device_name.find("gfx942") != std::string::npos)
+ || (device_name.find("gfx950") != std::string::npos));
+}
+
+/// \brief Check if current device is GFX11 architecture
+inline bool is_gfx11()
+{
+ hipDevice_t handle;
+ hipDeviceProp_t props;
+
+ HIP_CHECK(hipGetDevice(&handle));
+ HIP_CHECK(hipGetDeviceProperties(&props, handle));
+
+ std::string device_name(props.gcnArchName);
+
+ return ((device_name.find("gfx1100") != std::string::npos)
+ || (device_name.find("gfx1101") != std::string::npos)
+ || (device_name.find("gfx1102") != std::string::npos)
+ || (device_name.find("gfx1151") != std::string::npos));
+}
+
+/// \brief Check if current device is GFX12 architecture
+inline bool is_gfx12()
+{
+ hipDevice_t handle;
+ hipDeviceProp_t props;
+
+ HIP_CHECK(hipGetDevice(&handle));
+ HIP_CHECK(hipGetDeviceProperties(&props, handle));
+
+ std::string device_name(props.gcnArchName);
+
+ return ((device_name.find("gfx1200") != std::string::npos)
+ || (device_name.find("gfx1201") != std::string::npos));
+}
+
+/// \brief Check HIPRTC error and exit on failure
+#ifndef HIPRTC_CHECK
+ #define HIPRTC_CHECK(expression) \
+ if(auto status = (expression); status != HIPRTC_SUCCESS) \
+ { \
+ fprintf(stderr, \
+ "hipRTC error: '%s'(%d) at %s:%d\n", \
+ hiprtcGetErrorString(status), \
+ status, \
+ __FILE__, \
+ __LINE__); \
+ exit(error_exit_code); \
+ }
+#endif
+
+// End of inlined rocwmma_utils.hpp content
+
+#include <array>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+using rocwmma::accumulator;
+using rocwmma::col_major;
+using rocwmma::float32_t;
+using rocwmma::matrix_a;
+using rocwmma::matrix_b;
+using rocwmma::row_major;
+
+// WMMA block dimensions
+constexpr int rocwmma_m = 16;
+constexpr int rocwmma_n = 16;
+constexpr int rocwmma_k = 16;
+
+// Device kernel implementation
+__global__ void sgemm_rocwmma_d(uint32_t m,
+ uint32_t n,
+ uint32_t k,
+ float32_t const* a,
+ float32_t const* b,
+ float32_t const* c,
+ float32_t* d,
+ uint32_t lda,
+ uint32_t ldb,
+ uint32_t ldc,
+ uint32_t ldd,
+ float32_t alpha,
+ float32_t beta)
+{
+ // Create fragments
+ auto frag_a
+ = rocwmma::fragment<matrix_a, rocwmma_m, rocwmma_n, rocwmma_k, float32_t, row_major>();
+ auto frag_b
+ = rocwmma::fragment<matrix_b, rocwmma_m, rocwmma_n, rocwmma_k, float32_t, col_major>();
+ auto frag_c = rocwmma::fragment<accumulator, rocwmma_m, rocwmma_n, rocwmma_k, float32_t>();
+ auto frag_acc = rocwmma::fragment<accumulator, rocwmma_m, rocwmma_n, rocwmma_k, float32_t>();
+
+ rocwmma::fill_fragment(frag_acc, 0.0f);
+
+ // Determine output block coordinates
+ auto major_warp
+ = (blockIdx.x * blockDim.x + threadIdx.x) / rocwmma::Constants::AMDGCN_WAVE_SIZE;
+ auto minor_warp = (blockIdx.y * blockDim.y + threadIdx.y);
+
+ auto c_row = major_warp * rocwmma_m;
+ auto c_col = minor_warp * rocwmma_n;
+
+ // Bounds check
+ if(c_row < m && c_col < n)
+ {
+ // Accumulate A x B over K dimension
+ for(uint32_t i = 0; i < k; i += rocwmma_k)
+ {
+ rocwmma::load_matrix_sync(frag_a, a + (c_row * lda + i), lda);
+ rocwmma::load_matrix_sync(frag_b, b + (i + c_col * ldb), ldb);
+ rocwmma::mma_sync(frag_acc, frag_a, frag_b, frag_acc);
+ }
+
+ // Load C matrix
+ rocwmma::load_matrix_sync(frag_c, c + (c_row * ldc + c_col), ldc, rocwmma::mem_row_major);
+
+ // D = alpha * A x B + beta * C
+ for(uint32_t i = 0; i < frag_c.num_elements; ++i)
+ {
+ frag_c.x[i] = alpha * frag_acc.x[i] + beta * frag_c.x[i];
+ }
+
+ // Store result
+ rocwmma::store_matrix_sync(d + (c_row * ldd + c_col), frag_c, ldd, rocwmma::mem_row_major);
+ }
+}
+
+int main()
+{
+ // 1. Set up input data
+ constexpr uint32_t m = 256;
+ constexpr uint32_t n = 256;
+ constexpr uint32_t k = 256;
+
+ constexpr float32_t alpha = 2.1f;
+ constexpr float32_t beta = 2.1f;
+#if 0
+ // Check device support for F32 operations
+ if(!is_f32_supported())
+ {
+ std::cout << "F32 SGEMM not supported on this device" << std::endl;
+ return 0;
+ }
+#endif
+ // Thread block configuration
+ const uint32_t wave_size = get_warp_size();
+ const int t_block_x = 4 * wave_size;
+ const int t_block_y = 4;
+
+ // Matrix layouts and leading dimensions
+ // A: row-major (M x K), B: col-major (K x N), C,D: row-major (M x N)
+ int lda = k; // row-major A
+ int ldb = k; // col-major B
+ int ldc = n; // row-major C
+ int ldd = ldc;
+
+ // Validate matrix dimensions
+ if((m < (rocwmma_m * t_block_x / wave_size) || n < (rocwmma_n * t_block_y) || k < rocwmma_k)
+ || (m % rocwmma_m | n % rocwmma_n | k % rocwmma_k))
+ {
+ std::cerr << "Error: Matrix dimensions not supported" << std::endl;
+ return 0;
+ }
+
+ std::cout << "rocWMMA Simple SGEMM Example" << std::endl;
+ std::cout << "Matrix dimensions: A(" << m << "x" << k << ") * B(" << k << "x" << n << ") + C("
+ << m << "x" << n << ") = D(" << m << "x" << n << ")" << std::endl;
+
+ // 2. Initialize host matrices
+ std::vector<float32_t> matrix_a(m * k);
+ std::vector<float32_t> matrix_b(k * n);
+ std::vector<float32_t> matrix_c(m * n);
+ std::vector<float32_t> matrix_d(m * n, std::numeric_limits<float32_t>::signaling_NaN());
+
+ fill_rand(matrix_a.data(), m, k);
+ fill_rand(matrix_b.data(), k, n);
+ fill_rand(matrix_c.data(), m, n);
+
+ // 3. Allocate device memory and copy input data
+ float32_t* d_a;
+ float32_t* d_b;
+ float32_t* d_c;
+ float32_t* d_d;
+
+ const size_t bytes_a = matrix_a.size() * sizeof(float32_t);
+ const size_t bytes_b = matrix_b.size() * sizeof(float32_t);
+ const size_t bytes_c = matrix_c.size() * sizeof(float32_t);
+ const size_t bytes_d = matrix_d.size() * sizeof(float32_t);
+
+ HIP_CHECK(hipMalloc(&d_a, bytes_a));
+ HIP_CHECK(hipMalloc(&d_b, bytes_b));
+ HIP_CHECK(hipMalloc(&d_c, bytes_c));
+ HIP_CHECK(hipMalloc(&d_d, bytes_d));
+
+ HIP_CHECK(hipMemcpy(d_a, matrix_a.data(), bytes_a, hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d_b, matrix_b.data(), bytes_b, hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d_c, matrix_c.data(), bytes_c, hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d_d, matrix_d.data(), bytes_d, hipMemcpyHostToDevice));
+
+ // 4. Configure kernel launch parameters
+ auto block_dim = dim3(t_block_x, t_block_y);
+ auto grid_dim = dim3(rocwmma::ceil_div(m, rocwmma_m * t_block_x / wave_size),
+ rocwmma::ceil_div(n, rocwmma_n * t_block_y));
+
+ std::cout << "Launching kernel with grid(" << grid_dim.x << "," << grid_dim.y << ") block("
+ << block_dim.x << "," << block_dim.y << ")" << std::endl;
+
+ // 5. Launch rocWMMA SGEMM kernel
+ // Warm-up
+ for(uint32_t i = 0; i < 5; ++i)
+ {
+ sgemm_rocwmma_d<<<grid_dim, block_dim>>>(m,
+ n,
+ k,
+ d_a,
+ d_b,
+ d_c,
+ d_d,
+ lda,
+ ldb,
+ ldc,
+ ldd,
+ alpha,
+ beta);
+ }
+
+ constexpr uint32_t record_runs = 100u;
+
+ // Actual recorded runs
+ hipEvent_t start_event, stop_event;
+ HIP_CHECK(hipEventCreate(&start_event));
+ HIP_CHECK(hipEventCreate(&stop_event));
+
+ HIP_CHECK(hipEventRecord(start_event));
+ for(uint32_t i = 0; i < record_runs; ++i)
+ {
+ sgemm_rocwmma_d<<<grid_dim, block_dim>>>(m,
+ n,
+ k,
+ d_a,
+ d_b,
+ d_c,
+ d_d,
+ lda,
+ ldb,
+ ldc,
+ ldd,
+ alpha,
+ beta);
+ }
+ HIP_CHECK(hipEventRecord(stop_event));
+ HIP_CHECK(hipEventSynchronize(stop_event));
+
+ auto elapsed_time_ms = 0.0f;
+ HIP_CHECK(hipEventElapsedTime(&elapsed_time_ms, start_event, stop_event));
+
+ auto gflops = calculate_gflops(m, n, k);
+ auto tflops_per_sec
+ = calculate_tflops_per_sec(m, n, k, static_cast<double>(elapsed_time_ms), record_runs);
+
+ HIP_CHECK(hipEventDestroy(start_event));
+ HIP_CHECK(hipEventDestroy(stop_event));
+
+ // Echo performance
+ std::cout << std::left << std::setw(8) << "TBlockX" << std::setw(8) << "TBlockY" << std::setw(8)
+ << "BlocksM" << std::setw(8) << "BlocksN" << std::setw(6) << "BlkM" << std::setw(6)
+ << "BlkN" << std::setw(6) << "BlkK" << std::setw(8) << "MatM" << std::setw(8)
+ << "MatN" << std::setw(8) << "MatK" << std::setw(8) << "alpha" << std::setw(8)
+ << "lda" << std::setw(8) << "ldb" << std::setw(8) << "beta" << std::setw(8) << "ldc"
+ << std::setw(8) << "ldd" << std::setw(13) << "elapsedMs" << std::setw(23)
+ << "Problem Size(GFlops)" << std::setw(10) << "TFlops/s" << std::endl;
+
+ std::cout << std::left << std::setw(8) << t_block_x << std::setw(8) << t_block_y << std::setw(8)
+ << "N/A" << std::setw(8) << "N/A" << std::setw(6) << rocwmma_m << std::setw(6)
+ << rocwmma_n << std::setw(6) << rocwmma_k << std::setw(8) << m << std::setw(8) << n
+ << std::setw(8) << k << std::setw(8) << alpha << std::setw(8) << lda << std::setw(8)
+ << ldb << std::setw(8) << beta << std::setw(8) << ldc << std::setw(8) << ldd
+ << std::setw(13) << elapsed_time_ms << std::setw(23) << gflops << std::setw(10)
+ << tflops_per_sec << std::endl;
+
+ // 6. Copy result back to host
+ HIP_CHECK(hipMemcpy(matrix_d.data(), d_d, bytes_d, hipMemcpyDeviceToHost));
+
+ // 7. Validate result (CPU reference calculation)
+ std::vector<float32_t> matrix_d_ref(m * n, std::numeric_limits<float32_t>::signaling_NaN());
+ gemm_cpu_h<float32_t, float32_t, float32_t, row_major, col_major, row_major>(
+ m,
+ n,
+ k,
+ matrix_a.data(),
+ matrix_b.data(),
+ matrix_c.data(),
+ matrix_d_ref.data(),
+ lda,
+ ldb,
+ ldc,
+ ldd,
+ alpha,
+ beta);
+
+ auto res = compare_equal<float32_t>(matrix_d.data(), matrix_d_ref.data(), m * n);
+
+ if(std::get<0>(res) == false)
+ {
+ std::cout << "FAILED!" << std::endl;
+ std::cout << "Max relative error: " << std::get<1>(res) << std::endl;
+ std::exit(EXIT_FAILURE);
+ }
+ else
+ {
+ std::cout << "PASSED!" << std::endl;
+ }
+
+ // 8. Clean up device memory
+ HIP_CHECK(hipFree(d_a));
+ HIP_CHECK(hipFree(d_b));
+ HIP_CHECK(hipFree(d_c));
+ HIP_CHECK(hipFree(d_d));
+
+ return 0;
+}
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..4cdf5fc
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,9 @@
+#! /usr/bin/env sh
+
+BPATH=/usr/bin
+IPATH=/usr/include
+LPATH=/usr/lib64
+
+OUT=$(mktemp -d)
+${BPATH}/hipcc -o "$OUT"/test test.cpp -I${IPATH} -L${LPATH} -lamdhip64
+"$OUT"/test
reply other threads:[~2026-06-14 12:46 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=178144116073.1.332152274305352524.rpms-rocwmma-3545b47a0713@fedoraproject.org \
--to=tom.rix@amd.com \
--cc=git-commits@fedoraproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox