public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed
* [rpms/rocblas] epel10: Merge branch 'rawhide' into epel10
@ 2026-06-11 14:33 Tom Rix
0 siblings, 0 replies; only message in thread
From: Tom Rix @ 2026-06-11 14:33 UTC (permalink / raw)
To: git-commits
A new commit has been pushed.
Repo : rpms/rocblas
Branch : epel10
Commit : f3593b072209cb0d490f34a4ed6dd74d4d09bf0f
Author : Tom Rix <Tom.Rix@amd.com>
Date : 2026-06-11T07:32:42-07:00
Stats : +1203/-95 in 17 file(s)
URL : https://src.fedoraproject.org/rpms/rocblas/c/f3593b072209cb0d490f34a4ed6dd74d4d09bf0f?branch=epel10
Log:
Merge branch 'rawhide' into epel10
---
diff --git a/.gitignore b/.gitignore
index 985899d..eada8e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@
/rocBLAS-7.1.0.tar.gz
/rocblas-7.1.1.tar.gz
/Tensile-7.1.1.tar.gz
+/Tensile-7.2.0.tar.gz
+/rocblas-7.2.0.tar.gz
+/tensile-7.2.0.tar.gz
diff --git a/0001-improve-the-warning-for-asm-caps-mismatches.patch b/0001-improve-the-warning-for-asm-caps-mismatches.patch
new file mode 100644
index 0000000..7e1cf2d
--- /dev/null
+++ b/0001-improve-the-warning-for-asm-caps-mismatches.patch
@@ -0,0 +1,43 @@
+From 393571163851ee7dd5508007dbd887cde2141c5c Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 8 Mar 2026 10:48:50 -0700
+Subject: [PATCH 1/6] improve the warning for asm caps mismatches
+
+This change prints out the differing key/value pairs when there
+is a difference between the derived and cached asm tables.
+
+Signed-off-by: Tom Rix <Tom.Rix@amd.com>
+---
+ shared/tensile/Tensile/Common.py | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/shared/tensile/Tensile/Common.py b/shared/tensile/Tensile/Common.py
+index 55954296e907..a4db814f542f 100644
+--- a/shared/tensile/Tensile/Common.py
++++ b/shared/tensile/Tensile/Common.py
+@@ -2011,6 +2011,14 @@ def locateExe( defaultPath, exeName ): # /opt/rocm/bin, hip-clang
+ return exePath
+ return None
+
++def PrintDiff(d1, d2):
++ keys = set(d1.keys() | d2.keys())
++ for key in keys:
++ v1 = d1.get(key)
++ v2 = d2.get(key)
++ if v1 != v2:
++ printWarning(f"{key}: {v1} != {v2}")
++
+ def GetAsmCaps(isaVersion: IsaVersion, hipVersion: SemanticVersion, cachedAsmCaps: Dict[IsaVersion, dict]) -> Dict[IsaVersion, dict]:
+ """ Determine assembler capabilities by testing short instructions sequences """
+ if globalParameters["AssemblerPath"] is not None:
+@@ -2133,6 +2141,7 @@ def GetAsmCaps(isaVersion: IsaVersion, hipVersion: SemanticVersion, cachedAsmCap
+ exitFlag = True
+ if exitFlag:
+ printWarning("Cached asm caps differ from derived asm caps for {}".format(isaVersion))
++ PrintDiff(derivedAsmCaps, cachedAsmCaps[isaVersion])
+ return derivedAsmCaps
+ else:
+ printWarning("Assembler not present, asm caps loaded from cache are unverified")
+--
+2.53.0
+
diff --git a/0001-tensile-add-cmake-arches.patch b/0001-tensile-add-cmake-arches.patch
index 79bb3c5..ffe5265 100644
--- a/0001-tensile-add-cmake-arches.patch
+++ b/0001-tensile-add-cmake-arches.patch
@@ -1,4 +1,4 @@
-From 4d9f28f69cbe468a95e06d8ca81746cab785e9da Mon Sep 17 00:00:00 2001
+From 97d064fb6acae49c3543b3eb88c211bde4c82401 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Thu, 30 Oct 2025 07:59:11 -0700
Subject: [PATCH] tensile add cmake arches
@@ -21,5 +21,5 @@ index e8a28d3bfeda..2147db4d5a93 100644
"gfx1201")
--
-2.51.0
+2.52.0
diff --git a/0001-tensile-fedora-gpus.patch b/0001-tensile-fedora-gpus.patch
index db96faf..9d55d0f 100644
--- a/0001-tensile-fedora-gpus.patch
+++ b/0001-tensile-fedora-gpus.patch
@@ -1,4 +1,4 @@
-From 3c17363a401de821280a9d4da6e0fba4490c88ce Mon Sep 17 00:00:00 2001
+From a31c5dca12d81f81f3aaf9629e8c6ea0660fd06d Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Thu, 30 Oct 2025 06:59:47 -0700
Subject: [PATCH] tensile fedora gpus
@@ -212,7 +212,7 @@ index cacc1848b7e0..41330270c618 100644
'v_dot4c_i32_i8': False,
'v_fma_f16': True,
diff --git a/Tensile/Common.py b/Tensile/Common.py
-index 86c6c5778293..d16ca848cbc8 100644
+index 9370c3ef09d4..8b6b43111877 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -248,9 +248,9 @@ globalParameters["MaxFileName"] = 64 # If a file name would be long
@@ -227,7 +227,7 @@ index 86c6c5778293..d16ca848cbc8 100644
(12,0,0), (12,0,1)] # assembly kernels writer supports these architectures
globalParameters["KeepBuildTmp"] = True # Do not remove build artifacts during the build process or build_tmp after build completes
-@@ -325,7 +325,7 @@ architectureMap = {
+@@ -326,7 +326,7 @@ architectureMap = {
'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14',
'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt',
'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'gfx1103',
@@ -236,7 +236,7 @@ index 86c6c5778293..d16ca848cbc8 100644
'gfx1200':'gfx1200',
'gfx1201':'gfx1201'
}
-@@ -2464,7 +2464,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
+@@ -2466,7 +2466,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
if os.name == "nt":
globalParameters["CurrentISA"] = (9,0,6)
printWarning("Failed to detect ISA so forcing (gfx906) on windows")
@@ -336,5 +336,5 @@ index a21e584d291a..cb1c085258c9 100644
template <typename MyProblem, typename MySolution = typename MyProblem::Solution>
--
-2.51.0
+2.52.0
diff --git a/0001-tensile-gfx1036.patch b/0001-tensile-gfx1036.patch
index 9b7b08e..b0a7f15 100644
--- a/0001-tensile-gfx1036.patch
+++ b/0001-tensile-gfx1036.patch
@@ -1,4 +1,4 @@
-From 26080c363fb030d822e0317d3d6093789d5b1c4a Mon Sep 17 00:00:00 2001
+From 0bd27b4c7bbd913967583158983a9b6077c956f5 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Fri, 7 Nov 2025 10:07:52 -0800
Subject: [PATCH] tensile gfx1036
@@ -162,7 +162,7 @@ index c4bdc4775300..ea9d7567b58e 100644
'v_dot4c_i32_i8': False,
'v_fma_f16': True,
diff --git a/Tensile/Common.py b/Tensile/Common.py
-index 5ab3f6381fcf..157ac5abd233 100644
+index 140d4dbe58c2..a7d2ab5cd760 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -248,7 +248,7 @@ globalParameters["MaxFileName"] = 64 # If a file name would be long
@@ -174,7 +174,7 @@ index 5ab3f6381fcf..157ac5abd233 100644
(11,0,0), (11,0,1), (11,0,2), (11,0,3),
(11,5,0), (11,5,1), (11,5,2), (11,5,3),
(12,0,0), (12,0,1)] # assembly kernels writer supports these architectures
-@@ -323,7 +323,7 @@ architectureMap = {
+@@ -324,7 +324,7 @@ architectureMap = {
'gfx942':'aquavanjaram942', 'gfx942:xnack+':'aquavanjaram942', 'gfx942:xnack-':'aquavanjaram942',
'gfx950':'gfx950', 'gfx950:xnack+':'gfx950', 'gfx950:xnack-':'gfx950',
'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14',
@@ -238,5 +238,5 @@ index 77c9ced2cc35..852c41f60e8d 100644
return "TensileLibrary_*_gfx1100";
case LazyLoadingInit::gfx1101:
--
-2.51.0
+2.52.0
diff --git a/0001-tensile-gfx1153.patch b/0001-tensile-gfx1153.patch
index 8dd92bc..acf9bf6 100644
--- a/0001-tensile-gfx1153.patch
+++ b/0001-tensile-gfx1153.patch
@@ -1,4 +1,4 @@
-From 984dd95e0ab0458266a5375510524072cedbb11b Mon Sep 17 00:00:00 2001
+From 1c66a051e819a40e9bbe6bdd8d54124baf001f00 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Thu, 30 Oct 2025 07:15:18 -0700
Subject: [PATCH] tensile gfx1153
@@ -66,7 +66,7 @@ index 41330270c618..c4bdc4775300 100644
'v_dot4c_i32_i8': False,
'v_fma_f16': True,
diff --git a/Tensile/Common.py b/Tensile/Common.py
-index d16ca848cbc8..ad3e8a26b5db 100644
+index 8b6b43111877..a4a8bb524da0 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -250,7 +250,7 @@ globalParameters["SupportedISA"] = [(8,0,3),
@@ -78,7 +78,7 @@ index d16ca848cbc8..ad3e8a26b5db 100644
(12,0,0), (12,0,1)] # assembly kernels writer supports these architectures
globalParameters["KeepBuildTmp"] = True # Do not remove build artifacts during the build process or build_tmp after build completes
-@@ -325,7 +325,7 @@ architectureMap = {
+@@ -326,7 +326,7 @@ architectureMap = {
'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14',
'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt',
'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'gfx1103',
@@ -87,7 +87,7 @@ index d16ca848cbc8..ad3e8a26b5db 100644
'gfx1200':'gfx1200',
'gfx1201':'gfx1201'
}
-@@ -2464,7 +2464,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
+@@ -2466,7 +2466,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
if os.name == "nt":
globalParameters["CurrentISA"] = (9,0,6)
printWarning("Failed to detect ISA so forcing (gfx906) on windows")
@@ -161,5 +161,5 @@ index cb1c085258c9..77c9ced2cc35 100644
return "";
}
--
-2.51.0
+2.52.0
diff --git a/0001-tensile-ignore-cache-check.patch b/0001-tensile-ignore-cache-check.patch
index 55b861b..abe4231 100644
--- a/0001-tensile-ignore-cache-check.patch
+++ b/0001-tensile-ignore-cache-check.patch
@@ -1,4 +1,4 @@
-From f6f1389482fb882c5414f6a74b4f289f1c9c951f Mon Sep 17 00:00:00 2001
+From c0eae8b2bbe0a94fd6961d0802edf8c6f68227d0 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Thu, 30 Oct 2025 07:23:52 -0700
Subject: [PATCH] tensile ignore cache check
@@ -8,10 +8,10 @@ Subject: [PATCH] tensile ignore cache check
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/Tensile/Common.py b/Tensile/Common.py
-index ad3e8a26b5db..5ab3f6381fcf 100644
+index a4a8bb524da0..140d4dbe58c2 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
-@@ -2103,17 +2103,7 @@ def GetAsmCaps(isaVersion: IsaVersion, hipVersion: SemanticVersion, cachedAsmCap
+@@ -2104,17 +2104,7 @@ def GetAsmCaps(isaVersion: IsaVersion, hipVersion: SemanticVersion, cachedAsmCap
derivedAsmCaps["SupportedSource"] = True
@@ -31,5 +31,5 @@ index ad3e8a26b5db..5ab3f6381fcf 100644
# check if derived caps matches asm cap cache
if not ignoreCacheCheck:
--
-2.51.0
+2.52.0
diff --git a/0001-tensile-set-default-paths.patch b/0001-tensile-set-default-paths.patch
index c90c2ea..f00414b 100644
--- a/0001-tensile-set-default-paths.patch
+++ b/0001-tensile-set-default-paths.patch
@@ -1,4 +1,4 @@
-From c08f67b1248ddbf5d0c2b188b85a9374f5f12c20 Mon Sep 17 00:00:00 2001
+From 0a4fb93121619f3fae1fc3ebba4f64a8d053ce93 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Sat, 20 Sep 2025 08:43:09 -0700
Subject: [PATCH] tensile set default paths
@@ -32,5 +32,5 @@ index ee9cbeeea9f3..c328b74bcbc2 100644
searchPaths.extend(
[
--
-2.51.0
+2.52.0
diff --git a/0002-add-generic-gpu-targets.patch b/0002-add-generic-gpu-targets.patch
new file mode 100644
index 0000000..1c94256
--- /dev/null
+++ b/0002-add-generic-gpu-targets.patch
@@ -0,0 +1,592 @@
+From a8a0e23fbaf6aacb42f7b505feb1ab8f06adbbea Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 8 Mar 2026 01:32:28 +0000
+Subject: [PATCH 2/6] add generic gpu targets
+
+To support generic gpu targets ex/ -DGPU_TARGETS=gfx11-generic.
+
+Tensile does not have support for every possible gpu target. Instead
+of adding them piecemeal, provide support for all the generic targets.
+
+In Common.py overload int tuple for SupportedISA, where if the last
+value is negative, then this is a generic isa.
+Ex
+ (10,3,-1) -> gfx10-3-generic
+ (11,0,-1) -> gfx11-generic
+
+In AsmCaps, cut-n-paste generic tables from a close existing table.
+ex/ (10,3,0) was used for (10,3,-1). Then fix the values based on
+the derived vs cached warnings during a build.
+
+Add new mapping where appropriate.
+
+Signed-off-by: Tom Rix <Tom.Rix@amd.com>
+---
+ shared/tensile/Tensile/AsmCaps.py | 264 ++++++++++++++++++
+ shared/tensile/Tensile/Common.py | 57 +++-
+ .../cmake/TensileSupportedArchitectures.cmake | 9 +-
+ .../Source/lib/include/Tensile/AMDGPU.hpp | 44 ++-
+ .../include/Tensile/PlaceholderLibrary.hpp | 18 ++
+ 5 files changed, 375 insertions(+), 17 deletions(-)
+
+diff --git a/shared/tensile/Tensile/AsmCaps.py b/shared/tensile/Tensile/AsmCaps.py
+index 4bed5cd9f4ff..ed71b8dee02d 100644
+--- a/shared/tensile/Tensile/AsmCaps.py
++++ b/shared/tensile/Tensile/AsmCaps.py
+@@ -169,6 +169,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict:
+ 'v_mov_b64': False,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
++ (9, 0, -1): {'HasAddLshl': True,
++ 'HasAtomicAdd': False,
++ 'HasDirectToLdsDest': False,
++ 'HasDirectToLdsNoDest': True,
++ 'HasExplicitCO': True,
++ 'HasExplicitNC': False,
++ 'HasGLCModifier': True,
++ 'HasNTModifier': False,
++ 'HasLshlOr': True,
++ 'HasMFMA': False,
++ 'HasMFMA_b8': False,
++ 'HasMFMA_bf16_1k': False,
++ 'HasMFMA_bf16_original': False,
++ 'HasMFMA_constSrc': False,
++ 'HasMFMA_f64': False,
++ 'HasMFMA_f8': False,
++ 'HasMFMA_i8_908': False,
++ 'HasMFMA_i8_940': False,
++ 'HasMFMA_vgpr': False,
++ 'HasMFMA_xf32': False,
++ 'HasSMulHi': True,
++ 'HasWMMA': False,
++ 'KernargPreloading': False,
++ 'MaxLgkmcnt': 15,
++ 'MaxVmcnt': 63,
++ 'SupportedISA': True,
++ 'SupportedSource': True,
++ 'VOP3v_dot4_i32_i8': False,
++ 'v_dot2_f32_f16': False,
++ 'v_dot2c_f32_f16': False,
++ 'v_dot4_i32_i8': False,
++ 'v_dot4c_i32_i8': False,
++ 'v_fma_f16': True,
++ 'v_fma_f32': True,
++ 'v_fma_f64': True,
++ 'v_fma_mix_f32': False,
++ 'v_fmac_f16': False,
++ 'v_fmac_f32': False,
++ 'v_mac_f16': True,
++ 'v_mac_f32': True,
++ 'v_mad_mix_f32': False,
++ 'v_mov_b64': False,
++ 'v_pk_fma_f16': True,
++ 'v_pk_fmac_f16': False},
+ (9, 0, 6): {'HasAddLshl': True,
+ 'HasAtomicAdd': False,
+ 'HasDirectToLdsDest': False,
+@@ -389,6 +433,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict:
+ 'v_mov_b64': True,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
++ (9, 4, -1): {'HasAddLshl': True,
++ 'HasAtomicAdd': True,
++ 'HasDirectToLdsDest': False,
++ 'HasDirectToLdsNoDest': True,
++ 'HasExplicitCO': True,
++ 'HasExplicitNC': False,
++ 'HasGLCModifier': False,
++ 'HasNTModifier': True,
++ 'HasLshlOr': True,
++ 'HasMFMA': True,
++ 'HasMFMA_b8': False,
++ 'HasMFMA_bf16_1k': True,
++ 'HasMFMA_bf16_original': False,
++ 'HasMFMA_constSrc': True,
++ 'HasMFMA_f64': True,
++ 'HasMFMA_f8': False,
++ 'HasMFMA_i8_908': False,
++ 'HasMFMA_i8_940': True,
++ 'HasMFMA_vgpr': True,
++ 'HasMFMA_xf32': False,
++ 'HasSMulHi': True,
++ 'HasWMMA': False,
++ 'KernargPreloading': True,
++ 'MaxLgkmcnt': 15,
++ 'MaxVmcnt': 63,
++ 'SupportedISA': True,
++ 'SupportedSource': True,
++ 'VOP3v_dot4_i32_i8': True,
++ 'v_dot2_f32_f16': True,
++ 'v_dot2c_f32_f16': True,
++ 'v_dot4_i32_i8': False,
++ 'v_dot4c_i32_i8': True,
++ 'v_fma_f16': True,
++ 'v_fma_f32': True,
++ 'v_fma_f64': True,
++ 'v_fma_mix_f32': True,
++ 'v_fmac_f16': False,
++ 'v_fmac_f32': True,
++ 'v_mac_f16': True,
++ 'v_mac_f32': False,
++ 'v_mad_mix_f32': False,
++ 'v_mov_b64': True,
++ 'v_pk_fma_f16': True,
++ 'v_pk_fmac_f16': False},
+ (9, 5, 0): {'HasAddLshl': True,
+ 'HasAtomicAdd': True,
+ 'HasDirectToLdsDest': False,
+@@ -477,6 +565,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict:
+ 'v_mov_b64': False,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
++ (10, 1, -1): {'HasAddLshl': True,
++ 'HasAtomicAdd': False,
++ 'HasDirectToLdsDest': False,
++ 'HasDirectToLdsNoDest': True,
++ 'HasExplicitCO': True,
++ 'HasExplicitNC': True,
++ 'HasGLCModifier': True,
++ 'HasNTModifier': False,
++ 'HasLshlOr': True,
++ 'HasMFMA': False,
++ 'HasMFMA_b8': False,
++ 'HasMFMA_bf16_1k': False,
++ 'HasMFMA_bf16_original': False,
++ 'HasMFMA_constSrc': False,
++ 'HasMFMA_f64': False,
++ 'HasMFMA_f8': False,
++ 'HasMFMA_i8_908': False,
++ 'HasMFMA_i8_940': False,
++ 'HasMFMA_vgpr': False,
++ 'HasMFMA_xf32': False,
++ 'HasSMulHi': True,
++ 'HasWMMA': False,
++ 'KernargPreloading': False,
++ 'MaxLgkmcnt': 15,
++ 'MaxVmcnt': 63,
++ 'SupportedISA': True,
++ 'SupportedSource': True,
++ 'VOP3v_dot4_i32_i8': False,
++ 'v_dot2_f32_f16': False,
++ 'v_dot2c_f32_f16': False,
++ 'v_dot4_i32_i8': False,
++ 'v_dot4c_i32_i8': False,
++ 'v_fma_f16': True,
++ 'v_fma_f32': True,
++ 'v_fma_f64': True,
++ 'v_fma_mix_f32': True,
++ 'v_fmac_f16': False,
++ 'v_fmac_f32': True,
++ 'v_mac_f16': False,
++ 'v_mac_f32': True,
++ 'v_mad_mix_f32': False,
++ 'v_mov_b64': False,
++ 'v_pk_fma_f16': True,
++ 'v_pk_fmac_f16': False},
+ (10, 1, 1): {'HasAddLshl': True,
+ 'HasAtomicAdd': False,
+ 'HasDirectToLdsDest': False,
+@@ -609,6 +741,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict:
+ 'v_mov_b64': False,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
++ (10, 3, -1): {'HasAddLshl': True,
++ 'HasAtomicAdd': False,
++ 'HasDirectToLdsDest': False,
++ 'HasDirectToLdsNoDest': True,
++ 'HasExplicitCO': True,
++ 'HasExplicitNC': True,
++ 'HasGLCModifier': True,
++ 'HasNTModifier': False,
++ 'HasLshlOr': True,
++ 'HasMFMA': False,
++ 'HasMFMA_b8': False,
++ 'HasMFMA_bf16_1k': False,
++ 'HasMFMA_bf16_original': False,
++ 'HasMFMA_constSrc': False,
++ 'HasMFMA_f64': False,
++ 'HasMFMA_f8': False,
++ 'HasMFMA_i8_908': False,
++ 'HasMFMA_i8_940': False,
++ 'HasMFMA_vgpr': False,
++ 'HasMFMA_xf32': False,
++ 'HasSMulHi': True,
++ 'HasWMMA': False,
++ 'KernargPreloading': False,
++ 'MaxLgkmcnt': 15,
++ 'MaxVmcnt': 63,
++ 'SupportedISA': True,
++ 'SupportedSource': True,
++ 'VOP3v_dot4_i32_i8': True,
++ 'v_dot2_f32_f16': True,
++ 'v_dot2c_f32_f16': True,
++ 'v_dot4_i32_i8': False,
++ 'v_dot4c_i32_i8': True,
++ 'v_fma_f16': True,
++ 'v_fma_f32': True,
++ 'v_fma_f64': True,
++ 'v_fma_mix_f32': True,
++ 'v_fmac_f16': False,
++ 'v_fmac_f32': True,
++ 'v_mac_f16': False,
++ 'v_mac_f32': False,
++ 'v_mad_mix_f32': False,
++ 'v_mov_b64': False,
++ 'v_pk_fma_f16': True,
++ 'v_pk_fmac_f16': False},
+ (10, 3, 1): {'HasAddLshl': True,
+ 'HasAtomicAdd': False,
+ 'HasDirectToLdsDest': False,
+@@ -917,6 +1093,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict:
+ 'v_mov_b64': False,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
++ (11, 0, -1): {'HasAddLshl': True,
++ 'HasAtomicAdd': True,
++ 'HasDirectToLdsDest': False,
++ 'HasDirectToLdsNoDest': False,
++ 'HasExplicitCO': True,
++ 'HasExplicitNC': True,
++ 'HasGLCModifier': True,
++ 'HasNTModifier': False,
++ 'HasLshlOr': True,
++ 'HasMFMA': False,
++ 'HasMFMA_b8': False,
++ 'HasMFMA_bf16_1k': False,
++ 'HasMFMA_bf16_original': False,
++ 'HasMFMA_constSrc': False,
++ 'HasMFMA_f64': False,
++ 'HasMFMA_f8': False,
++ 'HasMFMA_i8_908': False,
++ 'HasMFMA_i8_940': False,
++ 'HasMFMA_vgpr': False,
++ 'HasMFMA_xf32': False,
++ 'HasSMulHi': True,
++ 'HasWMMA': True,
++ 'KernargPreloading': False,
++ 'MaxLgkmcnt': 15,
++ 'MaxVmcnt': 63,
++ 'SupportedISA': True,
++ 'SupportedSource': True,
++ 'VOP3v_dot4_i32_i8': True,
++ 'v_dot2_f32_f16': True,
++ 'v_dot2c_f32_f16': True,
++ 'v_dot4_i32_i8': False,
++ 'v_dot4c_i32_i8': False,
++ 'v_fma_f16': True,
++ 'v_fma_f32': True,
++ 'v_fma_f64': True,
++ 'v_fma_mix_f32': True,
++ 'v_fmac_f16': False,
++ 'v_fmac_f32': True,
++ 'v_mac_f16': False,
++ 'v_mac_f32': False,
++ 'v_mad_mix_f32': False,
++ 'v_mov_b64': False,
++ 'v_pk_fma_f16': True,
++ 'v_pk_fmac_f16': False},
+ (11, 0, 1): {'HasAddLshl': True,
+ 'HasAtomicAdd': True,
+ 'HasDirectToLdsDest': False,
+@@ -1269,6 +1489,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict:
+ 'v_mov_b64': False,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
++ (12, 0, -1): {'HasAddLshl': True,
++ 'HasAtomicAdd': False,
++ 'HasDirectToLdsDest': False,
++ 'HasDirectToLdsNoDest': False,
++ 'HasExplicitCO': True,
++ 'HasExplicitNC': True,
++ 'HasGLCModifier': False,
++ 'HasNTModifier': False,
++ 'HasLshlOr': True,
++ 'HasMFMA': False,
++ 'HasMFMA_b8': False,
++ 'HasMFMA_bf16_1k': False,
++ 'HasMFMA_bf16_original': False,
++ 'HasMFMA_constSrc': False,
++ 'HasMFMA_f64': False,
++ 'HasMFMA_f8': False,
++ 'HasMFMA_i8_908': False,
++ 'HasMFMA_i8_940': False,
++ 'HasMFMA_vgpr': False,
++ 'HasMFMA_xf32': False,
++ 'HasSMulHi': True,
++ 'HasWMMA': False,
++ 'KernargPreloading': False,
++ 'MaxLgkmcnt': 15,
++ 'MaxVmcnt': 63,
++ 'SupportedISA': True,
++ 'SupportedSource': True,
++ 'VOP3v_dot4_i32_i8': True,
++ 'v_dot2_f32_f16': True,
++ 'v_dot2c_f32_f16': False,
++ 'v_dot4_i32_i8': False,
++ 'v_dot4c_i32_i8': False,
++ 'v_fma_f16': True,
++ 'v_fma_f32': True,
++ 'v_fma_f64': True,
++ 'v_fma_mix_f32': True,
++ 'v_fmac_f16': False,
++ 'v_fmac_f32': True,
++ 'v_mac_f16': False,
++ 'v_mac_f32': False,
++ 'v_mad_mix_f32': False,
++ 'v_mov_b64': False,
++ 'v_pk_fma_f16': True,
++ 'v_pk_fmac_f16': False},
+ (12, 0, 1): {'HasAddLshl': True,
+ 'HasAtomicAdd': False,
+ 'HasDirectToLdsDest': False,
+diff --git a/shared/tensile/Tensile/Common.py b/shared/tensile/Tensile/Common.py
+index a4db814f542f..ef13633f4571 100644
+--- a/shared/tensile/Tensile/Common.py
++++ b/shared/tensile/Tensile/Common.py
+@@ -246,12 +246,12 @@ globalParameters["NumMergedFiles"] = 1 # The number of files that ker
+
+ globalParameters["MaxFileName"] = 64 # If a file name would be longer than this, shorten it with a hash.
+ globalParameters["SupportedISA"] = [(8,0,3),
+- (9,0,0), (9,0,6), (9,0,8), (9,0,10), (9,0,12),
+- (9,4,2), (9,5,0),
+- (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,2), (10,3,3), (10,3,4), (10,3,5), (10,3,6),
+- (11,0,0), (11,0,1), (11,0,2), (11,0,3),
++ (9,0,0), (9,0,6), (9,0,8), (9,0,10), (9,0,12), (9,0,-1),
++ (9,4,2), (9,4,-1), (9,5,0),
++ (10,1,0), (10,1,1), (10,1,2), (10,1,-1), (10,3,0), (10,3,1), (10,3,2), (10,3,3), (10,3,4), (10,3,5), (10,3,6), (10,3,-1),
++ (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,0,-1),
+ (11,5,0), (11,5,1), (11,5,2), (11,5,3),
+- (12,0,0), (12,0,1), (12,5,0)] # assembly kernels writer supports these architectures
++ (12,0,0), (12,0,1), (12,5,0), (12,0,-1)] # assembly kernels writer supports these architectures
+
+ globalParameters["KeepBuildTmp"] = True # Do not remove build artifacts during the build process or build_tmp after build completes
+ globalParameters["GenerateManifestAndExit"] = False # Output manifest file with list of expected library objects and exit
+@@ -320,15 +320,15 @@ architectureMap = {
+ 'gfx803':'r9nano', 'gfx900':'vega10', 'gfx900:xnack-':'vega10', 'gfx90c':'vega10',
+ 'gfx906':'vega20', 'gfx906:xnack+':'vega20', 'gfx906:xnack-':'vega20',
+ 'gfx908':'arcturus','gfx908:xnack+':'arcturus', 'gfx908:xnack-':'arcturus',
+- 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran',
+- 'gfx942':'aquavanjaram942', 'gfx942:xnack+':'aquavanjaram942', 'gfx942:xnack-':'aquavanjaram942',
++ 'gfx90a':'aldebaran', 'gfx90a:xnack+':'aldebaran', 'gfx90a:xnack-':'aldebaran', 'gfx9-generic':'gfx9-generic',
++ 'gfx942':'aquavanjaram942', 'gfx942:xnack+':'aquavanjaram942', 'gfx942:xnack-':'aquavanjaram942', 'gfx9-4-generic':'gfx9-4-generic',
+ 'gfx950':'gfx950', 'gfx950:xnack+':'gfx950', 'gfx950:xnack-':'gfx950',
+- 'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14',
+- 'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1033':'van gogh', 'gfx1034':'navi24', 'gfx1035':'rembrandt', 'gfx1036':'raphael',
+- 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'gfx1103',
++ 'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14', 'gfx10-1-generic':'gfx10-1-generic',
++ 'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1033':'van gogh', 'gfx1034':'navi24', 'gfx1035':'rembrandt', 'gfx1036':'raphael', 'gfx10-3-generic':'gfx10-3-generic',
++ 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'gfx1103', 'gfx11-generic':'gfx11-generic',
+ 'gfx1150':'strixpoint', 'gfx1151':'strixhalo', 'gfx1152':'gfx1152', 'gfx1153':'gfx1153',
+ 'gfx1200':'gfx1200',
+- 'gfx1201':'gfx1201',
++ 'gfx1201':'gfx1201', 'gfx12-generic':'gfx12-generic',
+ 'gfx1250':'gfx1250'
+ }
+
+@@ -2202,6 +2202,21 @@ def tryAssembler(isaVersion, asmString, debug=False, *options):
+
+ def gfxArch(name: str) -> Optional[IsaVersion]:
+ import re
++
++ # Handle special case for generic architectures like 'gfx10-3-generic'
++ generic_match = re.search(r'gfx([0-9]+)-([0-9]+)-generic', name)
++ if generic_match:
++ major = int(generic_match.group(1))
++ minor = int(generic_match.group(2))
++ return (major, minor, -1) # step=-1 to indicate generic
++
++ # Handle special case for generic architectures like 'gfx11-generic'
++ generic_match = re.search(r'gfx([0-9]+)-generic', name)
++ if generic_match:
++ major = int(generic_match.group(1))
++ return (major, 0, -1) # step=-1 to indicate generic, minor=0
++
++ # Handle regular architectures like 'gfx900', 'gfx803' etc.
+ match = re.search(r'gfx([0-9a-fA-F]{3,})', name)
+ if not match: return None
+
+@@ -2220,11 +2235,23 @@ def gfxArch(name: str) -> Optional[IsaVersion]:
+ return rv
+
+ def gfxName(arch):
+- # convert last digit to hex because reasons
+- name = str(arch[0]) + str(arch[1]) + ('%x' % arch[2])
++ # If arch[2] is negative, this is a generic target
++ if arch[2] < 0:
++ if arch[0] == 9:
++ if arch[1] == 4:
++ name = str(arch[0]) + '-' + str(arch[1]) + '-generic'
++ else:
++ name = str(arch[0]) + '-generic'
++ elif arch[0] == 10:
++ name = str(arch[0]) + '-' + str(arch[1]) + '-generic'
++ else:
++ name = str(arch[0]) + '-generic'
++ else:
++ # The normal case
++ # convert last digit to hex because reasons
++ name = str(arch[0]) + str(arch[1]) + ('%x' % arch[2])
+ return 'gfx' + ''.join(map(str,name))
+
+-
+ def detectIsaWindows(output):
+ i = 0
+ for line in output:
+@@ -2476,7 +2503,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
+ if os.name == "nt":
+ globalParameters["CurrentISA"] = (9,0,6)
+ printWarning("Failed to detect ISA so forcing (gfx906) on windows")
+- isasWithDisabledHWMonitor = ((9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (11,5,3), (12,0,0), (12,0,1))
++ isasWithDisabledHWMonitor = ((9,0,-1), (9,4,2), (9,4,-1), (9,5,0), (10,1,-1), (10,3,-1), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (11,5,3), (11,0,-1), (12,0,0), (12,0,1), (12,0,-1))
+ if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor:
+ isaString = ', '.join(map(gfxName, isasWithDisabledHWMonitor))
+ printWarning(f"HardwareMonitor currently disabled for {isaString}")
+diff --git a/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake b/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake
+index 56e3057d8511..2139617551c4 100644
+--- a/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake
++++ b/shared/tensile/Tensile/Source/cmake/TensileSupportedArchitectures.cmake
+@@ -36,11 +36,14 @@ if(NOT BUILD_ADDRESS_SANITIZER)
+ "gfx906"
+ "gfx908"
+ "gfx90a"
++ "gfx9-generic"
+ "gfx942"
++ "gfx9-4-generic"
+ "gfx950"
+ "gfx1010"
+ "gfx1011"
+ "gfx1012"
++ "gfx10-1-generic"
+ "gfx1030"
+ "gfx1031"
+ "gfx1032"
+@@ -48,6 +51,7 @@ if(NOT BUILD_ADDRESS_SANITIZER)
+ "gfx1034"
+ "gfx1035"
+ "gfx1036"
++ "gfx10-3-generic"
+ "gfx1100"
+ "gfx1101"
+ "gfx1102"
+@@ -56,9 +60,12 @@ if(NOT BUILD_ADDRESS_SANITIZER)
+ "gfx1151"
+ "gfx1152"
+ "gfx1153"
++ "gfx11-generic"
+ "gfx1200"
+ "gfx1201"
+- "gfx1250")
++ "gfx1250"
++ "gfx12-generic"
++ )
+
+ set(SUPPORTED_ARCHITECTURES ${BASE_ARCHITECTURES})
+ list(APPEND SUPPORTED_ARCHITECTURES
+diff --git a/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
+index f2bf41b507a3..4b71db91f814 100644
+--- a/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
++++ b/shared/tensile/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
+@@ -83,7 +83,13 @@ namespace Tensile
+ gfx1153 = 1153,
+ gfx1200 = 1200,
+ gfx1201 = 1201,
+- gfx1250 = 1250
++ gfx1250 = 1250,
++ gfx9_generic = -900,
++ gfx9_4_generic = -940,
++ gfx10_1_generic = -1010,
++ gfx10_3_generic = -1030,
++ gfx11_generic = -1100,
++ gfx12_generic = -1200,
+ };
+
+ static std::string toString(Processor p)
+@@ -148,6 +154,18 @@ namespace Tensile
+ return "gfx1201";
+ case AMDGPU::Processor::gfx1250:
+ return "gfx1250";
++ case AMDGPU::Processor::gfx9_generic:
++ return "gfx9-generic";
++ case AMDGPU::Processor::gfx9_4_generic:
++ return "gfx9-4-generic";
++ case AMDGPU::Processor::gfx10_1_generic:
++ return "gfx10-1-generic";
++ case AMDGPU::Processor::gfx10_3_generic:
++ return "gfx10-3-generic";
++ case AMDGPU::Processor::gfx11_generic:
++ return "gfx11-generic";
++ case AMDGPU::Processor::gfx12_generic:
++ return "gfx12-generic";
+ }
+ return "";
+ }
+@@ -270,6 +288,30 @@ namespace Tensile
+ {
+ return AMDGPU::Processor::gfx1250;
+ }
++ else if(deviceString.find("gfx9-generic") != std::string::npos)
++ {
++ return AMDGPU::Processor::gfx9_generic;
++ }
++ else if(deviceString.find("gfx9-4-generic") != std::string::npos)
++ {
++ return AMDGPU::Processor::gfx9_4_generic;
++ }
++ else if(deviceString.find("gfx10-1-generic") != std::string::npos)
++ {
++ return AMDGPU::Processor::gfx10_1_generic;
++ }
++ else if(deviceString.find("gfx10-3-generic") != std::string::npos)
++ {
++ return AMDGPU::Processor::gfx10_3_generic;
++ }
++ else if(deviceString.find("gfx11-generic") != std::string::npos)
++ {
++ return AMDGPU::Processor::gfx11_generic;
++ }
++ else if(deviceString.find("gfx12-generic") != std::string::npos)
++ {
++ return AMDGPU::Processor::gfx12_generic;
++ }
+ else
+ {
+ return static_cast<AMDGPU::Processor>(0);
+diff --git a/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
+index 9e21d4ac0805..421b21c5f7dd 100644
+--- a/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
++++ b/shared/tensile/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
+@@ -68,6 +68,12 @@ namespace Tensile
+ gfx1200,
+ gfx1201,
+ gfx1250,
++ gfx9_generic,
++ gfx9_4_generic,
++ gfx10_1_generic,
++ gfx10_3_generic,
++ gfx11_generic,
++ gfx12_generic,
+ All
+ };
+
+@@ -136,6 +142,18 @@ namespace Tensile
+ return "TensileLibrary_*_gfx1201";
+ case LazyLoadingInit::gfx1250:
+ return "TensileLibrary_*_gfx1250";
++ case LazyLoadingInit::gfx9_generic:
++ return "TensileLibrary_*_gfx9-generic";
++ case LazyLoadingInit::gfx9_4_generic:
++ return "TensileLibrary_*_gfx9-4-generic";
++ case LazyLoadingInit::gfx10_1_generic:
++ return "TensileLibrary_*_gfx10-1-generic";
++ case LazyLoadingInit::gfx10_3_generic:
++ return "TensileLibrary_*_gfx10-3-generic";
++ case LazyLoadingInit::gfx11_generic:
++ return "TensileLibrary_*_gfx11-generic";
++ case LazyLoadingInit::gfx12_generic:
++ return "TensileLibrary_*_gfx12-generic";
+ case LazyLoadingInit::None:
+ return "";
+ }
+--
+2.53.0
+
diff --git a/0003-improve-fallback-name-to-handle-generics.patch b/0003-improve-fallback-name-to-handle-generics.patch
new file mode 100644
index 0000000..792eae9
--- /dev/null
+++ b/0003-improve-fallback-name-to-handle-generics.patch
@@ -0,0 +1,32 @@
+From fc89fe29ed8f4ea26aa6041d6655c1c9b46715dd Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 8 Mar 2026 13:38:28 -0700
+Subject: [PATCH 3/6] improve fallback name to handle generics
+
+The archName can be of the form gfx90a-xnack{+,-} and this function
+determines the fallback is gfx90a. However when the archName is
+a generic, ex gfx11-generic, the entire name must be used. So
+check if the name ends with -generic and skip splitting.
+
+Signed-off-by: Tom Rix <Tom.Rix@amd.com>
+---
+ shared/tensile/Tensile/TensileCreateLibrary.py | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/shared/tensile/Tensile/TensileCreateLibrary.py b/shared/tensile/Tensile/TensileCreateLibrary.py
+index cfb04a938d2c..af55b172422a 100644
+--- a/shared/tensile/Tensile/TensileCreateLibrary.py
++++ b/shared/tensile/Tensile/TensileCreateLibrary.py
+@@ -1001,7 +1001,8 @@ def addFallback(masterLibraries: Dict[str, MasterSolutionLibrary]) -> None:
+ value.insert(masterLibraries["fallback"])
+
+ for archName in archs:
+- archName = archName.split("-", 1)[0]
++ if not archName.endswith("-generic"):
++ archName = archName.split("-", 1)[0]
+ if archName not in masterLibraries:
+ tPrint(1, "Using fallback for arch: " + archName)
+ masterLibraries[archName] = masterLibraries["fallback"]
+--
+2.53.0
+
diff --git a/0004-generic-arches-need-a-solution-index.patch b/0004-generic-arches-need-a-solution-index.patch
new file mode 100644
index 0000000..1a7046b
--- /dev/null
+++ b/0004-generic-arches-need-a-solution-index.patch
@@ -0,0 +1,45 @@
+From 83e07d10fff2097878bfbae3e956c2f5177aa4ed Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 8 Mar 2026 16:21:07 -0700
+Subject: [PATCH 4/6] generic arches need a solution index
+
+So there is no overlap with the regular gpu indices, pick
+a shift value that does not overlap.
+
+(9 << 29) >> 18 = 18432
+
+Signed-off-by: Tom Rix <Tom.Rix@amd.com>
+---
+ shared/tensile/Tensile/SolutionLibrary.py | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/shared/tensile/Tensile/SolutionLibrary.py b/shared/tensile/Tensile/SolutionLibrary.py
+index 0c7b6428d624..e7c4b7457737 100644
+--- a/shared/tensile/Tensile/SolutionLibrary.py
++++ b/shared/tensile/Tensile/SolutionLibrary.py
+@@ -255,7 +255,7 @@ class MasterSolutionLibrary:
+ """Maps hex characters from gfx name to an index.
+
+ Given a gfx name of the form gfx[0-9a-f]*, map the characters following
+- gfx from hex to int and left shift the integer by 18.
++ gfx from hex to int and left shift the integer by 18 (or 29 for generic architectures).
+
+ Args:
+ architectureName: The gfx name (or fallback).
+@@ -273,7 +273,12 @@ class MasterSolutionLibrary:
+ archString = re.search('(?<=gfx)[0-9a-f]*', architectureName)
+ if archString is not None:
+ archLiteral = archString.group(0)
+- archval = (int(archLiteral, 16) << 18)
++ # Use left shift of 29 for generic architectures, 18 otherwise
++ if architectureName.endswith("-generic"):
++ shift_bits = 29
++ else:
++ shift_bits = 18
++ archval = (int(archLiteral, 16) << shift_bits)
+ # Check for duplicate architecture values
+ if archval >= 0 and not archval in cls.ArchitectureSet:
+ cls.ArchitectureSet.add(archval)
+--
+2.53.0
+
diff --git a/0005-rocblas-add-rocblas_internal_get_generic_arch_name.patch b/0005-rocblas-add-rocblas_internal_get_generic_arch_name.patch
new file mode 100644
index 0000000..ec3518c
--- /dev/null
+++ b/0005-rocblas-add-rocblas_internal_get_generic_arch_name.patch
@@ -0,0 +1,99 @@
+From d9c3f1d52a6a35d69bb0dc9a69dd67521156b032 Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 29 Mar 2026 10:48:52 -0700
+Subject: [PATCH 5/6] rocblas add rocblas_internal_get_generic_arch_name
+
+---
+ .../rocblas/library/src/include/utility.hpp | 9 +++
+ .../rocblas/library/src/rocblas_auxiliary.cpp | 55 +++++++++++++++++++
+ 2 files changed, 64 insertions(+)
+
+diff --git a/projects/rocblas/library/src/include/utility.hpp b/projects/rocblas/library/src/include/utility.hpp
+index 3ec82e012822..a9f7371b07c2 100644
+--- a/projects/rocblas/library/src/include/utility.hpp
++++ b/projects/rocblas/library/src/include/utility.hpp
+@@ -806,6 +806,15 @@ std::string rocblas_internal_get_arch_name(int device);
+ // for internal use during testing, fetch arch name
+ ROCBLAS_INTERNAL_EXPORT std::string rocblas_internal_get_arch_name();
+
++// for internal use
++std::string rocblas_internal_get_arch_name(int device);
++
++// for internal use
++ROCBLAS_INTERNAL_EXPORT std::string rocblas_internal_get_generic_arch_name();
++
++// for internal use
++std::string rocblas_internal_get_generic_arch_name(int device);
++
+ // for internal use, fetch xnack mode
+ std::string rocblas_internal_get_xnack_mode();
+
+diff --git a/projects/rocblas/library/src/rocblas_auxiliary.cpp b/projects/rocblas/library/src/rocblas_auxiliary.cpp
+index 37a4bc5b097c..4524f19ed056 100644
+--- a/projects/rocblas/library/src/rocblas_auxiliary.cpp
++++ b/projects/rocblas/library/src/rocblas_auxiliary.cpp
+@@ -908,6 +908,61 @@ std::string rocblas_internal_get_arch_name()
+ return rocblas_internal_get_arch_name(deviceId);
+ }
+
++std::string rocblas_internal_get_generic_arch_name(int deviceId)
++{
++ std::string arch_name = rocblas_internal_get_arch_name(deviceId);
++ // Map specific architecture names to generic names
++ static const std::map<std::string, std::string> arch_map = {
++ {"gfx900", "gfx9-generic"},
++ {"gfx902", "gfx9-generic"},
++ {"gfx904", "gfx9-generic"},
++ {"gfx906", "gfx9-generic"},
++ {"gfx908", "gfx9-generic"},
++ {"gfx909", "gfx9-generic"},
++ {"gfx90a", "gfx9-generic"},
++ {"gfx940", "gfx9-4-generic"},
++ {"gfx941", "gfx9-4-generic"},
++ {"gfx942", "gfx9-4-generic"},
++ {"gfx1010", "gfx10-1-generic"},
++ {"gfx1011", "gfx10-1-generic"},
++ {"gfx1012", "gfx10-1-generic"},
++ {"gfx1013", "gfx10-1-generic"},
++ {"gfx1030", "gfx10-3-generic"},
++ {"gfx1031", "gfx10-3-generic"},
++ {"gfx1032", "gfx10-3-generic"},
++ {"gfx1033", "gfx10-3-generic"},
++ {"gfx1034", "gfx10-3-generic"},
++ {"gfx1035", "gfx10-3-generic"},
++ {"gfx1036", "gfx10-3-generic"},
++ {"gfx1100", "gfx11-generic"},
++ {"gfx1101", "gfx11-generic"},
++ {"gfx1102", "gfx11-generic"},
++ {"gfx1103", "gfx11-generic"},
++ {"gfx1150", "gfx11-generic"},
++ {"gfx1151", "gfx11-generic"},
++ {"gfx1152", "gfx11-generic"},
++ {"gfx1153", "gfx11-generic"},
++ {"gfx1200", "gfx12-generic"},
++ {"gfx1201", "gfx12-generic"},
++ {"gfx1250", "gfx12-generic"},
++ {"gfx1251", "gfx12-generic"}
++ };
++
++ auto it = arch_map.find(arch_name);
++ if(it != arch_map.end())
++ return it->second;
++
++ // Return original name if no mapping found
++ return arch_name;
++}
++
++std::string rocblas_internal_get_generic_arch_name()
++{
++ int deviceId;
++ PRINT_IF_HIP_ERROR(hipGetDevice(&deviceId));
++ return rocblas_internal_get_generic_arch_name(deviceId);
++}
++
+ // exported. Get xnack mode
+ std::string rocblas_internal_get_xnack_mode()
+ {
+--
+2.53.0
+
diff --git a/0006-rocblas-generalize-finding-tensile-for-generics.patch b/0006-rocblas-generalize-finding-tensile-for-generics.patch
new file mode 100644
index 0000000..265ae85
--- /dev/null
+++ b/0006-rocblas-generalize-finding-tensile-for-generics.patch
@@ -0,0 +1,130 @@
+From 9839138308b7bb8972c1e57b63614fe3fd164908 Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 29 Mar 2026 11:03:43 -0700
+Subject: [PATCH 6/6] rocblas generalize finding tensile for generics
+
+---
+ projects/rocblas/library/src/tensile_host.cpp | 76 +++++++++++--------
+ 1 file changed, 44 insertions(+), 32 deletions(-)
+
+diff --git a/projects/rocblas/library/src/tensile_host.cpp b/projects/rocblas/library/src/tensile_host.cpp
+index d36f686cfeff..d2e8c1bf5708 100644
+--- a/projects/rocblas/library/src/tensile_host.cpp
++++ b/projects/rocblas/library/src/tensile_host.cpp
+@@ -813,12 +813,21 @@ namespace
+ #endif
+
+ // The name of the current GPU platform
+- std::string processor = rocblas_internal_get_arch_name(deviceId);
++ std::string specific_processor = rocblas_internal_get_arch_name(deviceId);
++ std::string generic_processor = rocblas_internal_get_generic_arch_name(deviceId);
++ std::string processors[2] = {specific_processor, generic_processor};
++ std::string processor;
+
+ static std::string base_path;
+ static int determined_path{determine_tensile_base_path(base_path)};
+
+- path = base_path;
++ // Loop over processors to find a valid Tensile library
++ // Only call rocblas_abort on the final processor
++ for(int i = 0; i < 2; ++i)
++ {
++ processor = processors[i];
++
++ path = base_path;
+ // Probe subdirectories from most-specific to least-specific so that shard
+ // overlays compose correctly regardless of how TheRock splits arch builds:
+ // 1. library/<arch>-<xnack>/ – split single-xnack-variant shard
+@@ -836,56 +845,59 @@ namespace
+ }
+ }
+ if(!found_subdir && TestPath(path + "/" + processor))
+- path += "/" + processor;
++ path += "/" + processor;
+
+ #ifdef TENSILE_YAML
+- tensileLibraryPath = path + "/TensileLibrary_lazy_" + processor + ".yaml";
++ tensileLibraryPath = path + "/TensileLibrary_lazy_" + processor + ".yaml";
+ #else
+- tensileLibraryPath = path + "/TensileLibrary_lazy_" + processor + ".dat";
++ tensileLibraryPath = path + "/TensileLibrary_lazy_" + processor + ".dat";
+ #endif
+- if(!TestPath(tensileLibraryPath))
+- {
+- tensile_lazy_load_enabled = false;
++ if(TestPath(tensileLibraryPath)) {
++ tensile_lazy_load_enabled = true;
++ break;
++ }
+
+ #ifdef TENSILE_YAML
+ tensileLibraryPath = path + "/TensileLibrary_" + processor + ".yaml";
+ #else
+ tensileLibraryPath = path + "/TensileLibrary_" + processor + ".dat";
+ #endif
+- if(!TestPath(tensileLibraryPath))
+- {
++ if(TestPath(tensileLibraryPath))
++ break;
++
+ #ifdef TENSILE_YAML
+- tensileLibraryPath = path + "/TensileLibrary.yaml";
++ tensileLibraryPath = path + "/TensileLibrary.yaml";
+ #else
+- tensileLibraryPath = path + "/TensileLibrary.dat";
++ tensileLibraryPath = path + "/TensileLibrary.dat";
+ #endif
+- if(!TestPath(tensileLibraryPath))
+- {
++ if(TestPath(tensileLibraryPath))
++ break;
++
+ #if ROCBLAS_TENSILE_SEPARATE_ARCH
+- rocblas_cerr << "\nrocBLAS error: Cannot read " << tensileLibraryPath
+- << ": " << strerror(errno) << " for GPU arch : " << processor
+- << std::endl;
++ rocblas_cerr << "\nrocBLAS error: Cannot read " << tensileLibraryPath
++ << ": " << strerror(errno) << " for GPU arch : " << processor
++ << std::endl;
+ #if ROCBLAS_TENSILE_LAZY_LOAD
+- std::regex fileMatcher(path + "/TensileLibrary_lazy.*");
++ std::regex fileMatcher(path + "/TensileLibrary_lazy.*");
+ #else
+- std::regex fileMatcher(path + "/TensileLibrary_gfx\\d+.dat");
++ std::regex fileMatcher(path + "/TensileLibrary_gfx\\d+.dat");
+ #endif
+- rocblas_cerr << " List of available TensileLibrary Files : " << std::endl;
+- for(auto& file_name : fs::directory_iterator(path))
+- {
+- if(std::regex_match(file_name.path().string(), fileMatcher))
+- {
++ rocblas_cerr << " List of available TensileLibrary Files : " << std::endl;
++ for(auto& file_name : fs::directory_iterator(path))
++ {
++ if(std::regex_match(file_name.path().string(), fileMatcher))
++ {
+ rocblas_cerr << file_name << std::endl;
+- }
+- }
++ }
++ }
+ #else
+- rocblas_cerr << "\nrocBLAS error: Cannot read " << tensileLibraryPath
+- << ": " << strerror(errno) << std::endl;
++ rocblas_cerr << "\nrocBLAS error: Cannot read " << tensileLibraryPath
++ << ": " << strerror(errno) << std::endl;
+ #endif
+- rocblas_abort();
+- }
+- }
+- }
++
++ if (i == 1)
++ rocblas_abort();
++ }
+
+ // Supports multi architecture configuration
+ static int initialize_once = [&] {
+--
+2.53.0
+
diff --git a/rocblas.spec b/rocblas.spec
index 59235ac..85ff742 100644
--- a/rocblas.spec
+++ b/rocblas.spec
@@ -19,34 +19,43 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
-%bcond_with gitcommit
-%if %{with gitcommit}
-%global commit0 de5c1aebb641af098d9310a9fcca5591a7c066c8
-%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
-%global date0 20251015
+%global upstreamname rocblas
+
+%global pkg_library_name %{upstreamname}
+%global pkg_library_version 5
+
+%bcond_with preview
+%if %{with preview}
+%global rocm_release 7.13
+%global rocm_patch 0
+%global pkg_src therock-%{rocm_release}
+%else
+%global rocm_release 7.2
+%global rocm_patch 0
+%global pkg_src rocm-%{rocm_release}.%{rocm_patch}
%endif
-%global upstreamname rocblas
-%global rocm_release 7.1
-%global rocm_patch 1
%global rocm_version %{rocm_release}.%{rocm_patch}
%bcond_with compat
%if %{with compat}
%global pkg_libdir lib
%global pkg_prefix %{_prefix}/lib64/rocm/rocm-%{rocm_release}
-%global pkg_suffix -%{rocm_release}
+%global pkg_suffix %{rocm_release}
%global pkg_module rocm%{pkg_suffix}
+%global skip_install_rpath OFF
%else
%global pkg_libdir %{_lib}
%global pkg_prefix %{_prefix}
%global pkg_suffix %{nil}
%global pkg_module default
+%global skip_install_rpath ON
%endif
+
%if 0%{?suse_version}
-%global rocblas_name librocblas5%{pkg_suffix}
+%global pkg_name lib%{pkg_library_name}%{pkg_library_version}%{pkg_suffix}
%else
-%global rocblas_name rocblas%{pkg_suffix}
+%global pkg_name %{NAME}
%endif
%global toolchain rocm
@@ -67,11 +76,12 @@
%global build_compress OFF
%endif
-%if 0%{?fedora}
-%bcond_without test
-%else
+# Some parts of install are not legal, make test optional
+# Ex
+# rocblas-test.x86_64: E: script-without-shebang /usr/bin/rocblas_clients_readme.txt
+# rocblas-test.x86_64: E: script-without-shebang /usr/bin/rocblas_common.yaml
+# rocblas-test.x86_64: E: script-without-shebang /usr/bin/rocblas_extras.yaml
%bcond_with test
-%endif
%if %{with test}
%global build_test ON
%else
@@ -91,7 +101,7 @@
%endif
# Compression type and level for source/binary package payloads.
-# "w7T0.xzdio" xz level 7 using %%{getncpus} threads
+# "w7T0.xzdio" xz level 7 using %%{getncpus} threads
%global _source_payload w7T0.xzdio
%global _binary_payload w7T0.xzdio
@@ -134,10 +144,12 @@
-DCMAKE_CXX_COMPILER=%rocmllvm_bindir/amdclang++ \\\
-DCMAKE_INSTALL_LIBDIR=%{pkg_libdir} \\\
-DCMAKE_INSTALL_PREFIX=%{pkg_prefix} \\\
+ -DCMAKE_INSTALL_RPATH=%{pkg_prefix}/%{pkg_libdir} \\\
-DCMAKE_LINKER=%rocmllvm_bindir/ld.lld \\\
-DCMAKE_RANLIB=%rocmllvm_bindir/llvm-ranlib \\\
-DCMAKE_PREFIX_PATH=%{rocmllvm_cmakedir}/.. \\\
- -DCMAKE_SKIP_RPATH=ON \\\
+ -DCMAKE_SKIP_RPATH=%{skip_install_rpath} \\\
+ -DCMAKE_SKIP_INSTALL_RPATH=%{skip_install_rpath} \\\
-DCMAKE_VERBOSE_MAKEFILE=ON \\\
-DGPU_TARGETS=%{gpu_list} \\\
-DROCM_SYMLINK_LIBS=OFF \\\
@@ -151,52 +163,63 @@
%global gpu_list %{rocm_gpu_list_default}
%global _gpu_list gfx1100
-%if %{with compat}
%bcond_without bundled_tensile
-%else
-%if 0%{?suse_version}
-%bcond_without bundled_tensile
-%else
-%bcond_with bundled_tensile
-%endif
-%endif
Name: rocblas%{pkg_suffix}
Summary: BLAS implementation for ROCm
License: MIT AND BSD-3-Clause
+# Most of the files are MIT
+# Some files are MIT and BSD-3-Clause
+# library/src/blas2/gemv_device.hpp
+# library/src/blas2/rocblas_hemv_symv_kernels.cpp
+# library/src/blas2/rocblas_trsv_kernels.cpp
+# library/src/blas3/rocblas_trmm_kernels.cpp
URL: https://github.com/ROCm/rocm-libraries
-
-%if %{with gitcommit}
-Version: git%{date0}.%{shortcommit0}
-Release: 2%{?dist}
-Source0: %{url}/archive/%{commit0}/rocm-libraries-%{shortcommit0}.tar.gz
-%else
Version: %{rocm_version}
-Release: 6%{?dist}
-Source0: %{url}/releases/download/rocm-%{version}/%{upstreamname}.tar.gz#/%{upstreamname}-%{version}.tar.gz
+%if %{with preview}
+Release: 0%{?dist}
+%else
+Release: 8%{?dist}
%endif
-Patch1: 0001-fixup-install-of-tensile-output.patch
+Source0: %{url}/releases/download/%{pkg_src}/%{upstreamname}.tar.gz#/%{upstreamname}-%{version}.tar.gz
+Source1: %{url}/releases/download/%{pkg_src}/tensile.tar.gz#/tensile-%{version}.tar.gz
-# Bundled tensile
-Source1: https://github.com/ROCmSoftwarePlatform/Tensile/archive/rocm-%{version}.tar.gz#/Tensile-%{version}.tar.gz
+%if %{with preview}
+Patch1: 0001-improve-the-warning-for-asm-caps-mismatches.patch
+Patch2: 0002-add-generic-gpu-targets.patch
+Patch3: 0003-improve-fallback-name-to-handle-generics.patch
+Patch4: 0004-generic-arches-need-a-solution-index.patch
+Patch5: 0005-rocblas-add-rocblas_internal_get_generic_arch_name.patch
+Patch6: 0006-rocblas-generalize-finding-tensile-for-generics.patch
+%else
+# Fix tensile output install path to use CMAKE_INSTALL_LIBDIR
+Patch1: 0001-fixup-install-of-tensile-output.patch
+# Add support for Fedora-specific GPU architectures (gfx1035, gfx1150-1152)
Patch101: 0001-tensile-fedora-gpus.patch
+# Add support for gfx1153 GPU architecture in Tensile
Patch102: 0001-tensile-gfx1153.patch
+# Update default ROCm and LLVM binary paths to /usr and /usr/lib64/rocm/llvm
Patch103: 0001-tensile-set-default-paths.patch
+# Force Tensile to ignore assembly capability cache checks
Patch104: 0001-tensile-ignore-cache-check.patch
+# Add gfx1152 and gfx1153 to Tensile's supported CMake architectures
Patch105: 0001-tensile-add-cmake-arches.patch
+# Add support for gfx1036 GPU architecture in Tensile
Patch106: 0001-tensile-gfx1036.patch
+%endif
+BuildRequires: chrpath
BuildRequires: cmake
BuildRequires: gcc-c++
BuildRequires: rocminfo%{pkg_suffix}
BuildRequires: rocm-cmake%{pkg_suffix}
BuildRequires: rocm-comgr%{pkg_suffix}-devel
BuildRequires: rocm-compilersupport%{pkg_suffix}-macros
+BuildRequires: rocm-filesystem%{pkg_suffix}
BuildRequires: rocm-hip%{pkg_suffix}-devel
BuildRequires: rocm-runtime%{pkg_suffix}-devel
BuildRequires: rocm-rpm-macros%{pkg_suffix}
-BuildRequires: rocm-rpm-macros%{pkg_suffix}-modules
%if %{with tensile}
%if 0%{?suse_version}
@@ -211,7 +234,7 @@ BuildRequires: rocm-rpm-macros%{pkg_suffix}-modules
%global tensile_verbose 1
%global tensile_library_format msgpack
# suse_version
-%endif
+%endif
%else
%global tensile_verbose %{nil}
%global tensile_library_format %{nil}
@@ -235,6 +258,8 @@ BuildRequires: msgpack-cxx-devel
%else
BuildRequires: python3-devel
BuildRequires: python3dist(setuptools)
+BuildRequires: python3dist(pip)
+BuildRequires: python3dist(wheel)
BuildRequires: msgpack-devel
%if 0%{?fedora} || 0%{?rhel} > 9
BuildRequires: python3dist(joblib)
@@ -267,7 +292,9 @@ BuildRequires: pkgconfig(libzstd)
%endif
%if %{with test}
-
+%if %{with preview}
+BuildRequires: amdsmi%{pkg_suffix}-devel
+%endif
BuildRequires: libomp-devel
BuildRequires: rocminfo%{pkg_suffix}
BuildRequires: rocm-smi%{pkg_suffix}-devel
@@ -303,6 +330,8 @@ BuildRequires: ninja
%endif
Provides: rocblas%{pkg_suffix} = %{version}-%{release}
+Requires: rocm-filesystem%{pkg_suffix}
+Requires: rocm-hip%{pkg_suffix}
# Only x86_64 works right now:
ExclusiveArch: x86_64
@@ -313,18 +342,19 @@ rocBLAS is the AMD library for Basic Linear Algebra Subprograms
programming language and optimized for AMD GPUs.
%if 0%{?suse_version}
-%package -n %{rocblas_name}
+%package -n %{pkg_name}
Summary: Shared libraries for %{name}
-%description -n %{rocblas_name}
+%description -n %{pkg_name}
%{summary}
-%ldconfig_scriptlets -n %{rocblas_name}
+%ldconfig_scriptlets -n %{pkg_name}
%endif
%package devel
Summary: Libraries and headers for %{name}
-Requires: %{rocblas_name}%{?_isa} = %{version}-%{release}
+Requires: %{pkg_name}%{?_isa} = %{version}-%{release}
+Requires: rocm-filesystem%{pkg_suffix}
%if %{without compat}
Requires: cmake(hip)
%endif
@@ -335,7 +365,7 @@ Requires: cmake(hip)
%if %{with test}
%package test
Summary: Tests for %{name}
-Requires: %{rocblas_name}%{?_isa} = %{version}-%{release}
+Requires: %{pkg_name}%{?_isa} = %{version}-%{release}
Requires: diffutils
%description test
@@ -343,24 +373,29 @@ Requires: diffutils
%endif
%prep
-%if %{with gitcommit}
-%setup -q -n rocm-libraries-%{commit0}
-cd projects/rocblas
-%patch -P1 -p1
-%else
%setup -q -n %{upstreamname}
+%if %{with preview}
+%patch -P5 -p3
+%patch -P6 -p3
+%else
%patch -P1 -p1
%endif
tar xf %{SOURCE1}
-mv Tensile-* Tensile
-cd Tensile
+cd tensile
+%if %{with preview}
+%patch -P1 -p3
+%patch -P2 -p3
+%patch -P3 -p3
+%patch -P4 -p3
+%else
%patch -P101 -p1
%patch -P102 -p1
%patch -P103 -p1
%patch -P104 -p1
%patch -P105 -p1
%patch -P106 -p1
+%endif
#Fix a few things:
chmod 755 Tensile/Configs/miopen/convert_cfg.py
@@ -389,10 +424,16 @@ sed -i -e 's@if not ignoreCacheCheck and derivedAsmCaps@if False and derivedAsmC
sed -i -e '/joblib/d' requirements.*
sed -i -e '/rich/d' requirements.*
sed -i -e '/msgpack/d' requirements.*
+sed -i -e '/pyyaml/d' requirements.*
# Generalize prefix
+%if %{with preview}
+sed -i -e 's@DEFAULT_ROCM_BIN_PATH_POSIX = Path("/opt/rocm/bin")@DEFAULT_ROCM_BIN_PATH_POSIX = Path("%{pkg_prefix}/bin")@' Tensile/Utilities/Toolchain.py
+sed -i -e 's@DEFAULT_ROCM_LLVM_BIN_PATH_POSIX = Path("/opt/rocm/lib/llvm/bin")@DEFAULT_ROCM_LLVM_BIN_PATH_POSIX = Path("%{rocmllvm_bindir}")@' Tensile/Utilities/Toolchain.py
+%else
sed -i -e 's@/usr/bin@%{pkg_prefix}/bin@' Tensile/Utilities/Toolchain.py
sed -i -e 's@/usr/lib64/rocm/llvm/bin@%{rocmllvm_bindir}@' Tensile/Utilities/Toolchain.py
+%endif
# Make sure hip/hip_runtime.h is found
sed -i -e 's@"-D__HIP_HCC_COMPAT_MODE__=1"@"-D__HIP_HCC_COMPAT_MODE__=1","-I%{pkg_prefix}/include"@' Tensile/BuildCommands/SourceCommands.py
@@ -422,20 +463,24 @@ sed -i -e 's@list( APPEND COMMON_LINK_LIBS "-lgfortran")@#list( APPEND COMMON_LI
%if %{with tensile}
%if %{with bundled_tensile}
-cd Tensile
+cd tensile
+
+%if 0%{?suse_version}
TL=$PWD
python3 setup.py install --root $TL
TP=${TL}/usr/lib/python%{python3_version}/site-packages/Tensile/
+%else
+TL=$PWD/install
+# pip install --no-index --find-links /usr/lib/python%{python3_version}/site-packages --target $TL .
+/usr/bin/python3 -m pip install -vvv --no-build-isolation --no-index --find-links /usr/lib/python%{python3_version}/site-packages --find-links /usr/lib64/python%{python3_version}/site-packages --target $TL .
+TP=${TL}/Tensile/
+%endif
cd ..
%else
TP=`/usr/bin/TensileGetPath`
%endif
%endif
-%if %{with gitcommit}
-cd projects/rocblas
-%endif
-
CORES=`lscpu | grep 'Core(s)' | awk '{ print $4 }'`
if [ ${CORES}x = x ]; then
CORES=1
@@ -459,14 +504,23 @@ export HIPCC_LINK_FLAGS_APPEND=-fuse-ld=lld
%cmake_build
%install
-%if %{with gitcommit}
-cd projects/rocblas
-%endif
-
%cmake_install
+# Extra license
rm -f %{buildroot}%{pkg_prefix}/share/doc/rocblas/LICENSE.md
+# rocblas.x86_64: W: unstripped-binary-or-object /usr/lib64/rocblas/library/Kernels.so-000-gfx1010.hsaco
+# The stripping below silences rpmlint but is reported to cause runtime problems
+# So do not strip
+# %{rocmllvm_bindir}/llvm-strip %{buildroot}%{pkg_prefix}/%{pkg_libdir}/rocblas/library/*.hsaco
+
+%if %{with compat}
+# ERROR 0008: file '/usr/lib64/rocm/rocm-7.2/lib/librocblas.so.5.2'
+# contains the $ORIGIN runpath specifier at the wrong position in
+# [/usr/lib64/rocm/rocm-7.2/lib:$ORIGIN/../lib:$ORIGIN/../lib/rocblas/lib]
+chrpath -r %{pkg_prefix}/%{pkg_libdir} %{buildroot}%{pkg_prefix}/%{pkg_libdir}/lib%{pkg_library_name}.so.%{pkg_library_version}.*
+%endif
+
%check
%if %{with test}
%if %{with check}
@@ -480,15 +534,10 @@ export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH
%endif
%endif
-%files -n %{rocblas_name}
-%if %{with gitcommit}
-%license projects/rocblas/LICENSE.md
-%doc projects/rocblas/README.md
-%else
+%files -n %{pkg_name}
%license LICENSE.md
%doc README.md
-%endif
-%{pkg_prefix}/%{pkg_libdir}/librocblas.so.5{,.*}
+%{pkg_prefix}/%{pkg_libdir}/lib%{pkg_library_name}.so.%{pkg_library_version}{,.*}
%if %{with tensile}
%{pkg_prefix}/%{pkg_libdir}/rocblas/
%endif
@@ -496,7 +545,7 @@ export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH
%files devel
%{pkg_prefix}/include/rocblas/
%{pkg_prefix}/%{pkg_libdir}/cmake/rocblas/
-%{pkg_prefix}/%{pkg_libdir}/librocblas.so
+%{pkg_prefix}/%{pkg_libdir}/lib%{pkg_library_name}.so
%if %{with test}
%files test
@@ -504,6 +553,37 @@ export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH
%endif
%changelog
+* Sat Jun 6 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-8
+- merge compat changes
+
+* Thu May 28 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-7
+- Explicitly license smoke tests 0BSD
+- Smoke test not part of srpm so remove from license tag
+
+* Sat Apr 18 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-6
+- Generate suse package names
+
+* Tue Mar 17 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-5
+- Install tensile with pip
+
+* Wed Mar 11 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-4
+- Do not strip hsaco files
+
+* Sat Mar 7 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-3
+- Change --with gitcommit to preview
+- Use rocm-libraries for tensile source
+
+* Sun Feb 15 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-2
+- strip hsaco files
+- make test optional
+
+* Wed Feb 11 2026 Tom Rix <Tom.Rix@amd.com> - 7.2.0-1
+- Update to 7.2.0
+- Add smoke test
+
+* Sat Jan 17 2026 Fedora Release Engineering <releng@fedoraproject.org> - 7.1.1-7
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_44_Mass_Rebuild
+
* Fri Jan 2 2026 Tom Rix <Tom.Rix@amd.com> - 7.1.1-6
- Fix SUSE
@@ -566,7 +646,7 @@ export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH
* Wed Aug 13 2025 Egbert Eich <eich@suse.com> - 6.4.2-5
- Fix build and runtime dependencies of test package.
-
+
* Tue Aug 12 2025 Tom Rix <Tom.Rix@amd.com> - 6.4.2-5
- remove roctracer
- Use distro appropriate blas libs
@@ -652,5 +732,3 @@ export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH
* Sun Nov 10 2024 Tom Rix <Tom.Rix@amd.com> - 6.2.1-1
- Stub for tumbleweed
-
-
diff --git a/sources b/sources
index 22af380..7c48b91 100644
--- a/sources
+++ b/sources
@@ -1,2 +1,3 @@
-SHA512 (rocblas-7.1.1.tar.gz) = ea31432ff5350175c1e9d1a7aaaa3e92b6a9925313067e68a22b3ec671906733774e06350d1bc9dafee67d97394b365102d3280e01e80612f9b429850e14db52
-SHA512 (Tensile-7.1.1.tar.gz) = 05ad08c0f80abf9458332a4708f9c4d7ecc694a892d4578faec8c1d88ec78e5aab7bdaf7506802e62b81dd69b0203bb652b14a9e10de2e675dd2aa45ee92448b
+SHA512 (Tensile-7.2.0.tar.gz) = fc1946aa1c3ebddbdab02f6966d7ed08d937e17518d192b31a54d2084972188d8c71b8d1c58f0fd5d8455cc9a3e11414f1f7dbbfd284e0c90538264b9af2c4d0
+SHA512 (rocblas-7.2.0.tar.gz) = 5301a8822c4d3b9ea4223ebe001a80522605d0b2634d11e824043026fe8b148c424c4ffaa4402133dcb28857363c273aa56caa3533b91b0b6147e0289350ca1f
+SHA512 (tensile-7.2.0.tar.gz) = 8b17ee9fc2c0998242928ee923d82f7125d551940af71afc3bcfee90b02e654f9715e84f2caf2dd720e0904e670930b7a9e014b929ebeae04608ba7a128532dd
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..72b2323
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,72 @@
+// BSD Zero Clause License
+
+#include <rocblas/rocblas.h>
+#include <hip/hip_runtime.h>
+#include <vector>
+#include <random>
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+
+int main()
+{
+ size_t n = 128;
+ size_t size = n * n;
+
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ std::uniform_real_distribution<float> dist(-1.0, 1.0);
+ auto myrand = [&](){return dist(gen);};
+
+ float *x;
+ float *y;
+ float *z;
+ hipMalloc((void**)&x, sizeof *x * size);
+ hipMalloc((void**)&y, sizeof *y * size);
+ hipMalloc((void**)&z, sizeof *z * size);
+
+ std::vector<float> xin(size);
+ std::vector<float> yin(size);
+
+ std::generate(xin.begin(), xin.end(), myrand);
+ std::generate(yin.begin(), yin.end(), myrand);
+
+ hipMemcpy(x, xin.data(), sizeof *x * size, hipMemcpyHostToDevice);
+ hipMemcpy(y, yin.data(), sizeof *x * size, hipMemcpyHostToDevice);
+
+ rocblas_handle handle;
+ rocblas_create_handle(&handle);
+
+ float alpha = 15.412f;
+ float beta = 0.0f;
+ rocblas_sgemm(handle, rocblas_operation_none, rocblas_operation_none,
+ n, n, n, &alpha, x, n, y, n, &beta, z, n);
+
+ std::vector<float> zout(size);
+ hipMemcpy(zout.data(), z, sizeof *z * size, hipMemcpyDeviceToHost);
+
+ for(size_t j = 0; j < n; j++){
+ for(size_t i = 0; i < n; i++){
+ for(size_t k = 0; k < n; k++){
+ zout[i + j * n] -= alpha * xin[i + k * n] * yin[k + j * n];
+ }
+ }
+ }
+
+ float tol = 0.001f;
+ for(size_t i = 0; i < size; i++){
+ if(std::abs(zout[i]) > tol){
+ std::cout << "Element mismatch at index " << i << "\n";
+ std::cout << "Expected: 0\n";
+ std::cout << "Actual : " << zout[i] << "\n";
+ return 1;
+ }
+ }
+
+ std::cout << "TESTS PASSED!" << std::endl;
+
+ hipFree(x);
+ hipFree(y);
+ hipFree(z);
+ rocblas_destroy_handle(handle);
+}
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..fe0fced
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,10 @@
+#! /usr/bin/env sh
+# BSD Zero Clause License
+
+BPATH=/usr/bin
+IPATH=/usr/include
+LPATH=/usr/lib64
+
+OUT=$(mktemp -d)
+${BPATH}/hipcc -o "$OUT"/test test.cpp -I${IPATH} -L${LPATH} -lrocblas -lamdhip64
+"$OUT"/test
diff --git a/rocblas.spec b/rocblas.spec
index 03cdf30..85ff742 100644
--- a/rocblas.spec
+++ b/rocblas.spec
@@ -664,6 +664,9 @@ export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH
* Tue Jul 22 2025 Jeremy Newton <alexjnewt at hotmail dot com> - 6.4.2-1
- Update to 6.4.2
+* Thu Jun 19 2025 Tom Rix <Tom.Rix@amd.com> - 6.4.0-9
+- Turn off tensile for first rhel build
+
* Wed Jun 11 2025 Tom Rix <Tom.Rix@amd.com> - 6.4.0-8
- Remove suse check for using ldconfig
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2026-06-11 14:33 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-11 14:33 [rpms/rocblas] epel10: Merge branch 'rawhide' into epel10 Tom Rix
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox