public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed
* [rpms/firefox] rawhide: add ppc64le JIT
@ 2026-06-16 13:10
0 siblings, 0 replies; only message in thread
From: @ 2026-06-16 13:10 UTC (permalink / raw)
To: git-commits
A new commit has been pushed.
Repo : rpms/firefox
Branch : rawhide
Commit : e99f0d4925ac596ad75f2ae084620d36c44a85c2
Author : Dan Horák <dan@danny.cz>
Date : 2026-06-16T13:10:34+00:00
Stats : +42086/-0 in 4 file(s)
URL : https://src.fedoraproject.org/rpms/firefox/c/e99f0d4925ac596ad75f2ae084620d36c44a85c2?branch=rawhide
Log:
add ppc64le JIT
---
diff --git a/0001-Add-VSX-instructions-for-SKIA.patch b/0001-Add-VSX-instructions-for-SKIA.patch
new file mode 100644
index 0000000..ac3a0d8
--- /dev/null
+++ b/0001-Add-VSX-instructions-for-SKIA.patch
@@ -0,0 +1,1347 @@
+From a47c991dbbfb709134737a54e8bbe7e0b1bce800 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
+Date: Fri, 12 Jun 2026 15:23:10 +1000
+Subject: [PATCH 1/3] Add VSX instructions for SKIA
+
+Adapted from work done by Raptor Engineering for chromium's vendored
+SKIA
+
+Co-authored-by: Timothy Pearson <tpearson@raptorengineering.com>
+---
+ gfx/skia/skia/src/base/SkVx.h | 58 +++-
+ gfx/skia/skia/src/core/SkBlitRow_D32.cpp | 98 ++++++
+ gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp | 268 ++++++++++++++++
+ .../skia/src/opts/SkBitmapProcState_opts.h | 164 ++++++++++
+ gfx/skia/skia/src/opts/SkBlitRow_opts.h | 48 +++
+ .../skia/src/opts/SkRasterPipeline_opts.h | 237 ++++++++++++++
+ gfx/skia/skia/src/opts/SkSwizzler_opts.inc | 289 ++++++++++++++++++
+ 7 files changed, 1160 insertions(+), 2 deletions(-)
+
+diff --git a/gfx/skia/skia/src/base/SkVx.h b/gfx/skia/skia/src/base/SkVx.h
+index f87ca44d4af0..ed80c91fd38e 100644
+--- a/gfx/skia/skia/src/base/SkVx.h
++++ b/gfx/skia/skia/src/base/SkVx.h
+@@ -52,6 +52,8 @@
+ #include <arm_neon.h>
+ #elif defined(__wasm_simd128__)
+ #include <wasm_simd128.h>
++ #elif defined(SK_CPU_PPC) && defined(__VSX__)
++ #include <altivec.h>
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+ #include <lasxintrin.h>
+ #include <lsxintrin.h>
+@@ -509,6 +511,14 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec
+ sk_bit_cast<uint8x16_t>(e)));
+ }
+ #endif
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++ if constexpr (N*sizeof(T) == 16) {
++ return sk_bit_cast<Vec<N,T>>(
++ vec_sel(sk_bit_cast<__vector unsigned char>(e),
++ sk_bit_cast<__vector unsigned char>(t),
++ sk_bit_cast<__vector unsigned char>(cond)));
++ }
++#endif
+ #if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+ if constexpr (N*sizeof(T) == 32) {
+ return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
+@@ -579,6 +589,11 @@ SINT bool any(const Vec<N,T>& x) {
+ sk_bit_cast<__m128i>(x)));
+ return retv[0] != 0b0000;
+ }
++#endif
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++ if constexpr (N*sizeof(T) == 16) {
++ return vec_any_ne(sk_bit_cast<__vector unsigned int>(x), vec_splats(0u));
++ }
+ #endif
+ return any(x.lo)
+ || any(x.hi);
+@@ -622,6 +637,11 @@ SINT bool all(const Vec<N,T>& x) {
+ sk_bit_cast<__m128i>(x)));
+ return retv[0] == 0b1111;
+ }
++#endif
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++ if constexpr (N*sizeof(T) == 16) {
++ return vec_all_ne(sk_bit_cast<__vector unsigned int>(x), vec_splats(0u));
++ }
+ #endif
+ return all(x.lo)
+ && all(x.hi);
+@@ -647,8 +667,22 @@ SIT T max(const Vec<1,T>& x) { return x.val; }
+ SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
+ SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }
+
+-SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
+-SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }
++SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) {
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++ if constexpr (N*sizeof(T) == 16) {
++ return sk_bit_cast<Vec<N,T>>(vec_min(to_vext(x), to_vext(y)));
++ }
++#endif
++ return naive_if_then_else(y < x, y, x);
++}
++SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) {
++#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++ if constexpr (N*sizeof(T) == 16) {
++ return sk_bit_cast<Vec<N,T>>(vec_max(to_vext(x), to_vext(y)));
++ }
++#endif
++ return naive_if_then_else(x < y, y, x);
++}
+
+ SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
+ SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
+@@ -960,6 +994,26 @@ SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,
+ } else { // N > 8
+ return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
+ }
++#elif SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
++ if constexpr (N == 8) {
++ // u16*u16 -> u32 even/odd products (vmuleuh/vmulouh), then gather the
++ // high 16 bits of each back into sequential lanes. Same idiom as the
++ // VSX scale() in SkSwizzler_opts.
++ __vector unsigned short xs = sk_bit_cast<__vector unsigned short>(x);
++ __vector unsigned short ys = sk_bit_cast<__vector unsigned short>(y);
++ __vector unsigned int even = vec_vmuleuh(xs, ys);
++ __vector unsigned int odd = vec_vmulouh(xs, ys);
++ const __vector unsigned char hi = {
++ 0x02,0x03, 0x12,0x13, 0x06,0x07, 0x16,0x17,
++ 0x0A,0x0B, 0x1A,0x1B, 0x0E,0x0F, 0x1E,0x1F
++ };
++ return sk_bit_cast<Vec<8,uint16_t>>(
++ vec_perm((__vector unsigned char)even, (__vector unsigned char)odd, hi));
++ } else if constexpr (N < 8) {
++ return mulhi(join(x,x), join(y,y)).lo;
++ } else { // N > 8
++ return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
++ }
+ #else
+ return skvx::cast<uint16_t>(mull(x, y) >> 16);
+ #endif
+diff --git a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
+index bcbf2e66bd46..920d6a9b2366 100644
+--- a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
++++ b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
+@@ -517,6 +517,104 @@ static void blit_row_s32_opaque(SkPMColor* dst,
+ }
+ }
+
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++ #include <altivec.h>
++
++ // dst + (((src - dst) * src_scale) >> 8), splayed into 16-bit lanes; the
++ // vec_* transcription of SkPMLerp_SSE2.
++ static inline __vector unsigned char SkPMLerp_VSX(__vector unsigned char src,
++ __vector unsigned char dst,
++ unsigned src_scale) {
++ const __vector unsigned int mask = vec_splats(0x00FF00FFu);
++ const __vector unsigned short eight = vec_splats((unsigned short)8);
++ __vector unsigned short src_rb = (__vector unsigned short)vec_and((__vector unsigned int)src, mask);
++ __vector unsigned short src_ag = vec_sr((__vector unsigned short)src, eight);
++ __vector unsigned short dst_rb = (__vector unsigned short)vec_and((__vector unsigned int)dst, mask);
++ __vector unsigned short dst_ag = vec_sr((__vector unsigned short)dst, eight);
++ __vector unsigned short s = vec_splats((unsigned short)src_scale);
++ __vector unsigned short diff_rb = vec_mul(vec_sub(src_rb, dst_rb), s);
++ __vector unsigned short diff_ag = vec_mul(vec_sub(src_ag, dst_ag), s);
++ diff_rb = vec_sr(diff_rb, eight);
++ __vector unsigned int diff = vec_or((__vector unsigned int)diff_rb,
++ vec_andc((__vector unsigned int)diff_ag, mask));
++ return vec_add(dst, (__vector unsigned char)diff);
++ }
++
++ static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
++ SkASSERT(alpha <= 255);
++ unsigned src_scale = SkAlpha255To256(alpha);
++ while (count >= 4) {
++ __vector unsigned char s = vec_xl(0, (const unsigned char*)src);
++ __vector unsigned char d = vec_xl(0, (const unsigned char*)dst);
++ vec_xst(SkPMLerp_VSX(s, d, src_scale), 0, (unsigned char*)dst);
++ src += 4; dst += 4; count -= 4;
++ }
++ while (count --> 0) {
++ *dst = SkPMLerp(*src, *dst, src_scale);
++ src++;
++ dst++;
++ }
++ }
++
++ // The vec_* transcription of SkBlendARGB32_SSE2: scale src by aa and dst by
++ // SkAlphaMulInv256(srcA, aa), then add the splayed halves.
++ static inline __vector unsigned char SkBlendARGB32_VSX(__vector unsigned char src,
++ __vector unsigned char dst,
++ unsigned aa) {
++ unsigned alpha = SkAlpha255To256(aa);
++ __vector unsigned short src_scale = vec_splats((unsigned short)alpha);
++ const __vector unsigned int mask = vec_splats(0x00FF00FFu);
++ const __vector unsigned short eight = vec_splats((unsigned short)8);
++
++ // dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), alpha), per 32-bit lane.
++ __vector unsigned int srcA = vec_sr((__vector unsigned int)src, vec_splats(24u));
++ __vector unsigned int ds = (__vector unsigned int)vec_mul((__vector unsigned short)srcA, src_scale);
++ ds = vec_sub(vec_splats((unsigned int)0xFFFF), ds);
++ ds = vec_add(ds, vec_sr(ds, vec_splats(8u)));
++ ds = vec_sr(ds, vec_splats(8u));
++ // Duplicate the low 16-bit word of each 32-bit lane into both halves
++ // (the SSE shufflelo/shufflehi _MM_SHUFFLE(2,2,0,0)).
++ const __vector unsigned char dup = (__vector unsigned char){
++ 0,1,0,1, 4,5,4,5, 8,9,8,9, 12,13,12,13
++ };
++ __vector unsigned short dst_scale =
++ (__vector unsigned short)vec_perm((__vector unsigned char)ds,
++ (__vector unsigned char)ds, dup);
++
++ __vector unsigned short src_rb = (__vector unsigned short)vec_and((__vector unsigned int)src, mask);
++ __vector unsigned short src_ag = vec_sr((__vector unsigned short)src, eight);
++ __vector unsigned short dst_rb = (__vector unsigned short)vec_and((__vector unsigned int)dst, mask);
++ __vector unsigned short dst_ag = vec_sr((__vector unsigned short)dst, eight);
++
++ src_rb = vec_mul(src_rb, src_scale);
++ src_ag = vec_mul(src_ag, src_scale);
++ dst_rb = vec_mul(dst_rb, dst_scale);
++ dst_ag = vec_mul(dst_ag, dst_scale);
++
++ dst_rb = vec_add(src_rb, dst_rb);
++ dst_ag = vec_add(src_ag, dst_ag);
++
++ dst_rb = vec_sr(dst_rb, eight);
++ __vector unsigned int out = vec_or((__vector unsigned int)dst_rb,
++ vec_andc((__vector unsigned int)dst_ag, mask));
++ return (__vector unsigned char)out;
++ }
++
++ static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
++ SkASSERT(alpha <= 255);
++ while (count >= 4) {
++ __vector unsigned char s = vec_xl(0, (const unsigned char*)src);
++ __vector unsigned char d = vec_xl(0, (const unsigned char*)dst);
++ vec_xst(SkBlendARGB32_VSX(s, d, alpha), 0, (unsigned char*)dst);
++ src += 4; dst += 4; count -= 4;
++ }
++ while (count --> 0) {
++ *dst = SkBlendARGB32(*src, *dst, alpha);
++ src++;
++ dst++;
++ }
++ }
++
+ #else
+ static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
+ SkASSERT(alpha <= 255);
+diff --git a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
+index a7538027b85d..9669431292b6 100644
+--- a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
++++ b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
+@@ -480,6 +480,274 @@ static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
+ }
+ }
+
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++ #include <altivec.h>
++
++ // Native VSX/AltiVec port of the SSE2 LCD-subpixel blend block below.
++ // Same algorithm — only the intrinsics change. Translations follow the
++ // GCC ppc_wrappers pattern (vec_mergeh/l, vec_packsu, etc.).
++
++ // The following (left) shifts cause the top 5 bits of the mask components to
++ // line up with the corresponding components in an SkPMColor.
++ // Note that the mask's RGB16 order may differ from the SkPMColor order.
++ #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
++ #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
++ #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
++
++ // Each macro must always return __vector unsigned int so the surrounding
++ // vec_and gets matching element types. The pass-through case (SHIFT == 0)
++ // still needs an explicit reinterpret-cast since `mask` is __vector
++ // unsigned char in our function signature.
++ #if SK_R16x5_R32x5_SHIFT == 0
++ #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) ((__vector unsigned int)(x))
++ #elif SK_R16x5_R32x5_SHIFT > 0
++ #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) \
++ vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_R16x5_R32x5_SHIFT))
++ #else
++ #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) \
++ vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_R16x5_R32x5_SHIFT)))
++ #endif
++
++ #if SK_G16x5_G32x5_SHIFT == 0
++ #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) ((__vector unsigned int)(x))
++ #elif SK_G16x5_G32x5_SHIFT > 0
++ #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) \
++ vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_G16x5_G32x5_SHIFT))
++ #else
++ #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) \
++ vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_G16x5_G32x5_SHIFT)))
++ #endif
++
++ #if SK_B16x5_B32x5_SHIFT == 0
++ #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) ((__vector unsigned int)(x))
++ #elif SK_B16x5_B32x5_SHIFT > 0
++ #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) \
++ vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_B16x5_B32x5_SHIFT))
++ #else
++ #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) \
++ vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_B16x5_B32x5_SHIFT)))
++ #endif
++
++ static __vector unsigned char blend_lcd16_vsx(__vector unsigned char& src,
++ __vector unsigned char& dst,
++ __vector unsigned char& mask,
++ __vector unsigned char& srcA) {
++ // Get the R,G,B of each 16bit mask pixel, all aligned to 5-bit positions.
++ __vector unsigned int r = vec_and(SkPackedR16x5ToUnmaskedR32x5_VSX(mask),
++ vec_splats((unsigned int)(0x1F << SK_R32_SHIFT)));
++ __vector unsigned int g = vec_and(SkPackedG16x5ToUnmaskedG32x5_VSX(mask),
++ vec_splats((unsigned int)(0x1F << SK_G32_SHIFT)));
++ __vector unsigned int b = vec_and(SkPackedB16x5ToUnmaskedB32x5_VSX(mask),
++ vec_splats((unsigned int)(0x1F << SK_B32_SHIFT)));
++
++ // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA.
++ __vector unsigned int rA = vec_sl(r, vec_splats((unsigned int)(SK_A32_SHIFT - SK_R32_SHIFT)));
++ __vector unsigned int gA = vec_sl(g, vec_splats((unsigned int)(SK_A32_SHIFT - SK_G32_SHIFT)));
++ __vector unsigned int bA = vec_sl(b, vec_splats((unsigned int)(SK_A32_SHIFT - SK_B32_SHIFT)));
++ __vector unsigned char aMin = vec_min(vec_min((__vector unsigned char)rA,
++ (__vector unsigned char)gA),
++ (__vector unsigned char)bA);
++ __vector unsigned char aMax = vec_max(vec_max((__vector unsigned char)rA,
++ (__vector unsigned char)gA),
++ (__vector unsigned char)bA);
++ // srcA has been biased to [0-256]; compare srcA against (dstA+1).
++ __vector unsigned int dstA = vec_and(vec_add((__vector unsigned int)dst,
++ vec_splats((unsigned int)(1 << SK_A32_SHIFT))),
++ vec_splats((unsigned int)SK_A32_MASK));
++ __vector __bool int aLT = vec_cmplt((__vector signed int)srcA, (__vector signed int)dstA);
++ // a = (aMin & aLT) | (aMax & ~aLT)
++ __vector unsigned char a = vec_or(vec_and(aMin, (__vector unsigned char)aLT),
++ vec_andc(aMax, (__vector unsigned char)aLT));
++
++ // Pack the 4 16-bit mask pixels into 4 32-bit pixels (m0A, m0R, m0G, m0B, ...).
++ mask = vec_or(vec_or(a, (__vector unsigned char)r),
++ vec_or((__vector unsigned char)g, (__vector unsigned char)b));
++
++ // Interleave into 16-bit words.
++ const __vector unsigned char zeros = vec_splats((unsigned char)0);
++ __vector unsigned short maskLo = (__vector unsigned short)vec_mergeh(mask, zeros);
++ __vector unsigned short maskHi = (__vector unsigned short)vec_mergel(mask, zeros);
++
++ // Upscale 0..31 -> 0..32 by adding (mask >> 4).
++ const __vector unsigned short v4 = vec_splats((unsigned short)4);
++ const __vector unsigned short v8 = vec_splats((unsigned short)8);
++ const __vector unsigned short v5 = vec_splats((unsigned short)5);
++ maskLo = vec_add(maskLo, vec_sr(maskLo, v4));
++ maskHi = vec_add(maskHi, vec_sr(maskHi, v4));
++
++ // Multiply by srcA per 16-bit lane.
++ maskLo = vec_mul(maskLo, (__vector unsigned short)srcA);
++ maskHi = vec_mul(maskHi, (__vector unsigned short)srcA);
++ // Divide by 256 (right-shift 8).
++ maskLo = vec_sr(maskLo, v8);
++ maskHi = vec_sr(maskHi, v8);
++
++ // Unpack dst into 16-bit words.
++ __vector signed short dstLo = (__vector signed short)vec_mergeh(dst, zeros);
++ __vector signed short dstHi = (__vector signed short)vec_mergel(dst, zeros);
++ // mask = (src - dst) * mask
++ __vector signed short srcS = (__vector signed short)src;
++ __vector signed short mLoS = vec_mul((__vector signed short)maskLo, vec_sub(srcS, dstLo));
++ __vector signed short mHiS = vec_mul((__vector signed short)maskHi, vec_sub(srcS, dstHi));
++ // arithmetic shift right by 5
++ mLoS = vec_sra(mLoS, (__vector unsigned short)v5);
++ mHiS = vec_sra(mHiS, (__vector unsigned short)v5);
++ // result = dst + ((src - dst) * mask >> 5)
++ __vector signed short resLo = vec_add(dstLo, mLoS);
++ __vector signed short resHi = vec_add(dstHi, mHiS);
++ // Pack 16-bit signed -> 8-bit unsigned with saturation.
++ return vec_packsu(resLo, resHi);
++ }
++
++ static __vector unsigned char blend_lcd16_opaque_vsx(__vector unsigned char& src,
++ __vector unsigned char& dst,
++ __vector unsigned char& mask) {
++ __vector unsigned int r = vec_and(SkPackedR16x5ToUnmaskedR32x5_VSX(mask),
++ vec_splats((unsigned int)(0x1F << SK_R32_SHIFT)));
++ __vector unsigned int g = vec_and(SkPackedG16x5ToUnmaskedG32x5_VSX(mask),
++ vec_splats((unsigned int)(0x1F << SK_G32_SHIFT)));
++ __vector unsigned int b = vec_and(SkPackedB16x5ToUnmaskedB32x5_VSX(mask),
++ vec_splats((unsigned int)(0x1F << SK_B32_SHIFT)));
++
++ // Opaque src: a = max(r, g, b) shifted to alpha lane.
++ __vector unsigned int rA = vec_sl(r, vec_splats((unsigned int)(SK_A32_SHIFT - SK_R32_SHIFT)));
++ __vector unsigned int gA = vec_sl(g, vec_splats((unsigned int)(SK_A32_SHIFT - SK_G32_SHIFT)));
++ __vector unsigned int bA = vec_sl(b, vec_splats((unsigned int)(SK_A32_SHIFT - SK_B32_SHIFT)));
++ __vector unsigned char a = vec_max(vec_max((__vector unsigned char)rA,
++ (__vector unsigned char)gA),
++ (__vector unsigned char)bA);
++
++ mask = vec_or(vec_or(a, (__vector unsigned char)r),
++ vec_or((__vector unsigned char)g, (__vector unsigned char)b));
++
++ const __vector unsigned char zeros = vec_splats((unsigned char)0);
++ __vector unsigned short maskLo = (__vector unsigned short)vec_mergeh(mask, zeros);
++ __vector unsigned short maskHi = (__vector unsigned short)vec_mergel(mask, zeros);
++
++ const __vector unsigned short v4 = vec_splats((unsigned short)4);
++ const __vector unsigned short v5 = vec_splats((unsigned short)5);
++ maskLo = vec_add(maskLo, vec_sr(maskLo, v4));
++ maskHi = vec_add(maskHi, vec_sr(maskHi, v4));
++
++ __vector signed short dstLo = (__vector signed short)vec_mergeh(dst, zeros);
++ __vector signed short dstHi = (__vector signed short)vec_mergel(dst, zeros);
++ __vector signed short srcS = (__vector signed short)src;
++ __vector signed short mLoS = vec_mul((__vector signed short)maskLo, vec_sub(srcS, dstLo));
++ __vector signed short mHiS = vec_mul((__vector signed short)maskHi, vec_sub(srcS, dstHi));
++ mLoS = vec_sra(mLoS, (__vector unsigned short)v5);
++ mHiS = vec_sra(mHiS, (__vector unsigned short)v5);
++ __vector signed short resLo = vec_add(dstLo, mLoS);
++ __vector signed short resHi = vec_add(dstHi, mHiS);
++ return vec_packsu(resLo, resHi);
++ }
++
++ void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src,
++ int width, SkPMColor) {
++ if (width <= 0) {
++ return;
++ }
++ int srcA = SkColorGetA(src);
++ int srcR = SkColorGetR(src);
++ int srcG = SkColorGetG(src);
++ int srcB = SkColorGetB(src);
++ srcA = SkAlpha255To256(srcA);
++
++ if (width >= 4) {
++ SkASSERT(SkIsAlign4((uintptr_t) dst));
++ while (!SkIsAlign16((uintptr_t) dst)) {
++ *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
++ mask++; dst++; width--;
++ }
++
++ // Replicate source across 4 lanes, then unpack low half to interleaved 16-bit.
++ uint32_t srcPM = SkPackARGB32(0xFF, srcR, srcG, srcB);
++ __vector unsigned int src_v32 = vec_splats(srcPM);
++ const __vector unsigned char zeros = vec_splats((unsigned char)0);
++ __vector unsigned char src_v = vec_mergeh((__vector unsigned char)src_v32, zeros);
++ __vector unsigned char srcA_v = (__vector unsigned char)vec_splats((unsigned short)srcA);
++
++ while (width >= 4) {
++ __vector unsigned char dst_v = vec_xl(0, (const unsigned char*)dst);
++ // Load 8 bytes (4x uint16 mask) into low half of vector.
++ uint64_t mlo;
++ memcpy(&mlo, mask, sizeof(mlo));
++ __vector unsigned long long mask_low =
++ (__vector unsigned long long){mlo, 0};
++ __vector unsigned char mask_v = (__vector unsigned char)mask_low;
++
++ // Check if all mask values are zero (skip blending if so).
++ if (!vec_all_eq((__vector unsigned long long)mask_v,
++ vec_splats((unsigned long long)0))) {
++ // Unpack low 8 bytes of mask (4x uint16) into 4x uint32 (with zeros).
++ // Zero-extend the 4 uint16 masks to 4 uint32 (16-bit-granularity
++ // merge, matching SSE2's _mm_unpacklo_epi16); a char-granularity
++ // merge would byte-stretch the RGB565 value and misplace the shifts.
++ mask_v = (__vector unsigned char)vec_mergeh((__vector unsigned short)mask_v,
++ (__vector unsigned short)zeros);
++ __vector unsigned char result =
++ blend_lcd16_vsx(src_v, dst_v, mask_v, srcA_v);
++ vec_xst(result, 0, (unsigned char*)dst);
++ }
++ dst += 4; mask += 4; width -= 4;
++ }
++ }
++
++ while (width > 0) {
++ *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
++ mask++; dst++; width--;
++ }
++ }
++
++ void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
++ SkColor src, int width, SkPMColor opaqueDst) {
++ if (width <= 0) {
++ return;
++ }
++ int srcR = SkColorGetR(src);
++ int srcG = SkColorGetG(src);
++ int srcB = SkColorGetB(src);
++
++ if (width >= 4) {
++ SkASSERT(SkIsAlign4((uintptr_t) dst));
++ while (!SkIsAlign16((uintptr_t) dst)) {
++ *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
++ mask++; dst++; width--;
++ }
++
++ uint32_t srcPM = SkPackARGB32(0xFF, srcR, srcG, srcB);
++ __vector unsigned int src_v32 = vec_splats(srcPM);
++ const __vector unsigned char zeros = vec_splats((unsigned char)0);
++ __vector unsigned char src_v = vec_mergeh((__vector unsigned char)src_v32, zeros);
++
++ while (width >= 4) {
++ __vector unsigned char dst_v = vec_xl(0, (const unsigned char*)dst);
++ uint64_t mlo;
++ memcpy(&mlo, mask, sizeof(mlo));
++ __vector unsigned long long mask_low =
++ (__vector unsigned long long){mlo, 0};
++ __vector unsigned char mask_v = (__vector unsigned char)mask_low;
++
++ if (!vec_all_eq((__vector unsigned long long)mask_v,
++ vec_splats((unsigned long long)0))) {
++ // Zero-extend the 4 uint16 masks to 4 uint32 (16-bit-granularity
++ // merge, matching SSE2's _mm_unpacklo_epi16); a char-granularity
++ // merge would byte-stretch the RGB565 value and misplace the shifts.
++ mask_v = (__vector unsigned char)vec_mergeh((__vector unsigned short)mask_v,
++ (__vector unsigned short)zeros);
++ __vector unsigned char result =
++ blend_lcd16_opaque_vsx(src_v, dst_v, mask_v);
++ vec_xst(result, 0, (unsigned char*)dst);
++ }
++ dst += 4; mask += 4; width -= 4;
++ }
++ }
++
++ while (width > 0) {
++ *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
++ mask++; dst++; width--;
++ }
++ }
++
+ #elif defined(SK_ARM_HAS_NEON)
+ #include <arm_neon.h>
+
+diff --git a/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h b/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
+index 6d01a2f4458f..87b160ed7a1e 100644
+--- a/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
++++ b/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
+@@ -29,6 +29,8 @@
+ #include <lasxintrin.h>
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
+ #include <lsxintrin.h>
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++ #include <altivec.h>
+ #endif
+
+ namespace SK_OPTS_NS {
+@@ -260,6 +262,168 @@ static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, O
+ }
+ }
+
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++
++ // Helper: scalar uint32_t -> 16-byte vector with x in low 32 bits, zero elsewhere.
++ // Equivalent of x86's _mm_cvtsi32_si128.
++ static inline __vector unsigned char vsx_cvt_u32_to_vec(uint32_t x) {
++ __vector unsigned int v = (__vector unsigned int){x, 0, 0, 0};
++ return (__vector unsigned char)v;
++ }
++
++ // Helper: PPC64 VSX equivalent of x86's _mm_maddubs_epi16. Multiplies pairs of
++ // (unsigned byte, signed byte) and adds adjacent pairs to produce 16-bit signed
++ // values, saturating to int16. Implementation transcribes the GCC ppc_wrappers
++ // tmmintrin.h sequence for endianness correctness on LE PPC64.
++ static inline __vector signed short vsx_maddubs_epi16(__vector unsigned char A,
++ __vector signed char B) {
++ __vector signed short __ff = vec_splats((signed short)0x00FF);
++ __vector signed short __C = vec_and(vec_unpackh((__vector signed char)A), __ff);
++ __vector signed short __D = vec_and(vec_unpackl((__vector signed char)A), __ff);
++ __vector signed short __E = vec_unpackh(B);
++ __vector signed short __F = vec_unpackl(B);
++ __C = vec_mul(__C, __E);
++ __D = vec_mul(__D, __F);
++ const __vector unsigned char __odds = (__vector unsigned char){
++ 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
++ };
++ const __vector unsigned char __evens = (__vector unsigned char){
++ 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
++ };
++ __E = (__vector signed short)vec_perm((__vector unsigned char)__C,
++ (__vector unsigned char)__D, __odds);
++ __F = (__vector signed short)vec_perm((__vector unsigned char)__C,
++ (__vector unsigned char)__D, __evens);
++ return vec_adds(__E, __F);
++ }
++
++ /*not static*/ inline
++ void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
++ const uint32_t* xy, int count, uint32_t* colors) {
++ SkASSERT(count > 0 && colors != nullptr);
++ SkASSERT(s.fBilerp);
++ SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
++ SkASSERT(s.fAlphaScale <= 256);
++
++ // interpolate_in_x() is the crux of the implementation, interpolating in X
++ // for up to two output pixels (A and B) using vsx_maddubs_epi16().
++ auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
++ uint32_t B0, uint32_t B1,
++ __vector signed char interlaced_x_weights) {
++ // _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1))
++ // = vec_mergeh on uchar, since the input vectors have only the low 32 bits set.
++ __vector unsigned char interlaced_A = vec_mergeh(vsx_cvt_u32_to_vec(A0),
++ vsx_cvt_u32_to_vec(A1));
++ __vector unsigned char interlaced_B = vec_mergeh(vsx_cvt_u32_to_vec(B0),
++ vsx_cvt_u32_to_vec(B1));
++ // _mm_unpacklo_epi64 = vec_mergeh on long long.
++ __vector long long lo64 = vec_mergeh((__vector long long)interlaced_A,
++ (__vector long long)interlaced_B);
++ return vsx_maddubs_epi16((__vector unsigned char)lo64, interlaced_x_weights);
++ };
++
++ // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
++ // Returns two pixels, with each color channel in a 16-bit lane of the result.
++ auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
++ uint32_t A2, uint32_t A3,
++ uint32_t B0, uint32_t B1,
++ uint32_t B2, uint32_t B3,
++ __vector signed char interlaced_x_weights,
++ int wy) {
++ __vector signed short top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights);
++ __vector signed short bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
++
++ // 16*top + (bot-top)*wy, mirroring the SSE2 form (saves one multiply vs. the
++ // straightforward top*(16-wy) + bot*wy).
++ __vector unsigned short v4 = vec_splats((unsigned short)4);
++ __vector signed short wy_v = vec_splats((signed short)wy);
++ __vector signed short px = vec_add(vec_sl(top, v4), vec_mul(vec_sub(bot, top), wy_v));
++
++ // Scale down by total max weight 16x16 = 256.
++ px = (__vector signed short)vec_sr((__vector unsigned short)px, vec_splats((unsigned short)8));
++
++ // Scale by alpha if needed.
++ if (s.fAlphaScale < 256) {
++ __vector signed short scale_v = vec_splats((signed short)s.fAlphaScale);
++ px = (__vector signed short)vec_sr((__vector unsigned short)vec_mul(px, scale_v),
++ vec_splats((unsigned short)8));
++ }
++ return px;
++ };
++
++ // We're in _DX mode here, so we're only varying in X.
++ // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
++ int y0, y1, wy;
++ decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
++
++ auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
++ row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
++
++ while (count >= 4) {
++ // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
++ int x0[4],
++ x1[4];
++ __vector unsigned int wx;
++
++ // decode_packed_coordinates_and_weight(), 4x.
++ __vector unsigned int packed = (__vector unsigned int)vec_xl(0, (const unsigned char*)xy);
++ __vector unsigned int x0_v = vec_sr(packed, vec_splats(18u));
++ __vector unsigned int x1_v = vec_and(packed, vec_splats(0x3fffu));
++ vec_xst((__vector unsigned char)x0_v, 0, (unsigned char*)x0);
++ vec_xst((__vector unsigned char)x1_v, 0, (unsigned char*)x1);
++ wx = vec_and(vec_sr(packed, vec_splats(14u)), vec_splats(0xfu)); // [0,15]
++
++ // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
++ // and sixteen minus that as wl for pixels on the left at x0.
++ const __vector unsigned char wr_mask = (__vector unsigned char){
++ 0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12
++ };
++ __vector unsigned char wr = vec_perm((__vector unsigned char)wx,
++ (__vector unsigned char)wx, wr_mask);
++ __vector unsigned char wl = vec_sub(vec_splats((unsigned char)16), wr);
++
++ // Interlace wl and wr for vsx_maddubs_epi16().
++ __vector signed char interlaced_x_weights_AB = (__vector signed char)vec_mergeh(wl, wr);
++ __vector signed char interlaced_x_weights_CD = (__vector signed char)vec_mergel(wl, wr);
++
++ enum { A,B,C,D };
++
++ __vector signed short AB = interpolate_in_x_and_y(
++ row0[x0[A]], row0[x1[A]], row1[x0[A]], row1[x1[A]],
++ row0[x0[B]], row0[x1[B]], row1[x0[B]], row1[x1[B]],
++ interlaced_x_weights_AB, wy);
++ __vector signed short CD = interpolate_in_x_and_y(
++ row0[x0[C]], row0[x1[C]], row1[x0[C]], row1[x1[C]],
++ row0[x0[D]], row0[x1[D]], row1[x0[D]], row1[x1[D]],
++ interlaced_x_weights_CD, wy);
++
++ // Pack 16-bit signed -> 8-bit unsigned with saturation, write 4 pixels.
++ __vector unsigned char packed_out = vec_packsu(AB, CD);
++ vec_xst(packed_out, 0, (unsigned char*)colors);
++ xy += 4;
++ colors += 4;
++ count -= 4;
++ }
++
++ while (count --> 0) {
++ // Same flow as the count >= 4 loop, but writing one pixel.
++ int x0, x1, wx;
++ decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
++
++ __vector unsigned char wr = vec_splats((unsigned char)wx);
++ __vector unsigned char wl = vec_sub(vec_splats((unsigned char)16), wr);
++ __vector signed char interlaced_x_weights = (__vector signed char)vec_mergeh(wl, wr);
++
++ __vector signed short Av = interpolate_in_x_and_y(
++ row0[x0], row0[x1], row1[x0], row1[x1],
++ 0, 0, 0, 0,
++ interlaced_x_weights, wy);
++ __vector unsigned char packed_out = vec_packsu(Av,
++ (__vector signed short)(__vector unsigned char){0});
++ *colors++ = ((__vector unsigned int)packed_out)[0];
++ }
++ }
++
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+ /*not static*/ inline
+ void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
+diff --git a/gfx/skia/skia/src/opts/SkBlitRow_opts.h b/gfx/skia/skia/src/opts/SkBlitRow_opts.h
+index d1de5681a72e..d03908a03a32 100644
+--- a/gfx/skia/skia/src/opts/SkBlitRow_opts.h
++++ b/gfx/skia/skia/src/opts/SkBlitRow_opts.h
+@@ -68,6 +68,43 @@
+ }
+ #endif
+
++#if defined(SK_CPU_PPC) && defined(__VSX__)
++ #include <altivec.h>
++
++ // Native VSX/AltiVec port of SkPMSrcOver_SSE2.
++ // Same algorithm: src + dst*(256-srcAlpha)/256.
++ static inline __vector unsigned char SkPMSrcOver_VSX(__vector unsigned char src,
++ __vector unsigned char dst) {
++ __vector unsigned int src_u32 = (__vector unsigned int)src;
++ __vector unsigned int dst_u32 = (__vector unsigned int)dst;
++
++ // scale = 256 - (src >> 24) (per 32-bit lane)
++ __vector unsigned int scale = vec_sub(vec_splats((unsigned int)256),
++ vec_sr(src_u32, vec_splats(24u)));
++ // scale_x2 = (scale << 16) | scale -- splat the scale into both 16-bit halves
++ __vector unsigned int scale_x2 = vec_or(vec_sl(scale, vec_splats(16u)), scale);
++
++ const __vector unsigned int rb_mask = vec_splats(0x00FF00FFu);
++
++ // rb = (dst & 0x00FF00FF) * scale_x2 >> 8 (R and B channels in 16-bit lanes)
++ __vector unsigned short rb = (__vector unsigned short)vec_and(rb_mask, dst_u32);
++ rb = vec_mul(rb, (__vector unsigned short)scale_x2);
++ rb = vec_sr(rb, vec_splats((unsigned short)8));
++
++ // ga = (dst >> 8) * scale_x2 then mask out the rb channels
++ __vector unsigned short ga = vec_sr((__vector unsigned short)dst_u32,
++ vec_splats((unsigned short)8));
++ ga = vec_mul(ga, (__vector unsigned short)scale_x2);
++ // andc(ga, rb_mask) = ga & ~rb_mask -- keep only G and A channels in 16-bit lanes
++ __vector unsigned int ga_u32 = vec_andc((__vector unsigned int)ga, rb_mask);
++
++ // result = src + adds_epu8(rb | ga)
++ __vector unsigned char merged =
++ (__vector unsigned char)vec_or((__vector unsigned int)rb, ga_u32);
++ return vec_adds(src, merged);
++ }
++#endif
++
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ #include <immintrin.h>
+
+@@ -176,6 +213,17 @@ inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len,
+ }
+ #endif
+
++#if defined(SK_CPU_PPC) && defined(__VSX__)
++ while (len >= 4) {
++ __vector unsigned char vsrc = vec_xl(0, (const unsigned char*)src);
++ __vector unsigned char vdst = vec_xl(0, (const unsigned char*)dst);
++ vec_xst(SkPMSrcOver_VSX(vsrc, vdst), 0, (unsigned char*)dst);
++ src += 4;
++ dst += 4;
++ len -= 4;
++ }
++#endif
++
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ while (len >= 4) {
+ _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
+diff --git a/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h b/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
+index 695b71434f8c..e2af0b94f392 100644
+--- a/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
++++ b/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
+@@ -87,6 +87,8 @@ using NoCtx = const void*;
+ #define SKRP_CPU_SCALAR
+ #elif defined(SK_ARM_HAS_NEON)
+ #define SKRP_CPU_NEON
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++ #define SKRP_CPU_VSX
+ #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
+ #define SKRP_CPU_SKX
+ #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
+@@ -109,6 +111,8 @@ using NoCtx = const void*;
+ #include <math.h>
+ #elif defined(SKRP_CPU_NEON)
+ #include <arm_neon.h>
++#elif defined(SKRP_CPU_VSX)
++ #include <altivec.h>
+ #elif defined(SKRP_CPU_LASX)
+ #include <lasxintrin.h>
+ #include <lsxintrin.h>
+@@ -337,6 +341,239 @@ namespace SK_OPTS_NS {
+ vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
+ }
+
++#elif defined(SKRP_CPU_VSX)
++ // Reuse the file-scope Vec<N,T> defined above. It already handles the
++ // GCC-vs-Clang divergence (ext_vector_type on Clang; vector_size via
++ // VecHelper on GCC) and produces the right vector-register-passing ABI
++ // on PPC64. The vec_* intrinsics in <altivec.h> accept either form.
++ template <typename T> using V = Vec<4, T>;
++ using F = V<float >;
++ using I32 = V< int32_t>;
++ using U64 = V<uint64_t>;
++ using U32 = V<uint32_t>;
++ using U16 = V<uint16_t>;
++ using U8 = V<uint8_t >;
++
++ // We polyfill a few routines that Clang doesn't build into ext_vector_types.
++ SI F min(F a, F b) { return vec_min(a,b); }
++ SI I32 min(I32 a, I32 b) { return vec_min(a,b); }
++ SI U32 min(U32 a, U32 b) { return vec_min(a,b); }
++ SI F max(F a, F b) { return vec_max(a,b); }
++ SI I32 max(I32 a, I32 b) { return vec_max(a,b); }
++ SI U32 max(U32 a, U32 b) { return vec_max(a,b); }
++
++ SI F abs_ (F v) { return vec_abs(v); }
++ SI I32 abs_ (I32 v) { return vec_abs(v); }
++ SI F rcp_approx(F v) { return vec_re(v); }
++ SI F rcp_precise (F v) { F e = rcp_approx(v); return e * (2.0f - v * e); }
++ SI F rsqrt_approx (F v) { return vec_rsqrte(v); }
++
++ SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
++ SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
++
++ SI F if_then_else(I32 c, F t, F e) {
++ return vec_or((__vector float)vec_and((__vector float)c, (__vector float)t), (__vector float)vec_andc((__vector float)e, (__vector float)c));
++ }
++ SI I32 if_then_else(I32 c, I32 t, I32 e) {
++ return (I32)vec_or((__vector unsigned int)vec_and((__vector unsigned int)c, (__vector unsigned int)t), (__vector unsigned int)vec_andc((__vector unsigned int)e, (__vector unsigned int)c));
++ }
++
++ // In both AltiVec and SSE there is no horizontal element compare, unlike ARM. Fall back to scalar operations here...
++ SI bool any(I32 c) {
++ if (vec_extract((U32)c, 0) != 0) return 1;
++ if (vec_extract((U32)c, 1) != 0) return 1;
++ if (vec_extract((U32)c, 2) != 0) return 1;
++ if (vec_extract((U32)c, 3) != 0) return 1;
++ return 0;
++ }
++ SI bool all(I32 c) {
++ if (vec_extract((U32)c, 0) == 0) return 0;
++ if (vec_extract((U32)c, 1) == 0) return 0;
++ if (vec_extract((U32)c, 2) == 0) return 0;
++ if (vec_extract((U32)c, 3) == 0) return 0;
++ return 1;
++ }
++
++ SI F mad(F f, F m, F a) { return vec_madd(f,m,a); }
++ SI F nmad(F f, F m, F a) { return vec_nmsub(f,m,a); }
++ SI F floor_(F v) { return vec_floor(v); }
++ SI F ceil_(F v) { return vec_ceil(v); }
++ SI F sqrt_(F v) { return vec_sqrt(v); }
++ SI I32 iround(F v) { return vec_cts((__vector float)vec_rint(v), 0); }
++ SI U32 round(F v) { return vec_ctu((__vector float)vec_rint(v), 0); }
++ SI U32 round(F v, F scale) { return (U32)vec_cts((__vector float)vec_rint(v*scale), 0); }
++
++ template <typename T>
++ SI V<T> gather(const T* p, U32 ix) {
++ return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
++ }
++ template <typename T>
++ SI V<T> gather_unaligned(const T* ptr, U32 ix) {
++ // This tells the compiler ptr might not be aligned appropriately, so
++ // it generates better assembly.
++ typedef T __attribute__ ((aligned (1))) unaligned_ptr;
++ const unaligned_ptr* uptr = static_cast<const unaligned_ptr*>(ptr);
++ return V<T>{uptr[ix[0]], uptr[ix[1]], uptr[ix[2]], uptr[ix[3]]};
++ }
++ template <typename V, typename S>
++ SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
++ V before = gather(dst, ix);
++ V after = if_then_else(mask, src, before);
++ dst[ix[0]] = after[0];
++ dst[ix[1]] = after[1];
++ dst[ix[2]] = after[2];
++ dst[ix[3]] = after[3];
++ }
++
++ // Native VSX/AltiVec ports of the load2/store2/load3/load4/store4 helpers.
++ // Each uses vec_xl/vec_xst for unaligned 16-byte loads/stores, vec_mergeh/
++ // vec_mergel for SSE-style epi16/epi32/ps unpack ops, and vec_perm with a
++ // byte-mask for the SSE shufflelo/shufflehi/shuffle/srli_si128 ops. The
++ // PPC64 LE register-to-memory byte order matches x86 LE, so the byte-mask
++ // patterns are identical to the corresponding _mm_setr_epi8 forms.
++
++ SI void load2(const uint16_t* ptr, U16* r, U16* g) {
++ // Load 8 uint16: r0 g0 r1 g1 r2 g2 r3 g3 (in LE memory order).
++ __vector unsigned char v = vec_xl(0, (const unsigned char*)ptr);
++ // Extract every-other 16-bit value via vec_perm (high half of result is unused
++ // but written; sk_unaligned_load below picks up the low 8 bytes).
++ const __vector unsigned char r_mask = (__vector unsigned char){
++ 0,1, 4,5, 8,9, 12,13, 0,0,0,0,0,0,0,0
++ };
++ const __vector unsigned char g_mask = (__vector unsigned char){
++ 2,3, 6,7, 10,11, 14,15, 0,0,0,0,0,0,0,0
++ };
++ __vector unsigned char R_v = vec_perm(v, v, r_mask);
++ __vector unsigned char G_v = vec_perm(v, v, g_mask);
++ *r = sk_unaligned_load<U16>(&R_v);
++ *g = sk_unaligned_load<U16>(&G_v);
++ }
++
++ SI void store2(uint16_t* ptr, U16 r, U16 g) {
++ // Interleave: rg = r0 g0 r1 g1 r2 g2 r3 g3.
++ // r and g are 8-byte vectors; widen to 16 and vec_mergeh on ushort takes
++ // the low 4 lanes of each.
++ __vector unsigned short rw = widen_cast<__vector unsigned short>(r);
++ __vector unsigned short gw = widen_cast<__vector unsigned short>(g);
++ __vector unsigned short rg = vec_mergeh(rw, gw);
++ vec_xst((__vector unsigned char)rg, 0, (unsigned char*)ptr);
++ }
++
++ SI void load3(const uint16_t* ptr, U16* r, U16* g, U16* b) {
++ // 4 pixels x 3 channels x 2 bytes = 24 bytes. Two 16-byte loads with overlap
++ // avoid reading past the 24-byte source.
++ __vector unsigned char v01 = vec_xl(0, (const unsigned char*)(ptr + 0));
++ __vector unsigned char v23_raw = vec_xl(0, (const unsigned char*)(ptr + 4));
++ const __vector unsigned char zero = vec_splats((unsigned char)0);
++ // v23 = v23_raw >> 4 bytes (drops the overlapping pixel-1 trailing R).
++ const __vector unsigned char shift4 = (__vector unsigned char){
++ 4,5,6,7, 8,9,10,11, 12,13,14,15, 16,16,16,16
++ };
++ __vector unsigned char v23 = vec_perm(v23_raw, zero, shift4);
++ // _N holds R,G,B for pixel N in its lower 3 lanes. shift6 advances to the next pixel.
++ const __vector unsigned char shift6 = (__vector unsigned char){
++ 6,7,8,9, 10,11,12,13, 14,15, 16,16, 16,16, 16,16
++ };
++ __vector unsigned char _0 = v01;
++ __vector unsigned char _1 = vec_perm(v01, zero, shift6);
++ __vector unsigned char _2 = v23;
++ __vector unsigned char _3 = vec_perm(v23, zero, shift6);
++ // De-interlace to R,G,B per the SSE flow.
++ __vector unsigned short _02 = vec_mergeh((__vector unsigned short)_0,
++ (__vector unsigned short)_2);
++ __vector unsigned short _13 = vec_mergeh((__vector unsigned short)_1,
++ (__vector unsigned short)_3);
++ __vector unsigned short R_v = vec_mergeh(_02, _13);
++ const __vector unsigned char shift8 = (__vector unsigned char){
++ 8,9,10,11, 12,13,14,15, 16,16,16,16, 16,16,16,16
++ };
++ __vector unsigned char G_v = vec_perm((__vector unsigned char)R_v, zero, shift8);
++ __vector unsigned short B_v = vec_mergel(_02, _13);
++ *r = sk_unaligned_load<U16>(&R_v);
++ *g = sk_unaligned_load<U16>(&G_v);
++ *b = sk_unaligned_load<U16>(&B_v);
++ }
++
++ SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
++ __vector unsigned short v01 = (__vector unsigned short)
++ vec_xl(0, (const unsigned char*)ptr); // r0 g0 b0 a0 r1 g1 b1 a1
++ __vector unsigned short v23 = (__vector unsigned short)
++ vec_xl(0, (const unsigned char*)(ptr + 8)); // r2 g2 b2 a2 r3 g3 b3 a3
++ __vector unsigned short _02 = vec_mergeh(v01, v23); // r0 r2 g0 g2 b0 b2 a0 a2
++ __vector unsigned short _13 = vec_mergel(v01, v23); // r1 r3 g1 g3 b1 b3 a1 a3
++ __vector unsigned short rg = vec_mergeh(_02, _13); // r0 r1 r2 r3 g0 g1 g2 g3
++ __vector unsigned short ba = vec_mergel(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
++ *r = sk_unaligned_load<U16>((const uint16_t*)&rg + 0);
++ *g = sk_unaligned_load<U16>((const uint16_t*)&rg + 4);
++ *b = sk_unaligned_load<U16>((const uint16_t*)&ba + 0);
++ *a = sk_unaligned_load<U16>((const uint16_t*)&ba + 4);
++ }
++
++ SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
++ __vector unsigned short rw = widen_cast<__vector unsigned short>(r);
++ __vector unsigned short gw = widen_cast<__vector unsigned short>(g);
++ __vector unsigned short bw = widen_cast<__vector unsigned short>(b);
++ __vector unsigned short aw = widen_cast<__vector unsigned short>(a);
++ __vector unsigned short rg = vec_mergeh(rw, gw); // r0 g0 r1 g1 r2 g2 r3 g3
++ __vector unsigned short ba = vec_mergeh(bw, aw); // b0 a0 b1 a1 b2 a2 b3 a3
++ // Now interleave 32-bit lanes (each rg pair = 1 lane, each ba pair = 1 lane).
++ __vector unsigned int rgba_lo = vec_mergeh((__vector unsigned int)rg,
++ (__vector unsigned int)ba);
++ __vector unsigned int rgba_hi = vec_mergel((__vector unsigned int)rg,
++ (__vector unsigned int)ba);
++ vec_xst((__vector unsigned char)rgba_lo, 0, (unsigned char*)ptr);
++ vec_xst((__vector unsigned char)rgba_hi, 0, (unsigned char*)(ptr + 8));
++ }
++
++ SI void load2(const float* ptr, F* r, F* g) {
++ __vector float _01 = vec_xl(0, ptr); // r0 g0 r1 g1
++ __vector float _23 = vec_xl(0, ptr + 4); // r2 g2 r3 g3
++ // r = lanes {_01[0], _01[2], _23[0], _23[2]}; g = {_01[1], _01[3], _23[1], _23[3]}.
++ const __vector unsigned char r_mask = (__vector unsigned char){
++ 0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27
++ };
++ const __vector unsigned char g_mask = (__vector unsigned char){
++ 4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31
++ };
++ *r = (F)vec_perm((__vector unsigned char)_01, (__vector unsigned char)_23, r_mask);
++ *g = (F)vec_perm((__vector unsigned char)_01, (__vector unsigned char)_23, g_mask);
++ }
++
++ SI void store2(float* ptr, F r, F g) {
++ __vector float _01 = vec_mergeh((__vector float)r, (__vector float)g); // r0 g0 r1 g1
++ __vector float _23 = vec_mergel((__vector float)r, (__vector float)g); // r2 g2 r3 g3
++ vec_xst((__vector unsigned char)_01, 0, (unsigned char*)ptr);
++ vec_xst((__vector unsigned char)_23, 0, (unsigned char*)(ptr + 4));
++ }
++
++ SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
++ // 4x4 float matrix transpose: rows -> columns.
++ __vector float row0 = vec_xl(0, ptr + 0);
++ __vector float row1 = vec_xl(0, ptr + 4);
++ __vector float row2 = vec_xl(0, ptr + 8);
++ __vector float row3 = vec_xl(0, ptr + 12);
++ __vector float T0 = vec_mergeh(row0, row2); // {row0[0], row2[0], row0[1], row2[1]}
++ __vector float T1 = vec_mergeh(row1, row3);
++ __vector float T2 = vec_mergel(row0, row2);
++ __vector float T3 = vec_mergel(row1, row3);
++ *r = (F)vec_mergeh(T0, T1); // {row0[0], row1[0], row2[0], row3[0]}
++ *g = (F)vec_mergel(T0, T1);
++ *b = (F)vec_mergeh(T2, T3);
++ *a = (F)vec_mergel(T2, T3);
++ }
++
++ SI void store4(float* ptr, F r, F g, F b, F a) {
++ // 4x4 float matrix transpose, then store rows.
++ __vector float T0 = vec_mergeh((__vector float)r, (__vector float)b);
++ __vector float T1 = vec_mergeh((__vector float)g, (__vector float)a);
++ __vector float T2 = vec_mergel((__vector float)r, (__vector float)b);
++ __vector float T3 = vec_mergel((__vector float)g, (__vector float)a);
++ vec_xst((__vector unsigned char)vec_mergeh(T0, T1), 0, (unsigned char*)(ptr + 0));
++ vec_xst((__vector unsigned char)vec_mergel(T0, T1), 0, (unsigned char*)(ptr + 4));
++ vec_xst((__vector unsigned char)vec_mergeh(T2, T3), 0, (unsigned char*)(ptr + 8));
++ vec_xst((__vector unsigned char)vec_mergel(T2, T3), 0, (unsigned char*)(ptr + 12));
++ }
++
+ #elif defined(SKRP_CPU_SKX)
+ template <typename T> using V = Vec<16, T>;
+ using F = V<float >;
+diff --git a/gfx/skia/skia/src/opts/SkSwizzler_opts.inc b/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
+index 671db3f05f61..c578238a9e58 100644
+--- a/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
++++ b/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
+@@ -84,6 +84,29 @@ SI float reciprocal_alpha(float a) {
+ auto q = F4{1.0f} / vA;
+ return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
+ }
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++// -- VSX -- Harden against timing attacks.
++// vec_splats / vec_div / vec_cmpgt / vec_and each map to a single VSX op on
++// both GCC and Clang. vec_cmpgt(vA, 0) is exact for the non-negative-alpha
++// contract (0 <= a) and avoids Clang's static_cast<float>(vector) extension
++// that GCC does not support.
++SK_NO_SANITIZE("float-divide-by-zero")
++SI float reciprocal_alpha_times_255(float a) {
++ SkASSERT(0 <= a && a <= 255);
++ __vector float vA = vec_splats(a);
++ __vector float q = vec_div(vec_splats(255.0f), vA);
++ __vector float vMask = (__vector float)vec_cmpgt(vA, vec_splats(0.0f));
++ return vec_and(vMask, q)[0];
++}
++
++SK_NO_SANITIZE("float-divide-by-zero")
++SI float reciprocal_alpha(float a) {
++ SkASSERT(0 <= a && a <= 1);
++ __vector float vA = vec_splats(a);
++ __vector float q = vec_div(vec_splats(1.0f), vA);
++ __vector float vMask = (__vector float)vec_cmpgt(vA, vec_splats(0.0f));
++ return vec_and(vMask, q)[0];
++}
+ #else
+ // -- Portable -- *Not* hardened against timing attacks
+ SI float reciprocal_alpha_times_255(float a) {
+@@ -1085,6 +1108,208 @@ void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
+ rgbA_to_BGRA_portable(dst, src, count);
+ }
+
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++// -- VSX -- Native Power VSX/AltiVec ports of the SSSE3 swizzlers below.
++// Each _mm_* operation is replaced by the corresponding vec_* sequence per
++// the GCC ppc_wrappers translation pattern (vec_mergeh/l, vec_perm, and the
++// vec_vmuleuh/vmulouh + permute idiom for _mm_mulhi_epu16). The permute
++// masks for byte-shuffles use the same byte-order layout as the SSE
++// _mm_setr_epi8 forms because PPC64 LE register-to-memory byte order is the
++// same as x86 LE.
++
++// Scale: ((x*y) + 128) * 257 >> 16, per 16-bit lane (matches the SSSE3 form).
++static inline __vector unsigned short scale(__vector unsigned short x, __vector unsigned short y) {
++ const __vector unsigned short v128 = vec_splats((unsigned short)128);
++ const __vector unsigned short v257 = vec_splats((unsigned short)257);
++ __vector unsigned short summ = (__vector unsigned short)((__vector unsigned short)(x * y) + v128);
++ // _mm_mulhi_epu16 equivalent: 16x16 -> high 16 bits, via mule+mulo+permute.
++ __vector unsigned int even = vec_vmuleuh(summ, v257);
++ __vector unsigned int odd = vec_vmulouh(summ, v257);
++ const __vector unsigned char xform = (__vector unsigned char){
++ 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
++ 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
++ };
++ return (__vector unsigned short)vec_perm((__vector unsigned char)even,
++ (__vector unsigned char)odd, xform);
++}
++
++static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
++ auto premul8 = [=](__vector unsigned char* lo, __vector unsigned char* hi) {
++ const __vector unsigned char zeros = (__vector unsigned char){0};
++ const __vector unsigned char planar = kSwapRB
++ ? (__vector unsigned char){2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15}
++ : (__vector unsigned char){0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15};
++
++ // Swizzle each 16-byte chunk into 8-bit planar layout.
++ *lo = vec_perm(*lo, *lo, planar); // rrrrgggg bbbbaaaa
++ *hi = vec_perm(*hi, *hi, planar); // RRRRGGGG BBBBAAAA
++
++ // Interleave the two halves at 32-bit granularity.
++ __vector unsigned char rg = (__vector unsigned char)
++ vec_mergeh((__vector unsigned int)*lo, (__vector unsigned int)*hi); // rrrrRRRR ggggGGGG
++ __vector unsigned char ba = (__vector unsigned char)
++ vec_mergel((__vector unsigned int)*lo, (__vector unsigned int)*hi); // bbbbBBBB aaaaAAAA
++
++ // Unpack to 16-bit planar.
++ __vector unsigned short r = (__vector unsigned short)vec_mergeh(rg, zeros);
++ __vector unsigned short g = (__vector unsigned short)vec_mergel(rg, zeros);
++ __vector unsigned short b = (__vector unsigned short)vec_mergeh(ba, zeros);
++ __vector unsigned short a = (__vector unsigned short)vec_mergel(ba, zeros);
++
++ // Premultiply each colour channel by alpha.
++ r = scale(r, a);
++ g = scale(g, a);
++ b = scale(b, a);
++
++ // Repack into interlaced pixels.
++ const __vector unsigned short v8 = vec_splats((unsigned short)8);
++ __vector unsigned short rg2 = vec_or(r, vec_sl(g, v8));
++ __vector unsigned short ba2 = vec_or(b, vec_sl(a, v8));
++ *lo = (__vector unsigned char)vec_mergeh(rg2, ba2);
++ *hi = (__vector unsigned char)vec_mergel(rg2, ba2);
++ };
++
++ while (count >= 8) {
++ __vector unsigned char lo = vec_xl(0, (const unsigned char*)(src + 0));
++ __vector unsigned char hi = vec_xl(0, (const unsigned char*)(src + 4));
++ premul8(&lo, &hi);
++ vec_xst(lo, 0, (unsigned char*)(dst + 0));
++ vec_xst(hi, 0, (unsigned char*)(dst + 4));
++ src += 8; dst += 8; count -= 8;
++ }
++
++ if (count >= 4) {
++ __vector unsigned char lo = vec_xl(0, (const unsigned char*)src);
++ __vector unsigned char hi = (__vector unsigned char){0};
++ premul8(&lo, &hi);
++ vec_xst(lo, 0, (unsigned char*)dst);
++ src += 4; dst += 4; count -= 4;
++ }
++
++ auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
++ proc(dst, src, count);
++}
++
++void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
++ premul_should_swapRB(false, dst, src, count);
++}
++
++void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
++ premul_should_swapRB(true, dst, src, count);
++}
++
++void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
++ const __vector unsigned char swapRB = (__vector unsigned char){
++ 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
++ };
++ while (count >= 4) {
++ __vector unsigned char rgba = vec_xl(0, (const unsigned char*)src);
++ __vector unsigned char bgra = vec_perm(rgba, rgba, swapRB);
++ vec_xst(bgra, 0, (unsigned char*)dst);
++ src += 4; dst += 4; count -= 4;
++ }
++ RGBA_to_BGRA_portable(dst, src, count);
++}
++
++void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
++ while (count >= 8) {
++ __vector unsigned short ga = (__vector unsigned short)vec_xl(0, src);
++ __vector unsigned short gg = vec_or(
++ vec_and(ga, vec_splats((unsigned short)0x00FF)),
++ vec_sl (ga, vec_splats((unsigned short)8)));
++ __vector unsigned short ggga_lo = vec_mergeh(gg, ga);
++ __vector unsigned short ggga_hi = vec_mergel(gg, ga);
++ vec_xst((__vector unsigned char)ggga_lo, 0, (unsigned char*)(dst + 0));
++ vec_xst((__vector unsigned char)ggga_hi, 0, (unsigned char*)(dst + 4));
++ src += 8 * 2; dst += 8; count -= 8;
++ }
++ grayA_to_RGBA_portable(dst, src, count);
++}
++
++void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
++ while (count >= 8) {
++ __vector unsigned short grayA = (__vector unsigned short)vec_xl(0, src);
++ __vector unsigned short g0 = vec_and(grayA, vec_splats((unsigned short)0x00FF));
++ __vector unsigned short a0 = vec_sr (grayA, vec_splats((unsigned short)8));
++ g0 = scale(g0, a0);
++ const __vector unsigned short v8 = vec_splats((unsigned short)8);
++ __vector unsigned short gg = vec_or(g0, vec_sl(g0, v8));
++ __vector unsigned short ga = vec_or(g0, vec_sl(a0, v8));
++ __vector unsigned short ggga_lo = vec_mergeh(gg, ga);
++ __vector unsigned short ggga_hi = vec_mergel(gg, ga);
++ vec_xst((__vector unsigned char)ggga_lo, 0, (unsigned char*)(dst + 0));
++ vec_xst((__vector unsigned char)ggga_hi, 0, (unsigned char*)(dst + 4));
++ src += 8 * 2; dst += 8; count -= 8;
++ }
++ grayA_to_rgbA_portable(dst, src, count);
++}
++
++enum Format { kRGB1, kBGR1 };
++static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
++ auto convert8 = [=](__vector unsigned char* lo, __vector unsigned char* hi) {
++ const __vector unsigned char zeros = (__vector unsigned char){0};
++ const __vector unsigned char planar = (kBGR1 == format)
++ ? (__vector unsigned char){2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15}
++ : (__vector unsigned char){0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15};
++
++ *lo = vec_perm(*lo, *lo, planar); // ccccmmmm yyyykkkk
++ *hi = vec_perm(*hi, *hi, planar); // CCCCMMMM YYYYKKKK
++ __vector unsigned char cm = (__vector unsigned char)
++ vec_mergeh((__vector unsigned int)*lo, (__vector unsigned int)*hi);
++ __vector unsigned char yk = (__vector unsigned char)
++ vec_mergel((__vector unsigned int)*lo, (__vector unsigned int)*hi);
++
++ __vector unsigned short c = (__vector unsigned short)vec_mergeh(cm, zeros);
++ __vector unsigned short m = (__vector unsigned short)vec_mergel(cm, zeros);
++ __vector unsigned short y = (__vector unsigned short)vec_mergeh(yk, zeros);
++ __vector unsigned short k = (__vector unsigned short)vec_mergel(yk, zeros);
++
++ __vector unsigned short r = scale(c, k);
++ __vector unsigned short g = scale(m, k);
++ __vector unsigned short b = scale(y, k);
++
++ const __vector unsigned short v8 = vec_splats((unsigned short)8);
++ __vector unsigned short rg = vec_or(r, vec_sl(g, v8));
++ __vector unsigned short ba = vec_or(b, vec_splats((unsigned short)0xFF00));
++ *lo = (__vector unsigned char)vec_mergeh(rg, ba);
++ *hi = (__vector unsigned char)vec_mergel(rg, ba);
++ };
++
++ while (count >= 8) {
++ __vector unsigned char lo = vec_xl(0, (const unsigned char*)(src + 0));
++ __vector unsigned char hi = vec_xl(0, (const unsigned char*)(src + 4));
++ convert8(&lo, &hi);
++ vec_xst(lo, 0, (unsigned char*)(dst + 0));
++ vec_xst(hi, 0, (unsigned char*)(dst + 4));
++ src += 8; dst += 8; count -= 8;
++ }
++ if (count >= 4) {
++ __vector unsigned char lo = vec_xl(0, (const unsigned char*)src);
++ __vector unsigned char hi = (__vector unsigned char){0};
++ convert8(&lo, &hi);
++ vec_xst(lo, 0, (unsigned char*)dst);
++ src += 4; dst += 4; count -= 4;
++ }
++ auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
++ proc(dst, src, count);
++}
++
++void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
++ inverted_cmyk_to(kRGB1, dst, src, count);
++}
++
++void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
++ inverted_cmyk_to(kBGR1, dst, src, count);
++}
++
++void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
++ rgbA_to_RGBA_portable(dst, src, count);
++}
++
++void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
++ rgbA_to_BGRA_portable(dst, src, count);
++}
++
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+ // -- LASX ----------------------------------------------------------------------------------------
+
+@@ -1736,6 +1961,39 @@ static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count)
+ }
+ gray_to_RGB1_portable(dst, src, count);
+ }
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++ void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
++ const __vector unsigned char alphas = vec_splats((unsigned char)0xFF);
++ while (count >= 16) {
++ __vector unsigned char grays = vec_xl(0, src);
++
++ // Replicate gray byte: gg = unpack(gray, gray) per 8-bit lane.
++ __vector unsigned char gg_lo = vec_mergeh(grays, grays);
++ __vector unsigned char gg_hi = vec_mergel(grays, grays);
++ __vector unsigned char ga_lo = vec_mergeh(grays, alphas);
++ __vector unsigned char ga_hi = vec_mergel(grays, alphas);
++
++ // Interleave g-pairs and ga-pairs at 16-bit granularity.
++ __vector unsigned short ggga0 = vec_mergeh((__vector unsigned short)gg_lo,
++ (__vector unsigned short)ga_lo);
++ __vector unsigned short ggga1 = vec_mergel((__vector unsigned short)gg_lo,
++ (__vector unsigned short)ga_lo);
++ __vector unsigned short ggga2 = vec_mergeh((__vector unsigned short)gg_hi,
++ (__vector unsigned short)ga_hi);
++ __vector unsigned short ggga3 = vec_mergel((__vector unsigned short)gg_hi,
++ (__vector unsigned short)ga_hi);
++
++ vec_xst((__vector unsigned char)ggga0, 0, (unsigned char*)(dst + 0));
++ vec_xst((__vector unsigned char)ggga1, 0, (unsigned char*)(dst + 4));
++ vec_xst((__vector unsigned char)ggga2, 0, (unsigned char*)(dst + 8));
++ vec_xst((__vector unsigned char)ggga3, 0, (unsigned char*)(dst + 12));
++
++ src += 16;
++ dst += 16;
++ count -= 16;
++ }
++ gray_to_RGB1_portable(dst, src, count);
++ }
+ #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
+ /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
+ const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
+@@ -1920,6 +2178,37 @@ static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count)
+ proc(dst, src, count);
+ }
+
++ void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
++ insert_alpha_should_swaprb(false, dst, src, count);
++ }
++ void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
++ insert_alpha_should_swaprb(true, dst, src, count);
++ }
++#elif defined(SK_CPU_PPC) && defined(__VSX__)
++ static void insert_alpha_should_swaprb(bool kSwapRB,
++ uint32_t dst[], const uint8_t* src, int count) {
++ // alphaMask = 0xFF000000 per 32-bit lane -> bytes (in LE memory layout) are
++ // {00,00,00,FF, 00,00,00,FF, ...}.
++ const __vector unsigned char alphaMask = (__vector unsigned char){
++ 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF
++ };
++ // 'X' (= 0) is irrelevant: the alphaMask OR overwrites those lanes with FF.
++ const __vector unsigned char expand = kSwapRB
++ ? (__vector unsigned char){2,1,0,0, 5,4,3,0, 8,7,6,0, 11,10,9,0}
++ : (__vector unsigned char){0,1,2,0, 3,4,5,0, 6,7,8,0, 9,10,11,0};
++
++ while (count >= 6) {
++ __vector unsigned char rgb = vec_xl(0, src);
++ __vector unsigned char rgba = vec_or(vec_perm(rgb, rgb, expand), alphaMask);
++ vec_xst(rgba, 0, (unsigned char*)dst);
++ src += 4*3;
++ dst += 4;
++ count -= 4;
++ }
++ auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
++ proc(dst, src, count);
++ }
++
+ void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
+ insert_alpha_should_swaprb(false, dst, src, count);
+ }
+
+base-commit: a8d530ac13f0ce7e937c047f01f0d36764f5d34e
+--
+2.52.0
+
diff --git a/0002-Add-VSX-instructions-for-libwebp.patch b/0002-Add-VSX-instructions-for-libwebp.patch
new file mode 100644
index 0000000..1f857a7
--- /dev/null
+++ b/0002-Add-VSX-instructions-for-libwebp.patch
@@ -0,0 +1,2524 @@
+From b9e116898830a0f9edd1b0566651ce2d4989618d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
+Date: Fri, 12 Jun 2026 15:30:13 +1000
+Subject: [PATCH 2/3] Add VSX instructions for libwebp
+
+Assisted-by: Lance Albertson <lance@osuosl.org>
+Assisted-by: Thushan Fernando <thushan@thushanfernando.com>
+---
+ media/libwebp/src/dsp/alpha_processing.c | 6 +
+ media/libwebp/src/dsp/alpha_processing_vsx.c | 246 +++++++
+ media/libwebp/src/dsp/cpu.h | 14 +-
+ media/libwebp/src/dsp/dec.c | 6 +
+ media/libwebp/src/dsp/dec_vsx.c | 737 +++++++++++++++++++
+ media/libwebp/src/dsp/filters.c | 6 +
+ media/libwebp/src/dsp/filters_vsx.c | 162 ++++
+ media/libwebp/src/dsp/lossless.c | 6 +
+ media/libwebp/src/dsp/lossless_vsx.c | 449 +++++++++++
+ media/libwebp/src/dsp/moz.build | 14 +
+ media/libwebp/src/dsp/rescaler.c | 6 +
+ media/libwebp/src/dsp/rescaler_vsx.c | 201 +++++
+ media/libwebp/src/dsp/upsampling.c | 12 +
+ media/libwebp/src/dsp/upsampling_vsx.c | 151 ++++
+ media/libwebp/src/dsp/yuv.c | 6 +
+ media/libwebp/src/dsp/yuv.h | 21 +
+ media/libwebp/src/dsp/yuv_vsx.c | 206 ++++++
+ media/libwebp/src/moz/cpu.cpp | 4 +
+ 18 files changed, 2252 insertions(+), 1 deletion(-)
+ create mode 100644 media/libwebp/src/dsp/alpha_processing_vsx.c
+ create mode 100644 media/libwebp/src/dsp/dec_vsx.c
+ create mode 100644 media/libwebp/src/dsp/filters_vsx.c
+ create mode 100644 media/libwebp/src/dsp/lossless_vsx.c
+ create mode 100644 media/libwebp/src/dsp/rescaler_vsx.c
+ create mode 100644 media/libwebp/src/dsp/upsampling_vsx.c
+ create mode 100644 media/libwebp/src/dsp/yuv_vsx.c
+
+diff --git a/media/libwebp/src/dsp/alpha_processing.c b/media/libwebp/src/dsp/alpha_processing.c
+index 4927e73e81bf..5f9152bf701a 100644
+--- a/media/libwebp/src/dsp/alpha_processing.c
++++ b/media/libwebp/src/dsp/alpha_processing.c
+@@ -434,6 +434,7 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void);
+ extern void WebPInitAlphaProcessingSSE2(void);
+ extern void WebPInitAlphaProcessingSSE41(void);
+ extern void WebPInitAlphaProcessingNEON(void);
++extern void WebPInitAlphaProcessingVSX(void);
+
+ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
+ WebPMultARGBRow = WebPMultARGBRow_C;
+@@ -472,6 +473,11 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitAlphaProcessingMIPSdspR2();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ WebPInitAlphaProcessingVSX();
++ }
+ #endif
+ }
+
+diff --git a/media/libwebp/src/dsp/alpha_processing_vsx.c b/media/libwebp/src/dsp/alpha_processing_vsx.c
+new file mode 100644
+index 000000000000..2aad1cd8b648
+--- /dev/null
++++ b/media/libwebp/src/dsp/alpha_processing_vsx.c
+@@ -0,0 +1,246 @@
++// Copyright 2014 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of alpha processing functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++typedef __vector signed int i32x4;
++
++//------------------------------------------------------------------------------
++// Alpha dispatch / extraction.
++
++static int DispatchAlpha_VSX(const uint8_t* WEBP_RESTRICT alpha,
++ int alpha_stride, int width, int height,
++ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
++ uint32_t alpha_and = 0xff;
++ int i, j, k;
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u16x8 z16 = vec_splats((unsigned short)0);
++ const u32x4 a_mask = vec_splats((uint32_t)0xff); // selects the low byte
++ u8x16 all_and = vec_splats((unsigned char)0xff);
++ const int limit = width & ~15;
++
++ for (j = 0; j < height; ++j) {
++ uint8_t* ptr = dst;
++ for (i = 0; i < limit; i += 16) {
++ const u8x16 a0 = vec_xl(0, (unsigned char*)&alpha[i]);
++ // Spread the 16 alpha bytes to the low byte of 16 32-bit lanes.
++ const u16x8 a1_lo = (u16x8)vec_mergeh(a0, zero);
++ const u16x8 a1_hi = (u16x8)vec_mergel(a0, zero);
++ const u32x4 s0 = (u32x4)vec_mergeh(a1_lo, z16);
++ const u32x4 s1 = (u32x4)vec_mergel(a1_lo, z16);
++ const u32x4 s2 = (u32x4)vec_mergeh(a1_hi, z16);
++ const u32x4 s3 = (u32x4)vec_mergel(a1_hi, z16);
++ const u32x4* spread[4] = {&s0, &s1, &s2, &s3};
++ for (k = 0; k < 4; ++k) {
++ const u32x4 d = vec_xl(0, (uint32_t*)(ptr + 16 * k));
++ vec_xst(vec_sel(d, *spread[k], a_mask), 0, (uint32_t*)(ptr + 16 * k));
++ }
++ all_and = vec_and(all_and, a0);
++ ptr += 64;
++ }
++ for (; i < width; ++i) {
++ const uint32_t alpha_value = alpha[i];
++ dst[4 * i] = alpha_value;
++ alpha_and &= alpha_value;
++ }
++ alpha += alpha_stride;
++ dst += dst_stride;
++ }
++ {
++ unsigned char tmp[16];
++ memcpy(tmp, &all_and, 16);
++ for (k = 0; k < 16; ++k) alpha_and &= tmp[k];
++ }
++ return (alpha_and != 0xff);
++}
++
++static void DispatchAlphaToGreen_VSX(const uint8_t* WEBP_RESTRICT alpha,
++ int alpha_stride, int width, int height,
++ uint32_t* WEBP_RESTRICT dst,
++ int dst_stride) {
++ int i, j;
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u16x8 z16 = vec_splats((unsigned short)0);
++ const int limit = width & ~15;
++ for (j = 0; j < height; ++j) {
++ for (i = 0; i < limit; i += 16) {
++ const u8x16 a0 = vec_xl(0, (unsigned char*)&alpha[i]);
++ // Place each alpha byte into the green slot (<< 8) of a 32-bit lane.
++ const u16x8 a1_lo = (u16x8)vec_mergeh(zero, a0); // note the 'zero' first
++ const u16x8 a1_hi = (u16x8)vec_mergel(zero, a0);
++ const u32x4 g0 = (u32x4)vec_mergeh(a1_lo, z16);
++ const u32x4 g1 = (u32x4)vec_mergel(a1_lo, z16);
++ const u32x4 g2 = (u32x4)vec_mergeh(a1_hi, z16);
++ const u32x4 g3 = (u32x4)vec_mergel(a1_hi, z16);
++ vec_xst(g0, 0, &dst[i + 0]);
++ vec_xst(g1, 0, &dst[i + 4]);
++ vec_xst(g2, 0, &dst[i + 8]);
++ vec_xst(g3, 0, &dst[i + 12]);
++ }
++ for (; i < width; ++i) dst[i] = alpha[i] << 8;
++ alpha += alpha_stride;
++ dst += dst_stride;
++ }
++}
++
++static int ExtractAlpha_VSX(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
++ int width, int height, uint8_t* WEBP_RESTRICT alpha,
++ int alpha_stride) {
++ uint32_t alpha_and = 0xff;
++ int i, j, k;
++ const u32x4 a_mask = vec_splats((uint32_t)0xff); // keeps the low byte
++ u8x16 all_and = vec_splats((unsigned char)0xff);
++ const int limit = width & ~7;
++
++ for (j = 0; j < height; ++j) {
++ const uint32_t* src = (const uint32_t*)argb;
++ for (i = 0; i < limit; i += 8) {
++ const u32x4 a0 = vec_and(vec_xl(0, (uint32_t*)(src + 0)), a_mask);
++ const u32x4 a1 = vec_and(vec_xl(0, (uint32_t*)(src + 4)), a_mask);
++ const i16x8 c0 = vec_packs((i32x4)a0, (i32x4)a1);
++ const u8x16 d0 = vec_packsu(c0, c0); // 8 alpha bytes in the low half
++ memcpy(&alpha[i], &d0, 8);
++ all_and = vec_and(all_and, d0);
++ src += 8;
++ }
++ for (; i < width; ++i) {
++ const uint32_t alpha_value = argb[4 * i];
++ alpha[i] = alpha_value;
++ alpha_and &= alpha_value;
++ }
++ argb += argb_stride;
++ alpha += alpha_stride;
++ }
++ {
++ unsigned char tmp[16];
++ memcpy(tmp, &all_and, 16);
++ for (k = 0; k < 8; ++k) alpha_and &= tmp[k];
++ }
++ return (alpha_and == 0xff);
++}
++
++static void ExtractGreen_VSX(const uint32_t* WEBP_RESTRICT argb,
++ uint8_t* WEBP_RESTRICT alpha, int size) {
++ int i;
++ const u32x4 mask = vec_splats((uint32_t)0xff);
++ const u32x4 sh8 = vec_splats((uint32_t)8);
++ for (i = 0; i + 16 <= size; i += 16) {
++ const u32x4 a0 =
++ vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 0)), sh8), mask);
++ const u32x4 a1 =
++ vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 4)), sh8), mask);
++ const u32x4 a2 =
++ vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 8)), sh8), mask);
++ const u32x4 a3 =
++ vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 12)), sh8), mask);
++ const i16x8 d0 = vec_packs((i32x4)a0, (i32x4)a1);
++ const i16x8 d1 = vec_packs((i32x4)a2, (i32x4)a3);
++ const u8x16 e = vec_packsu(d0, d1);
++ vec_xst(e, 0, &alpha[i]);
++ }
++ for (; i < size; ++i) alpha[i] = argb[i] >> 8;
++}
++
++//------------------------------------------------------------------------------
++// Premultiply.
++
++#define MULTIPLIER(a) ((a) * 32897U)
++#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
++
++// Spreads the alpha lane across r/g/b and inserts 0xff in the alpha lane, for
++// the two pixels packed in a 16-bit-per-channel vector. Built against the
++// little-endian byte order; src is the channel vector, the second operand is
++// an all-0xff vector.
++static const u8x16 kSpreadAlphaLast = {6, 7, 6, 7, 6, 7, 16, 7,
++ 14, 15, 14, 15, 14, 15, 16, 15};
++static const u8x16 kSpreadAlphaFirst = {16, 1, 0, 1, 0, 1, 0, 1,
++ 16, 9, 8, 9, 8, 9, 8, 9};
++
++static WEBP_INLINE u16x8 MulHi16(u16x8 a, u16x8 b) {
++ const u32x4 sh = vec_splats((unsigned int)16);
++ const u32x4 e = vec_sr(vec_mule(a, b), sh);
++ const u32x4 o = vec_sr(vec_mulo(a, b), sh);
++ return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++static void ApplyAlphaMultiply_VSX(uint8_t* rgba, int alpha_first, int w, int h,
++ int stride) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u8x16 allff = vec_splats((unsigned char)0xff);
++ const u16x8 z16 = vec_splats((unsigned short)0);
++ const u16x8 kMult = vec_splats((unsigned short)0x8081);
++ const u16x8 sh7 = vec_splats((unsigned short)7);
++ const u8x16 ctrl = alpha_first ? kSpreadAlphaFirst : kSpreadAlphaLast;
++ const int kSpan = 4;
++ while (h-- > 0) {
++ uint8_t* const rgbx = rgba;
++ int i;
++ for (i = 0; i + kSpan <= w; i += kSpan) {
++ const u8x16 argb0 = vec_xl(0, (unsigned char*)(rgbx + 4 * i));
++ const u16x8 lo = (u16x8)vec_mergeh(argb0, zero);
++ const u16x8 hi = (u16x8)vec_mergel(argb0, zero);
++ const u16x8 a_lo = (u16x8)vec_perm((u8x16)lo, allff, ctrl);
++ const u16x8 a_hi = (u16x8)vec_perm((u8x16)hi, allff, ctrl);
++ const u16x8 A0lo = vec_mladd(a_lo, lo, z16);
++ const u16x8 A0hi = vec_mladd(a_hi, hi, z16);
++ const u16x8 A2lo = vec_sr(MulHi16(A0lo, kMult), sh7);
++ const u16x8 A2hi = vec_sr(MulHi16(A0hi, kMult), sh7);
++ const u8x16 out = vec_packsu((i16x8)A2lo, (i16x8)A2hi);
++ vec_xst(out, 0, (unsigned char*)(rgbx + 4 * i));
++ }
++ // Finish with left-overs.
++ for (; i < w; ++i) {
++ uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
++ const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
++ const uint32_t a = alpha[4 * i];
++ if (a != 0xff) {
++ const uint32_t mult = MULTIPLIER(a);
++ rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
++ rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
++ rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
++ }
++ }
++ rgba += stride;
++ }
++}
++
++#undef MULTIPLIER
++#undef PREMULTIPLY
++
++//------------------------------------------------------------------------------
++
++extern void WebPInitAlphaProcessingVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingVSX(void) {
++ WebPApplyAlphaMultiply = ApplyAlphaMultiply_VSX;
++ WebPDispatchAlpha = DispatchAlpha_VSX;
++ WebPDispatchAlphaToGreen = DispatchAlphaToGreen_VSX;
++ WebPExtractAlpha = ExtractAlpha_VSX;
++ WebPExtractGreen = ExtractGreen_VSX;
++}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/cpu.h b/media/libwebp/src/dsp/cpu.h
+index 17c4db971c7f..d1d4b3127c84 100644
+--- a/media/libwebp/src/dsp/cpu.h
++++ b/media/libwebp/src/dsp/cpu.h
+@@ -154,6 +154,17 @@
+ #define WEBP_USE_MSA
+ #endif
+
++//------------------------------------------------------------------------------
++// PPC64 / Power VSX (ISA 2.07 / POWER8 baseline).
++
++#if defined(__powerpc64__) && defined(__VSX__)
++#define WEBP_USE_VSX
++#endif
++
++#if defined(WEBP_USE_VSX) && !defined(WEBP_HAVE_VSX)
++#define WEBP_HAVE_VSX
++#endif
++
+ //------------------------------------------------------------------------------
+
+ #ifndef WEBP_DSP_OMIT_C_CODE
+@@ -308,7 +319,8 @@ typedef enum {
+ kNEON,
+ kMIPS32,
+ kMIPSdspR2,
+- kMSA
++ kMSA,
++ kVSX
+ } CPUFeature;
+
+ // returns true if the CPU supports the feature.
+diff --git a/media/libwebp/src/dsp/dec.c b/media/libwebp/src/dsp/dec.c
+index 4f38309980ce..f34276ba7316 100644
+--- a/media/libwebp/src/dsp/dec.c
++++ b/media/libwebp/src/dsp/dec.c
+@@ -752,6 +752,7 @@ extern void VP8DspInitNEON(void);
+ extern void VP8DspInitMIPS32(void);
+ extern void VP8DspInitMIPSdspR2(void);
+ extern void VP8DspInitMSA(void);
++extern void VP8DspInitVSX(void);
+
+ WEBP_DSP_INIT_FUNC(VP8DspInit) {
+ VP8InitClipTables();
+@@ -843,6 +844,11 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8DspInitMSA();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ VP8DspInitVSX();
++ }
+ #endif
+ }
+
+diff --git a/media/libwebp/src/dsp/dec_vsx.c b/media/libwebp/src/dsp/dec_vsx.c
+new file mode 100644
+index 000000000000..e0c1cbc3b71b
+--- /dev/null
++++ b/media/libwebp/src/dsp/dec_vsx.c
+@@ -0,0 +1,737 @@
++// Copyright 2011 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of decoding functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++typedef __vector signed short i16x8;
++typedef __vector unsigned short u16x8;
++typedef __vector signed int i32x4;
++typedef __vector unsigned int u32x4;
++typedef __vector unsigned char u8x16;
++typedef __vector signed char i8x16;
++typedef __vector signed long long i64x2;
++
++// Signed multiply-high of packed 16-bit lanes (POWER8 has no vmulhsh).
++static WEBP_INLINE i16x8 MulHi16_S(i16x8 a, i16x8 b) {
++ const u32x4 sh = vec_splats((unsigned int)16);
++ const i32x4 e = vec_sra(vec_mule(a, b), sh);
++ const i32x4 o = vec_sra(vec_mulo(a, b), sh);
++ return (i16x8)vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++// Transpose two interleaved 4x4 blocks of 16-bit values.
++static WEBP_INLINE void Transpose2_4x4(i16x8 in0, i16x8 in1, i16x8 in2,
++ i16x8 in3, i16x8* out0, i16x8* out1,
++ i16x8* out2, i16x8* out3) {
++ const i16x8 t0 = (i16x8)vec_mergeh(in0, in1);
++ const i16x8 t1 = (i16x8)vec_mergeh(in2, in3);
++ const i16x8 t2 = (i16x8)vec_mergel(in0, in1);
++ const i16x8 t3 = (i16x8)vec_mergel(in2, in3);
++ const i32x4 u0 = vec_mergeh((i32x4)t0, (i32x4)t1);
++ const i32x4 u1 = vec_mergeh((i32x4)t2, (i32x4)t3);
++ const i32x4 u2 = vec_mergel((i32x4)t0, (i32x4)t1);
++ const i32x4 u3 = vec_mergel((i32x4)t2, (i32x4)t3);
++ *out0 = (i16x8)vec_mergeh((i64x2)u0, (i64x2)u1);
++ *out1 = (i16x8)vec_mergel((i64x2)u0, (i64x2)u1);
++ *out2 = (i16x8)vec_mergeh((i64x2)u2, (i64x2)u3);
++ *out3 = (i16x8)vec_mergel((i64x2)u2, (i64x2)u3);
++}
++
++// Bounded 4-coefficient load into the low half of a 16-bit vector.
++static WEBP_INLINE i16x8 Load4Coeffs(const int16_t* WEBP_RESTRICT p) {
++ int16_t tmp[8] = {0};
++ memcpy(tmp, p, 4 * sizeof(int16_t));
++ return *(const i16x8*)tmp;
++}
++
++// Bounded load of n pixels, zero-extended to 16-bit lanes.
++static WEBP_INLINE i16x8 LoadDst(const uint8_t* WEBP_RESTRICT p, int n) {
++ unsigned char tmp[16] = {0};
++ memcpy(tmp, p, n);
++ return (i16x8)vec_mergeh(vec_xl(0, tmp), vec_splats((unsigned char)0));
++}
++
++static void Transform_VSX(const int16_t* WEBP_RESTRICT in,
++ uint8_t* WEBP_RESTRICT dst, int do_two) {
++ const i16x8 k1 = vec_splats((short)20091);
++ const i16x8 k2 = vec_splats((short)-30068);
++ const u16x8 three = vec_splats((unsigned short)3);
++ i16x8 in0 = Load4Coeffs(in + 0), in1 = Load4Coeffs(in + 4);
++ i16x8 in2 = Load4Coeffs(in + 8), in3 = Load4Coeffs(in + 12);
++ i16x8 T0, T1, T2, T3;
++
++ if (do_two) {
++ in0 = (i16x8)vec_mergeh((i64x2)in0, (i64x2)Load4Coeffs(in + 16));
++ in1 = (i16x8)vec_mergeh((i64x2)in1, (i64x2)Load4Coeffs(in + 20));
++ in2 = (i16x8)vec_mergeh((i64x2)in2, (i64x2)Load4Coeffs(in + 24));
++ in3 = (i16x8)vec_mergeh((i64x2)in3, (i64x2)Load4Coeffs(in + 28));
++ }
++
++ { // Vertical pass + transpose.
++ const i16x8 a = vec_add(in0, in2);
++ const i16x8 b = vec_sub(in0, in2);
++ const i16x8 c = vec_add(vec_sub(in1, in3),
++ vec_sub(MulHi16_S(in1, k2), MulHi16_S(in3, k1)));
++ const i16x8 d = vec_add(vec_add(in1, in3),
++ vec_add(MulHi16_S(in1, k1), MulHi16_S(in3, k2)));
++ Transpose2_4x4(vec_add(a, d), vec_add(b, c), vec_sub(b, c), vec_sub(a, d),
++ &T0, &T1, &T2, &T3);
++ }
++ { // Horizontal pass + transpose.
++ const i16x8 dc = vec_add(T0, vec_splats((short)4));
++ const i16x8 a = vec_add(dc, T2);
++ const i16x8 b = vec_sub(dc, T2);
++ const i16x8 c = vec_add(vec_sub(T1, T3),
++ vec_sub(MulHi16_S(T1, k2), MulHi16_S(T3, k1)));
++ const i16x8 d = vec_add(vec_add(T1, T3),
++ vec_add(MulHi16_S(T1, k1), MulHi16_S(T3, k2)));
++ const i16x8 s0 = vec_sra(vec_add(a, d), three);
++ const i16x8 s1 = vec_sra(vec_add(b, c), three);
++ const i16x8 s2 = vec_sra(vec_sub(b, c), three);
++ const i16x8 s3 = vec_sra(vec_sub(a, d), three);
++ Transpose2_4x4(s0, s1, s2, s3, &T0, &T1, &T2, &T3);
++ }
++ { // Add to the reference pixels and store with saturation.
++ const int n = do_two ? 8 : 4;
++ const i16x8 d0 = LoadDst(dst + 0 * BPS, n);
++ const i16x8 d1 = LoadDst(dst + 1 * BPS, n);
++ const i16x8 d2 = LoadDst(dst + 2 * BPS, n);
++ const i16x8 d3 = LoadDst(dst + 3 * BPS, n);
++ const u8x16 r0 = vec_packsu(vec_add(d0, T0), vec_add(d0, T0));
++ const u8x16 r1 = vec_packsu(vec_add(d1, T1), vec_add(d1, T1));
++ const u8x16 r2 = vec_packsu(vec_add(d2, T2), vec_add(d2, T2));
++ const u8x16 r3 = vec_packsu(vec_add(d3, T3), vec_add(d3, T3));
++ unsigned char b0[16], b1[16], b2[16], b3[16];
++ memcpy(b0, &r0, 16); memcpy(b1, &r1, 16);
++ memcpy(b2, &r2, 16); memcpy(b3, &r3, 16);
++ memcpy(dst + 0 * BPS, b0, n); memcpy(dst + 1 * BPS, b1, n);
++ memcpy(dst + 2 * BPS, b2, n); memcpy(dst + 3 * BPS, b3, n);
++ }
++}
++
++//------------------------------------------------------------------------------
++// Simple in-loop edge filtering.
++
++#define ABSU(p, q) vec_or(vec_subs((u8x16)(q), (u8x16)(p)), \
++ vec_subs((u8x16)(p), (u8x16)(q)))
++
++// Per-byte signed arithmetic >>3, packed with saturation.
++static WEBP_INLINE i8x16 SignedShift3(i8x16 x) {
++ const u8x16 z = vec_splats((unsigned char)0);
++ const u16x8 sh = vec_splats((unsigned short)(3 + 8));
++ const i16x8 lo = vec_sra((i16x8)vec_mergeh(z, (u8x16)x), sh);
++ const i16x8 hi = vec_sra((i16x8)vec_mergel(z, (u8x16)x), sh);
++ return (i8x16)vec_packs(lo, hi);
++}
++
++static WEBP_INLINE void DoFilter2_VSX(u8x16* WEBP_RESTRICT p1,
++ u8x16* WEBP_RESTRICT p0,
++ u8x16* WEBP_RESTRICT q0,
++ u8x16* WEBP_RESTRICT q1, int thresh) {
++ const u8x16 sign = vec_splats((unsigned char)0x80);
++ const u8x16 t1 = ABSU(*p1, *q1);
++ const u8x16 t2 = vec_and(t1, vec_splats((unsigned char)0xFE));
++ const u8x16 t3 = (u8x16)vec_sr((u16x8)t2, vec_splats((unsigned short)1));
++ const u8x16 t4 = ABSU(*p0, *q0);
++ const u8x16 t6 = vec_adds(vec_adds(t4, t4), t3);
++ const u8x16 t7 = vec_subs(t6, vec_splats((unsigned char)thresh));
++ const u8x16 mask = (u8x16)vec_cmpeq(t7, vec_splats((unsigned char)0));
++
++ const i8x16 p1s = (i8x16)vec_xor(*p1, sign);
++ const i8x16 q1s = (i8x16)vec_xor(*q1, sign);
++ i8x16 P0 = (i8x16)vec_xor(*p0, sign);
++ i8x16 Q0 = (i8x16)vec_xor(*q0, sign);
++
++ const i8x16 d0 = vec_subs(Q0, P0);
++ const i8x16 s1 = vec_adds(vec_subs(p1s, q1s), d0);
++ i8x16 a = vec_adds(d0, vec_adds(d0, s1));
++ a = vec_and(a, (i8x16)mask);
++ const i8x16 v3 = SignedShift3(vec_adds(a, vec_splats((signed char)3)));
++ const i8x16 v4 = SignedShift3(vec_adds(a, vec_splats((signed char)4)));
++ Q0 = vec_subs(Q0, v4);
++ P0 = vec_adds(P0, v3);
++ *p0 = vec_xor((u8x16)P0, sign);
++ *q0 = vec_xor((u8x16)Q0, sign);
++}
++
++static void SimpleVFilter16_VSX(uint8_t* p, int stride, int thresh) {
++ u8x16 p1 = vec_xl(0, p - 2 * stride);
++ u8x16 p0 = vec_xl(0, p - stride);
++ u8x16 q0 = vec_xl(0, p);
++ u8x16 q1 = vec_xl(0, p + stride);
++ DoFilter2_VSX(&p1, &p0, &q0, &q1, thresh);
++ vec_xst(p0, 0, p - stride);
++ vec_xst(q0, 0, p);
++}
++
++static void SimpleVFilter16i_VSX(uint8_t* p, int stride, int thresh) {
++ int k;
++ for (k = 3; k > 0; --k) {
++ p += 4 * stride;
++ SimpleVFilter16_VSX(p, stride, thresh);
++ }
++}
++
++// Transpose four columns out of / into 16 rows for horizontal-edge filtering.
++static WEBP_INLINE void Load8x4(const uint8_t* WEBP_RESTRICT b, int s,
++ u8x16* WEBP_RESTRICT p, u8x16* WEBP_RESTRICT q) {
++ uint32_t a0[4], a1[4];
++ memcpy(&a0[0], b + 0 * s, 4); memcpy(&a0[1], b + 4 * s, 4);
++ memcpy(&a0[2], b + 2 * s, 4); memcpy(&a0[3], b + 6 * s, 4);
++ memcpy(&a1[0], b + 1 * s, 4); memcpy(&a1[1], b + 5 * s, 4);
++ memcpy(&a1[2], b + 3 * s, 4); memcpy(&a1[3], b + 7 * s, 4);
++ const u8x16 A0 = vec_xl(0, (unsigned char*)a0);
++ const u8x16 A1 = vec_xl(0, (unsigned char*)a1);
++ const u8x16 B0 = vec_mergeh(A0, A1), B1 = vec_mergel(A0, A1);
++ const u16x8 C0 = vec_mergeh((u16x8)B0, (u16x8)B1);
++ const u16x8 C1 = vec_mergel((u16x8)B0, (u16x8)B1);
++ *p = (u8x16)vec_mergeh((u32x4)C0, (u32x4)C1);
++ *q = (u8x16)vec_mergel((u32x4)C0, (u32x4)C1);
++}
++
++static WEBP_INLINE void Load16x4(const uint8_t* WEBP_RESTRICT r0,
++ const uint8_t* WEBP_RESTRICT r8, int s,
++ u8x16* p1, u8x16* p0, u8x16* q0, u8x16* q1) {
++ Load8x4(r0, s, p1, q0);
++ Load8x4(r8, s, p0, q1);
++ const u8x16 t1 = *p1, t2 = *q0;
++ *p1 = (u8x16)vec_mergeh((i64x2)t1, (i64x2)*p0);
++ *p0 = (u8x16)vec_mergel((i64x2)t1, (i64x2)*p0);
++ *q0 = (u8x16)vec_mergeh((i64x2)t2, (i64x2)*q1);
++ *q1 = (u8x16)vec_mergel((i64x2)t2, (i64x2)*q1);
++}
++
++static WEBP_INLINE void Store4x4(u8x16 x, uint8_t* WEBP_RESTRICT dst, int s) {
++ unsigned char b[16];
++ int i;
++ memcpy(b, &x, 16);
++ for (i = 0; i < 4; ++i) memcpy(dst + i * s, b + 4 * i, 4);
++}
++
++static WEBP_INLINE void Store16x4(u8x16 p1, u8x16 p0, u8x16 q0, u8x16 q1,
++ uint8_t* WEBP_RESTRICT r0,
++ uint8_t* WEBP_RESTRICT r8, int s) {
++ u8x16 t = p0;
++ u8x16 p0s = vec_mergeh(p1, t), p1s = vec_mergel(p1, t);
++ t = q0;
++ u8x16 q0s = vec_mergeh(t, q1), q1s = vec_mergel(t, q1);
++ t = p0s;
++ p0s = (u8x16)vec_mergeh((u16x8)t, (u16x8)q0s);
++ q0s = (u8x16)vec_mergel((u16x8)t, (u16x8)q0s);
++ t = p1s;
++ p1s = (u8x16)vec_mergeh((u16x8)t, (u16x8)q1s);
++ q1s = (u8x16)vec_mergel((u16x8)t, (u16x8)q1s);
++ Store4x4(p0s, r0, s); Store4x4(q0s, r0 + 4 * s, s);
++ Store4x4(p1s, r8, s); Store4x4(q1s, r8 + 4 * s, s);
++}
++
++static void SimpleHFilter16_VSX(uint8_t* p, int stride, int thresh) {
++ u8x16 p1, p0, q0, q1;
++ p -= 2; // beginning of p1
++ Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
++ DoFilter2_VSX(&p1, &p0, &q0, &q1, thresh);
++ Store16x4(p1, p0, q0, q1, p, p + 8 * stride, stride);
++}
++
++static void SimpleHFilter16i_VSX(uint8_t* p, int stride, int thresh) {
++ int k;
++ for (k = 3; k > 0; --k) {
++ p += 4;
++ SimpleHFilter16_VSX(p, stride, thresh);
++ }
++}
++
++//------------------------------------------------------------------------------
++// Complex in-loop edge filtering (vertical/luma).
++
++static const u8x16 kSignBit = {
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
++#define FLIPB(x) ((x) = (i8x16)vec_xor((u8x16)(x), kSignBit))
++
++static WEBP_INLINE u8x16 GetNotHEV(u8x16 p1, u8x16 p0, u8x16 q0, u8x16 q1,
++ int hev_thresh) {
++ const u8x16 d = vec_subs(vec_max(ABSU(p1, p0), ABSU(q1, q0)),
++ vec_splats((unsigned char)hev_thresh));
++ return (u8x16)vec_cmpeq(d, vec_splats((unsigned char)0));
++}
++
++static WEBP_INLINE i8x16 GetBaseDelta(i8x16 p1, i8x16 p0, i8x16 q0, i8x16 q1) {
++ const i8x16 d = vec_subs(q0, p0);
++ const i8x16 s1 = vec_adds(vec_subs(p1, q1), d);
++ return vec_adds(d, vec_adds(d, s1));
++}
++
++static WEBP_INLINE void DoSimpleFilterS(i8x16* p0, i8x16* q0, i8x16 f) {
++ *q0 = vec_subs(*q0, SignedShift3(vec_adds(f, vec_splats((signed char)4))));
++ *p0 = vec_adds(*p0, SignedShift3(vec_adds(f, vec_splats((signed char)3))));
++}
++
++static WEBP_INLINE void Update2Pixels(i8x16* pi, i8x16* qi, i16x8 lo, i16x8 hi) {
++ const u16x8 s7 = vec_splats((unsigned short)7);
++ const i8x16 d = (i8x16)vec_packs(vec_sra(lo, s7), vec_sra(hi, s7));
++ *pi = vec_adds(*pi, d);
++ *qi = vec_subs(*qi, d);
++ FLIPB(*pi);
++ FLIPB(*qi);
++}
++
++// mask = (max inner abs-diff <= ithresh) && NeedsFilter(thresh).
++static WEBP_INLINE u8x16 ComplexMask(u8x16 p3, u8x16 p2, u8x16 p1, u8x16 p0,
++ u8x16 q0, u8x16 q1, u8x16 q2, u8x16 q3,
++ int thresh, int ithresh) {
++ u8x16 m = ABSU(p1, p0);
++ m = vec_max(m, ABSU(p3, p2));
++ m = vec_max(m, ABSU(p2, p1));
++ m = vec_max(m, ABSU(q1, q0));
++ m = vec_max(m, ABSU(q3, q2));
++ m = vec_max(m, ABSU(q2, q1));
++ const u8x16 tm = (u8x16)vec_cmpeq(
++ vec_subs(m, vec_splats((unsigned char)ithresh)),
++ vec_splats((unsigned char)0));
++ const u8x16 t2 = vec_and(ABSU(p1, q1), vec_splats((unsigned char)0xFE));
++ const u8x16 t3 = (u8x16)vec_sr((u16x8)t2, vec_splats((unsigned short)1));
++ const u8x16 t6 = vec_adds(vec_adds(ABSU(p0, q0), ABSU(p0, q0)), t3);
++ const u8x16 fm = (u8x16)vec_cmpeq(
++ vec_subs(t6, vec_splats((unsigned char)thresh)),
++ vec_splats((unsigned char)0));
++ return vec_and(tm, fm);
++}
++
++static WEBP_INLINE void DoFilter4(u8x16* p1u, u8x16* p0u, u8x16* q0u,
++ u8x16* q1u, u8x16 mask, int hev_thresh) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u8x16 not_hev = GetNotHEV(*p1u, *p0u, *q0u, *q1u, hev_thresh);
++ i8x16 p1 = (i8x16)vec_xor(*p1u, kSignBit), p0 = (i8x16)vec_xor(*p0u, kSignBit);
++ i8x16 q0 = (i8x16)vec_xor(*q0u, kSignBit), q1 = (i8x16)vec_xor(*q1u, kSignBit);
++ i8x16 t1 = vec_andc(vec_subs(p1, q1), (i8x16)not_hev);
++ const i8x16 t2 = vec_subs(q0, p0);
++ t1 = vec_adds(t1, t2); t1 = vec_adds(t1, t2); t1 = vec_adds(t1, t2);
++ t1 = vec_and(t1, (i8x16)mask);
++ const i8x16 a3 = SignedShift3(vec_adds(t1, vec_splats((signed char)4)));
++ p0 = vec_adds(p0, SignedShift3(vec_adds(t1, vec_splats((signed char)3))));
++ q0 = vec_subs(q0, a3);
++ FLIPB(p0); FLIPB(q0);
++ const i8x16 t = vec_add(a3, (i8x16)kSignBit);
++ i8x16 t3 = vec_sub((i8x16)vec_avg((u8x16)t, zero), vec_splats((signed char)64));
++ t3 = vec_and((i8x16)not_hev, t3);
++ q1 = vec_subs(q1, t3); p1 = vec_adds(p1, t3);
++ FLIPB(p1); FLIPB(q1);
++ *p1u = (u8x16)p1; *p0u = (u8x16)p0; *q0u = (u8x16)q0; *q1u = (u8x16)q1;
++}
++
++static WEBP_INLINE void DoFilter6(u8x16* p2u, u8x16* p1u, u8x16* p0u,
++ u8x16* q0u, u8x16* q1u, u8x16* q2u,
++ u8x16 mask, int hev_thresh) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u8x16 not_hev = GetNotHEV(*p1u, *p0u, *q0u, *q1u, hev_thresh);
++ i8x16 p2 = (i8x16)vec_xor(*p2u, kSignBit), p1 = (i8x16)vec_xor(*p1u, kSignBit);
++ i8x16 p0 = (i8x16)vec_xor(*p0u, kSignBit), q0 = (i8x16)vec_xor(*q0u, kSignBit);
++ i8x16 q1 = (i8x16)vec_xor(*q1u, kSignBit), q2 = (i8x16)vec_xor(*q2u, kSignBit);
++ const i8x16 a = GetBaseDelta(p1, p0, q0, q1);
++ { // hev pixels: simple filter
++ const i8x16 f = vec_and(a, (i8x16)vec_andc(mask, not_hev));
++ DoSimpleFilterS(&p0, &q0, f);
++ }
++ { // non-hev pixels: strong filter
++ const i8x16 f = vec_and(a, vec_and((i8x16)not_hev, (i8x16)mask));
++ const i16x8 k9 = vec_splats((short)0x0900), k63 = vec_splats((short)63);
++ const i16x8 f9lo = MulHi16_S((i16x8)vec_mergeh(zero, (u8x16)f), k9);
++ const i16x8 f9hi = MulHi16_S((i16x8)vec_mergel(zero, (u8x16)f), k9);
++ const i16x8 a2lo = vec_add(f9lo, k63), a2hi = vec_add(f9hi, k63);
++ const i16x8 a1lo = vec_add(a2lo, f9lo), a1hi = vec_add(a2hi, f9hi);
++ const i16x8 a0lo = vec_add(a1lo, f9lo), a0hi = vec_add(a1hi, f9hi);
++ Update2Pixels(&p2, &q2, a2lo, a2hi);
++ Update2Pixels(&p1, &q1, a1lo, a1hi);
++ Update2Pixels(&p0, &q0, a0lo, a0hi);
++ }
++ *p2u = (u8x16)p2; *p1u = (u8x16)p1; *p0u = (u8x16)p0;
++ *q0u = (u8x16)q0; *q1u = (u8x16)q1; *q2u = (u8x16)q2;
++}
++
++static void VFilter16_VSX(uint8_t* p, int s, int thresh, int ithresh,
++ int hev_thresh) {
++ u8x16 p3 = vec_xl(0, p - 4 * s), p2 = vec_xl(0, p - 3 * s);
++ u8x16 p1 = vec_xl(0, p - 2 * s), p0 = vec_xl(0, p - s);
++ u8x16 q0 = vec_xl(0, p), q1 = vec_xl(0, p + s);
++ u8x16 q2 = vec_xl(0, p + 2 * s), q3 = vec_xl(0, p + 3 * s);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++ vec_xst(p2, 0, p - 3 * s); vec_xst(p1, 0, p - 2 * s); vec_xst(p0, 0, p - s);
++ vec_xst(q0, 0, p); vec_xst(q1, 0, p + s); vec_xst(q2, 0, p + 2 * s);
++}
++
++static void VFilter16i_VSX(uint8_t* p, int s, int thresh, int ithresh,
++ int hev_thresh) {
++ int k;
++ for (k = 3; k > 0; --k) {
++ p += 4 * s;
++ u8x16 p3 = vec_xl(0, p - 4 * s), p2 = vec_xl(0, p - 3 * s);
++ u8x16 p1 = vec_xl(0, p - 2 * s), p0 = vec_xl(0, p - s);
++ u8x16 q0 = vec_xl(0, p), q1 = vec_xl(0, p + s);
++ u8x16 q2 = vec_xl(0, p + 2 * s), q3 = vec_xl(0, p + 3 * s);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++ vec_xst(p1, 0, p - 2 * s); vec_xst(p0, 0, p - s);
++ vec_xst(q0, 0, p); vec_xst(q1, 0, p + s);
++ }
++}
++
++// Complex horizontal luma: two 16x4 transposes around the vertical edge feed
++// the same DoFilter4/DoFilter6 used by the vertical variants.
++static void HFilter16_VSX(uint8_t* p, int s, int thresh, int ithresh,
++ int hev_thresh) {
++ uint8_t* const b = p - 4;
++ u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++ Load16x4(b, b + 8 * s, s, &p3, &p2, &p1, &p0);
++ Load16x4(p, p + 8 * s, s, &q0, &q1, &q2, &q3);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++ Store16x4(p3, p2, p1, p0, b, b + 8 * s, s);
++ Store16x4(q0, q1, q2, q3, p, p + 8 * s, s);
++}
++
++static void HFilter16i_VSX(uint8_t* p, int s, int thresh, int ithresh,
++ int hev_thresh) {
++ int k;
++ for (k = 3; k > 0; --k) {
++ p += 4;
++ u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++ Load16x4(p - 4, p - 4 + 8 * s, s, &p3, &p2, &p1, &p0);
++ Load16x4(p, p + 8 * s, s, &q0, &q1, &q2, &q3);
++ const u8x16 m =
++ ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++ Store16x4(p1, p0, q0, q1, p - 2, p - 2 + 8 * s, s);
++ }
++}
++
++//------------------------------------------------------------------------------
++// Complex chroma filtering: operate on the u and v planes (8 wide) together.
++
++// Pack 8 u-bytes into the low half and 8 v-bytes into the high half.
++static WEBP_INLINE u8x16 LoadUV(const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v) {
++ unsigned char b[16];
++ memcpy(b, u, 8);
++ memcpy(b + 8, v, 8);
++ return vec_xl(0, b);
++}
++
++static WEBP_INLINE void StoreUV(u8x16 x, uint8_t* WEBP_RESTRICT u,
++ uint8_t* WEBP_RESTRICT v) {
++ unsigned char b[16];
++ memcpy(b, &x, 16);
++ memcpy(u, b, 8);
++ memcpy(v, b + 8, 8);
++}
++
++static void VFilter8_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++ int s, int thresh, int ithresh, int hev_thresh) {
++ u8x16 p3 = LoadUV(u - 4 * s, v - 4 * s), p2 = LoadUV(u - 3 * s, v - 3 * s);
++ u8x16 p1 = LoadUV(u - 2 * s, v - 2 * s), p0 = LoadUV(u - s, v - s);
++ u8x16 q0 = LoadUV(u, v), q1 = LoadUV(u + s, v + s);
++ u8x16 q2 = LoadUV(u + 2 * s, v + 2 * s), q3 = LoadUV(u + 3 * s, v + 3 * s);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++ StoreUV(p2, u - 3 * s, v - 3 * s); StoreUV(p1, u - 2 * s, v - 2 * s);
++ StoreUV(p0, u - s, v - s); StoreUV(q0, u, v);
++ StoreUV(q1, u + s, v + s); StoreUV(q2, u + 2 * s, v + 2 * s);
++}
++
++static void VFilter8i_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++ int s, int thresh, int ithresh, int hev_thresh) {
++ u += 4 * s; v += 4 * s;
++ u8x16 p3 = LoadUV(u - 4 * s, v - 4 * s), p2 = LoadUV(u - 3 * s, v - 3 * s);
++ u8x16 p1 = LoadUV(u - 2 * s, v - 2 * s), p0 = LoadUV(u - s, v - s);
++ u8x16 q0 = LoadUV(u, v), q1 = LoadUV(u + s, v + s);
++ u8x16 q2 = LoadUV(u + 2 * s, v + 2 * s), q3 = LoadUV(u + 3 * s, v + 3 * s);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++ StoreUV(p1, u - 2 * s, v - 2 * s); StoreUV(p0, u - s, v - s);
++ StoreUV(q0, u, v); StoreUV(q1, u + s, v + s);
++}
++
++static void HFilter8_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++ int s, int thresh, int ithresh, int hev_thresh) {
++ u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++ Load16x4(u - 4, v - 4, s, &p3, &p2, &p1, &p0);
++ Load16x4(u, v, s, &q0, &q1, &q2, &q3);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
++ Store16x4(p3, p2, p1, p0, u - 4, v - 4, s);
++ Store16x4(q0, q1, q2, q3, u, v, s);
++}
++
++static void HFilter8i_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
++ int s, int thresh, int ithresh, int hev_thresh) {
++ u += 4; v += 4;
++ u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
++ Load16x4(u - 4, v - 4, s, &p3, &p2, &p1, &p0);
++ Load16x4(u, v, s, &q0, &q1, &q2, &q3);
++ const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
++ DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
++ Store16x4(p1, p0, q0, q1, u - 2, v - 2, s);
++}
++
++//------------------------------------------------------------------------------
++// Intra prediction (16x16 luma, 8x8 chroma). DC top-sums are scalar (the SIMD
++// win is the block fill); TrueMotion/VE/HE are vectorized.
++
++static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
++ const u8x16 x = vec_splats(v);
++ int j;
++ for (j = 0; j < 16; ++j) vec_xst(x, 0, dst + j * BPS);
++}
++static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
++ const u8x16 x = vec_splats(v);
++ unsigned char b[16];
++ int j;
++ memcpy(b, &x, 16);
++ for (j = 0; j < 8; ++j) memcpy(dst + j * BPS, b, 8);
++}
++
++static void VE16_VSX(uint8_t* dst) {
++ const u8x16 top = vec_xl(0, dst - BPS);
++ int j;
++ for (j = 0; j < 16; ++j) vec_xst(top, 0, dst + j * BPS);
++}
++static void HE16_VSX(uint8_t* dst) {
++ int j;
++ for (j = 0; j < 16; ++j) vec_xst(vec_splats(dst[-1 + j * BPS]), 0, dst + j * BPS);
++}
++static void DC16_VSX(uint8_t* dst) {
++ int s = 16, j;
++ for (j = 0; j < 16; ++j) s += dst[-BPS + j] + dst[-1 + j * BPS];
++ Put16(s >> 5, dst);
++}
++static void DC16NoTop_VSX(uint8_t* dst) {
++ int s = 8, j;
++ for (j = 0; j < 16; ++j) s += dst[-1 + j * BPS];
++ Put16(s >> 4, dst);
++}
++static void DC16NoLeft_VSX(uint8_t* dst) {
++ int s = 8, j;
++ for (j = 0; j < 16; ++j) s += dst[-BPS + j];
++ Put16(s >> 4, dst);
++}
++static void DC16NoTopLeft_VSX(uint8_t* dst) { Put16(0x80, dst); }
++static void TM16_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u8x16 t = vec_xl(0, dst - BPS);
++ const i16x8 tl = (i16x8)vec_mergeh(t, zero), th = (i16x8)vec_mergel(t, zero);
++ const int c = dst[-BPS - 1];
++ int y;
++ for (y = 0; y < 16; ++y) {
++ const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
++ vec_xst((u8x16)vec_packsu(vec_add(b, tl), vec_add(b, th)), 0, dst + y * BPS);
++ }
++}
++
++static void VE8uv_VSX(uint8_t* dst) {
++ unsigned char t[8];
++ int j;
++ memcpy(t, dst - BPS, 8);
++ for (j = 0; j < 8; ++j) memcpy(dst + j * BPS, t, 8);
++}
++static void DC8uv_VSX(uint8_t* dst) {
++ int s = 8, j;
++ for (j = 0; j < 8; ++j) s += dst[-BPS + j] + dst[-1 + j * BPS];
++ Put8x8uv(s >> 4, dst);
++}
++static void DC8uvNoTop_VSX(uint8_t* dst) {
++ int s = 4, j;
++ for (j = 0; j < 8; ++j) s += dst[-1 + j * BPS];
++ Put8x8uv(s >> 3, dst);
++}
++static void DC8uvNoLeft_VSX(uint8_t* dst) {
++ int s = 4, j;
++ for (j = 0; j < 8; ++j) s += dst[-BPS + j];
++ Put8x8uv(s >> 3, dst);
++}
++static void DC8uvNoTopLeft_VSX(uint8_t* dst) { Put8x8uv(0x80, dst); }
++static void TM8uv_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u8x16 t = vec_xl(0, dst - BPS);
++ const i16x8 tl = (i16x8)vec_mergeh(t, zero);
++ const int c = dst[-BPS - 1];
++ int y;
++ for (y = 0; y < 8; ++y) {
++ const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
++ const u8x16 o = (u8x16)vec_packsu(vec_add(b, tl), vec_splats((short)0));
++ unsigned char bb[16];
++ memcpy(bb, &o, 16);
++ memcpy(dst + y * BPS, bb, 8);
++ }
++}
++
++//------------------------------------------------------------------------------
++// 4x4 luma intra prediction. Whole-vector byte shifts window the edge samples:
++// srli_si128(x,n) == vec_sld(zero, x, 16 - n)
++// slli_si128(x,n) == vec_sld(x, zero, n)
++
++#define SRLI(x, n) vec_sld(zero, (x), 16 - (n))
++#define SLLI(x, n) vec_sld((x), zero, (n))
++#define INS16(v, val, i) ((u8x16)vec_insert((short)(val), (i16x8)(v), (i)))
++#define AVG3C(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
++
++static WEBP_INLINE u8x16 Load64(const uint8_t* WEBP_RESTRICT p) {
++ unsigned char b[16] = {0};
++ memcpy(b, p, 8);
++ return vec_xl(0, b);
++}
++static WEBP_INLINE uint32_t GetWord(u8x16 v) {
++ unsigned char b[16];
++ uint32_t r;
++ memcpy(b, &v, 16);
++ memcpy(&r, b, 4);
++ return r;
++}
++static WEBP_INLINE u8x16 SetWord(uint32_t v) {
++ unsigned char b[16] = {0};
++ memcpy(b, &v, 4);
++ return vec_xl(0, b);
++}
++static WEBP_INLINE void StoreWord(uint32_t v, uint8_t* dst) {
++ memcpy(dst, &v, 4);
++}
++
++static void VE4_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++ const u8x16 A = Load64(dst - BPS - 1), B = SRLI(A, 1), C = SRLI(A, 2);
++ const u8x16 a = vec_avg(A, C), lsb = vec_and(vec_xor(A, C), one);
++ const u8x16 avg = vec_avg(vec_subs(a, lsb), B);
++ const uint32_t v = GetWord(avg);
++ int i;
++ for (i = 0; i < 4; ++i) StoreWord(v, dst + i * BPS);
++}
++static void LD4_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++ const u8x16 A = Load64(dst - BPS), B = SRLI(A, 1), C = SRLI(A, 2);
++ const u8x16 CH = INS16(C, dst[-BPS + 7], 3);
++ const u8x16 a1 = vec_avg(A, CH), lsb = vec_and(vec_xor(A, CH), one);
++ const u8x16 r = vec_avg(vec_subs(a1, lsb), B);
++ StoreWord(GetWord(r), dst + 0 * BPS);
++ StoreWord(GetWord(SRLI(r, 1)), dst + 1 * BPS);
++ StoreWord(GetWord(SRLI(r, 2)), dst + 2 * BPS);
++ StoreWord(GetWord(SRLI(r, 3)), dst + 3 * BPS);
++}
++static void VR4_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++ const int I = dst[-1 + 0 * BPS], J = dst[-1 + 1 * BPS], K = dst[-1 + 2 * BPS];
++ const int X = dst[-1 - BPS];
++ const u8x16 XA = Load64(dst - BPS - 1), A0 = SRLI(XA, 1);
++ const u8x16 abcd = vec_avg(XA, A0);
++ const u8x16 IX = INS16(SLLI(XA, 1), (I | (X << 8)), 0);
++ const u8x16 a1 = vec_avg(IX, A0), lsb = vec_and(vec_xor(IX, A0), one);
++ const u8x16 efgh = vec_avg(vec_subs(a1, lsb), XA);
++ StoreWord(GetWord(abcd), dst + 0 * BPS);
++ StoreWord(GetWord(efgh), dst + 1 * BPS);
++ StoreWord(GetWord(SLLI(abcd, 1)), dst + 2 * BPS);
++ StoreWord(GetWord(SLLI(efgh, 1)), dst + 3 * BPS);
++ dst[0 + 2 * BPS] = AVG3C(J, I, X);
++ dst[0 + 3 * BPS] = AVG3C(K, J, I);
++}
++static void VL4_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++ const u8x16 A = Load64(dst - BPS), B = SRLI(A, 1), C = SRLI(A, 2);
++ const u8x16 a1 = vec_avg(A, B), a2 = vec_avg(C, B), a3 = vec_avg(a1, a2);
++ const u8x16 lsb1 = vec_and(vec_xor(a1, a2), one);
++ const u8x16 abbc = vec_or(vec_xor(A, B), vec_xor(C, B));
++ const u8x16 a4 = vec_subs(a3, vec_and(abbc, lsb1));
++ const uint32_t extra = GetWord(SRLI(a4, 4));
++ StoreWord(GetWord(a1), dst + 0 * BPS);
++ StoreWord(GetWord(a4), dst + 1 * BPS);
++ StoreWord(GetWord(SRLI(a1, 1)), dst + 2 * BPS);
++ StoreWord(GetWord(SRLI(a4, 1)), dst + 3 * BPS);
++ dst[3 + 2 * BPS] = (extra >> 0) & 0xff;
++ dst[3 + 3 * BPS] = (extra >> 8) & 0xff;
++}
++static void RD4_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
++ const uint32_t I = dst[-1 + 0 * BPS], J = dst[-1 + 1 * BPS];
++ const uint32_t K = dst[-1 + 2 * BPS], L = dst[-1 + 3 * BPS];
++ const u8x16 XA = Load64(dst - BPS - 1);
++ const u8x16 all = vec_or(SetWord((uint32_t)(L | (K << 8) | (J << 16) | (I << 24))),
++ SLLI(XA, 4));
++ const u8x16 k1 = SRLI(all, 1), j2 = SRLI(all, 2);
++ const u8x16 a1 = vec_avg(j2, all), lsb = vec_and(vec_xor(j2, all), one);
++ const u8x16 r = vec_avg(vec_subs(a1, lsb), k1);
++ StoreWord(GetWord(r), dst + 3 * BPS);
++ StoreWord(GetWord(SRLI(r, 1)), dst + 2 * BPS);
++ StoreWord(GetWord(SRLI(r, 2)), dst + 1 * BPS);
++ StoreWord(GetWord(SRLI(r, 3)), dst + 0 * BPS);
++}
++static void TM4_VSX(uint8_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u8x16 t = Load64(dst - BPS);
++ const i16x8 tb = (i16x8)vec_mergeh(t, zero);
++ const int c = dst[-BPS - 1];
++ int y;
++ for (y = 0; y < 4; ++y) {
++ const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
++ const u8x16 o = (u8x16)vec_packsu(vec_add(b, tb), vec_splats((short)0));
++ StoreWord(GetWord(o), dst + y * BPS);
++ }
++}
++#undef SRLI
++#undef SLLI
++#undef INS16
++#undef AVG3C
++
++extern void VP8DspInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitVSX(void) {
++ VP8Transform = Transform_VSX;
++ VP8SimpleVFilter16 = SimpleVFilter16_VSX;
++ VP8SimpleVFilter16i = SimpleVFilter16i_VSX;
++ VP8SimpleHFilter16 = SimpleHFilter16_VSX;
++ VP8SimpleHFilter16i = SimpleHFilter16i_VSX;
++ VP8VFilter16 = VFilter16_VSX;
++ VP8VFilter16i = VFilter16i_VSX;
++ VP8HFilter16 = HFilter16_VSX;
++ VP8HFilter16i = HFilter16i_VSX;
++ VP8VFilter8 = VFilter8_VSX;
++ VP8VFilter8i = VFilter8i_VSX;
++ VP8HFilter8 = HFilter8_VSX;
++ VP8HFilter8i = HFilter8i_VSX;
++
++ VP8PredLuma16[0] = DC16_VSX;
++ VP8PredLuma16[1] = TM16_VSX;
++ VP8PredLuma16[2] = VE16_VSX;
++ VP8PredLuma16[3] = HE16_VSX;
++ VP8PredLuma16[4] = DC16NoTop_VSX;
++ VP8PredLuma16[5] = DC16NoLeft_VSX;
++ VP8PredLuma16[6] = DC16NoTopLeft_VSX;
++ VP8PredChroma8[0] = DC8uv_VSX;
++ VP8PredChroma8[1] = TM8uv_VSX;
++ VP8PredChroma8[2] = VE8uv_VSX;
++ VP8PredChroma8[4] = DC8uvNoTop_VSX;
++ VP8PredChroma8[5] = DC8uvNoLeft_VSX;
++ VP8PredChroma8[6] = DC8uvNoTopLeft_VSX;
++ VP8PredLuma4[1] = TM4_VSX;
++ VP8PredLuma4[2] = VE4_VSX;
++ VP8PredLuma4[4] = RD4_VSX;
++ VP8PredLuma4[5] = VR4_VSX;
++ VP8PredLuma4[6] = LD4_VSX;
++ VP8PredLuma4[7] = VL4_VSX;
++}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(VP8DspInitVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/filters.c b/media/libwebp/src/dsp/filters.c
+index 38da5252df3a..9962e1287402 100644
+--- a/media/libwebp/src/dsp/filters.c
++++ b/media/libwebp/src/dsp/filters.c
+@@ -217,6 +217,7 @@ extern void VP8FiltersInitMIPSdspR2(void);
+ extern void VP8FiltersInitMSA(void);
+ extern void VP8FiltersInitNEON(void);
+ extern void VP8FiltersInitSSE2(void);
++extern void VP8FiltersInitVSX(void);
+
+ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
+ WebPUnfilters[WEBP_FILTER_NONE] = NoneUnfilter_C;
+@@ -248,6 +249,11 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8FiltersInitMSA();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ VP8FiltersInitVSX();
++ }
+ #endif
+ }
+
+diff --git a/media/libwebp/src/dsp/filters_vsx.c b/media/libwebp/src/dsp/filters_vsx.c
+new file mode 100644
+index 000000000000..ae8e57ac685c
+--- /dev/null
++++ b/media/libwebp/src/dsp/filters_vsx.c
+@@ -0,0 +1,162 @@
++// Copyright 2015 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of filtering functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <assert.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned long long u64x2;
++
++// Byte-wise shifts of the whole 128-bit register, matching the little-endian
++// semantics of _mm_slli_si128 / _mm_srli_si128. 'n' must be a literal.
++#define SLLI(x, n) vec_sld((x), zero, (n))
++#define SRLI(x, n) vec_sld(zero, (x), 16 - (n))
++
++// Loads 8 bytes from 'p' into the low half of a vector (high half undefined).
++static WEBP_INLINE u8x16 Load8(const uint8_t* p) {
++ uint64_t v;
++ memcpy(&v, p, 8);
++ return (u8x16)vec_splats(v);
++}
++
++//------------------------------------------------------------------------------
++// Horizontal unfilter: out[i] = in[i] + out[i - 1] (a prefix sum).
++
++static void HorizontalUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
++ uint8_t* out, int width) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u64x2 sh56 = vec_splats((unsigned long long)56);
++ u8x16 last;
++ int i;
++ out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
++ if (width <= 1) return;
++ last = vec_insert(out[0], zero, 0);
++ for (i = 1; i + 8 <= width; i += 8) {
++ const u8x16 A0 = Load8(in + i);
++ const u8x16 A1 = vec_add(A0, last);
++ const u8x16 A2 = SLLI(A1, 1);
++ const u8x16 A3 = vec_add(A1, A2);
++ const u8x16 A4 = SLLI(A3, 2);
++ const u8x16 A5 = vec_add(A3, A4);
++ const u8x16 A6 = SLLI(A5, 4);
++ const u8x16 A7 = vec_add(A5, A6);
++ memcpy(out + i, &A7, 8);
++ last = (u8x16)vec_sr((u64x2)A7, sh56); // broadcast out[i + 7] to byte 0
++ }
++ for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
++}
++
++//------------------------------------------------------------------------------
++// Vertical unfilter: out[i] = in[i] + prev[i].
++
++static void VerticalUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
++ uint8_t* out, int width) {
++ if (prev == NULL) {
++ HorizontalUnfilter_VSX(NULL, in, out, width);
++ } else {
++ int i;
++ const int max_pos = width & ~31;
++ for (i = 0; i < max_pos; i += 32) {
++ const u8x16 A0 = vec_xl(0, (unsigned char*)&in[i + 0]);
++ const u8x16 A1 = vec_xl(0, (unsigned char*)&in[i + 16]);
++ const u8x16 B0 = vec_xl(0, (unsigned char*)&prev[i + 0]);
++ const u8x16 B1 = vec_xl(0, (unsigned char*)&prev[i + 16]);
++ vec_xst(vec_add(A0, B0), 0, (unsigned char*)&out[i + 0]);
++ vec_xst(vec_add(A1, B1), 0, (unsigned char*)&out[i + 16]);
++ }
++ for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
++ }
++}
++
++//------------------------------------------------------------------------------
++// Gradient unfilter: row[i] = in[i] + clip(row[i-1] + top[i] - top[i-1]).
++
++static WEBP_INLINE int GradientPredictor_VSX(uint8_t a, uint8_t b, uint8_t c) {
++ const int g = a + b - c;
++ return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;
++}
++
++static void GradientPredictInverse_VSX(const uint8_t* in, const uint8_t* top,
++ uint8_t* row, int length) {
++ if (length > 0) {
++ int i;
++ const int max_pos = length & ~7;
++ const u8x16 zero = vec_splats((unsigned char)0);
++ u8x16 A = vec_insert((unsigned char)row[-1], zero, 0); // left sample
++ for (i = 0; i < max_pos; i += 8) {
++ const u8x16 t0 = Load8(top + i);
++ const u8x16 t1 = Load8(top + i - 1);
++ const u16x8 B = (u16x8)vec_mergeh(t0, zero);
++ const u16x8 C = (u16x8)vec_mergeh(t1, zero);
++ const u8x16 D = Load8(in + i); // base input
++ const u16x8 E = vec_sub(B, C); // unclipped gradient basis b - c
++ u8x16 out = zero; // accumulator for output
++ u8x16 mask_hi = vec_insert((unsigned char)0xff, zero, 0);
++ int k = 8;
++ while (1) {
++ const u16x8 tmp3 = vec_add((u16x8)A, E); // delta = a + b - c
++ const u8x16 tmp4 = vec_packsu((i16x8)tmp3, (i16x8)zero); // sat. delta
++ const u8x16 tmp5 = vec_add(tmp4, D); // add to in[]
++ A = vec_and(tmp5, mask_hi); // keep new sample
++ out = vec_or(out, A); // accumulate output
++ if (--k == 0) break;
++ A = SLLI(A, 1); // rotate left sample
++ mask_hi = SLLI(mask_hi, 1); // rotate mask
++ A = (u8x16)vec_mergeh(A, zero); // convert 8b -> 16b
++ }
++ A = SRLI(A, 7); // prepare left sample for next iteration
++ memcpy(row + i, &out, 8);
++ }
++ for (; i < length; ++i) {
++ const int delta = GradientPredictor_VSX(row[i - 1], top[i], top[i - 1]);
++ row[i] = (uint8_t)(in[i] + delta);
++ }
++ }
++}
++
++static void GradientUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
++ uint8_t* out, int width) {
++ if (prev == NULL) {
++ HorizontalUnfilter_VSX(NULL, in, out, width);
++ } else {
++ out[0] = (uint8_t)(in[0] + prev[0]); // predict from above
++ GradientPredictInverse_VSX(in + 1, prev + 1, out + 1, width - 1);
++ }
++}
++
++#undef SLLI
++#undef SRLI
++
++//------------------------------------------------------------------------------
++
++extern void VP8FiltersInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitVSX(void) {
++ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_VSX;
++ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_VSX;
++ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_VSX;
++}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(VP8FiltersInitVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/lossless.c b/media/libwebp/src/dsp/lossless.c
+index 1a3d800c3fbc..48b5d4a3aedc 100644
+--- a/media/libwebp/src/dsp/lossless.c
++++ b/media/libwebp/src/dsp/lossless.c
+@@ -606,6 +606,7 @@ extern void VP8LDspInitAVX2(void);
+ extern void VP8LDspInitNEON(void);
+ extern void VP8LDspInitMIPSdspR2(void);
+ extern void VP8LDspInitMSA(void);
++extern void VP8LDspInitVSX(void);
+
+ #define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
+ (OUT)[0] = IN##0_C; \
+@@ -673,6 +674,11 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8LDspInitMSA();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ VP8LDspInitVSX();
++ }
+ #endif
+ }
+
+diff --git a/media/libwebp/src/dsp/lossless_vsx.c b/media/libwebp/src/dsp/lossless_vsx.c
+new file mode 100644
+index 000000000000..89da30c9589c
+--- /dev/null
++++ b/media/libwebp/src/dsp/lossless_vsx.c
+@@ -0,0 +1,449 @@
++// Copyright 2014 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of lossless functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/dsp/lossless.h"
++#include "src/dsp/lossless_common.h"
++#include "src/webp/format_constants.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++typedef __vector signed int i32x4;
++
++// Signed multiply-high of 16-bit lanes: (a * b) >> 16, matching
++// _mm_mulhi_epi16.
++static WEBP_INLINE i16x8 MulHiS16(i16x8 a, i16x8 b) {
++ const u32x4 sh = vec_splats((unsigned int)16);
++ const i32x4 e = vec_sra(vec_mule(a, b), sh);
++ const i32x4 o = vec_sra(vec_mulo(a, b), sh);
++ return (i16x8)vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++//------------------------------------------------------------------------------
++// Color transforms.
++
++static void AddGreenToBlueAndRed_VSX(const uint32_t* src, int num_pixels,
++ uint32_t* dst) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ // Replicate the green byte (offset 1 of each pixel) into the blue/red slots.
++ const u8x16 kSpreadGreen = {1, 16, 1, 16, 5, 16, 5, 16,
++ 9, 16, 9, 16, 13, 16, 13, 16};
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++ const u8x16 g = vec_perm(in, zero, kSpreadGreen); // 0 g 0 g per pixel
++ vec_xst((u32x4)vec_add(in, g), 0, &dst[i]);
++ }
++ if (i != num_pixels) {
++ VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
++ }
++}
++
++static void TransformColorInverse_VSX(const VP8LMultipliers* const m,
++ const uint32_t* src, int num_pixels,
++ uint32_t* dst) {
++// sign-extended multiplying constants, pre-shifted by 5 (see lossless_sse2.c).
++#define CST(X) (((int16_t)((m->X) << 8)) >> 5)
++ const i16x8 mults_rb =
++ (i16x8)vec_splats((int)(((uint32_t)(uint16_t)CST(green_to_red) << 16) |
++ ((uint16_t)CST(green_to_blue))));
++ const i16x8 mults_b2 =
++ (i16x8)vec_splats((int)((uint32_t)(uint16_t)CST(red_to_blue) << 16));
++#undef CST
++ const u8x16 zero = vec_splats((unsigned char)0);
++ const u32x4 mask_ag = vec_splats((uint32_t)0xff00ff00); // alpha/green
++ const u16x8 sh8_16 = vec_splats((unsigned short)8);
++ const u32x4 sh8_32 = vec_splats((unsigned int)8);
++ // Broadcast the green byte (offset 1) into the high byte of both 16-bit
++ // halves of each pixel: yields g << 8 in each lane.
++ const u8x16 kGreenHi = {16, 1, 16, 1, 16, 5, 16, 5,
++ 16, 9, 16, 9, 16, 13, 16, 13};
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++ const u8x16 A = (u8x16)vec_and((u32x4)in, mask_ag); // a 0 g 0
++ const i16x8 C = (i16x8)vec_perm(A, zero, kGreenHi); // g0g0 (g << 8)
++ const u8x16 D = (u8x16)MulHiS16(C, mults_rb); // x dr x db1
++ const u8x16 E = vec_add(in, D); // x r' x b'
++ const u16x8 F = vec_sl((u16x8)E, sh8_16); // r' 0 b' 0
++ const u8x16 G = (u8x16)MulHiS16((i16x8)F, mults_b2); // x db2 0 0
++ const u8x16 H = (u8x16)vec_sr((u32x4)G, sh8_32); // 0 x db2 0
++ const u16x8 I = (u16x8)vec_add(H, (u8x16)F); // r' x b'' 0
++ const u8x16 J = (u8x16)vec_sr(I, sh8_16); // 0 r' 0 b''
++ vec_xst(vec_or((u32x4)J, (u32x4)A), 0, &dst[i]);
++ }
++ if (i != num_pixels) {
++ VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
++ }
++}
++
++//------------------------------------------------------------------------------
++// Color-space conversion functions.
++
++static void ConvertBGRAToRGBA_VSX(const uint32_t* WEBP_RESTRICT src,
++ int num_pixels, uint8_t* WEBP_RESTRICT dst) {
++ // Swap the blue (offset 0) and red (offset 2) bytes of each pixel.
++ const u8x16 kSwapBR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++ vec_xst(vec_perm(in, in, kSwapBR), 0, &dst[4 * i]);
++ }
++ if (i != num_pixels) {
++ VP8LConvertBGRAToRGBA_C(src + i, num_pixels - i, dst + 4 * i);
++ }
++}
++
++static void ConvertBGRAToRGB_VSX(const uint32_t* WEBP_RESTRICT src,
++ int num_pixels, uint8_t* WEBP_RESTRICT dst) {
++ // BGRA -> RGB: gather R,G,B (offsets 2,1,0) of each pixel, drop alpha.
++ const u8x16 kToRGB = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0};
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++ const u8x16 out = vec_perm(in, in, kToRGB);
++ memcpy(&dst[3 * i], &out, 12);
++ }
++ if (i != num_pixels) {
++ VP8LConvertBGRAToRGB_C(src + i, num_pixels - i, dst + 3 * i);
++ }
++}
++
++static void ConvertBGRAToBGR_VSX(const uint32_t* WEBP_RESTRICT src,
++ int num_pixels, uint8_t* WEBP_RESTRICT dst) {
++ // BGRA -> BGR: gather B,G,R (offsets 0,1,2) of each pixel, drop alpha.
++ const u8x16 kToBGR = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0};
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
++ const u8x16 out = vec_perm(in, in, kToBGR);
++ memcpy(&dst[3 * i], &out, 12);
++ }
++ if (i != num_pixels) {
++ VP8LConvertBGRAToBGR_C(src + i, num_pixels - i, dst + 3 * i);
++ }
++}
++
++//------------------------------------------------------------------------------
++// Predictor transform.
++
++// Byte-wise shifts of the whole register (little-endian _mm_s{l,r}li_si128).
++#define SLLI(x, n) vec_sld((x), kZero, (n))
++#define SRLI(x, n) vec_sld(kZero, (x), 16 - (n))
++static const u8x16 kZero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
++
++// Per-byte floor average (a + b) >> 1, matching the C Average2().
++static WEBP_INLINE u8x16 Average2_u8(u8x16 a, u8x16 b) {
++ const u8x16 one = vec_splats((unsigned char)1);
++ const u8x16 avg1 = vec_avg(a, b); // (a + b + 1) >> 1
++ return vec_sub(avg1, vec_and(vec_xor(a, b), one));
++}
++
++static WEBP_INLINE u32x4 Lane0(uint32_t v) {
++ const u32x4 r = {v, 0, 0, 0};
++ return r;
++}
++
++// Single-pixel helpers operating on the low 32-bit lane only.
++static WEBP_INLINE u16x8 Unpack16(uint32_t a) {
++ return (u16x8)vec_mergeh((u8x16)Lane0(a), kZero);
++}
++
++static WEBP_INLINE uint32_t Average2_VSX(uint32_t a0, uint32_t a1) {
++ return vec_extract((u32x4)Average2_u8((u8x16)Lane0(a0), (u8x16)Lane0(a1)), 0);
++}
++
++static WEBP_INLINE u16x8 Average2_16(uint32_t a0, uint32_t a1) {
++ const u16x8 one = vec_splats((unsigned short)1);
++ return vec_sr(vec_add(Unpack16(a0), Unpack16(a1)), one);
++}
++
++static WEBP_INLINE uint32_t Average3_VSX(uint32_t a0, uint32_t a1,
++ uint32_t a2) {
++ const u16x8 one = vec_splats((unsigned short)1);
++ const u16x8 avg1 = Average2_16(a0, a2);
++ const u16x8 avg2 = vec_sr(vec_add(avg1, Unpack16(a1)), one);
++ return vec_extract((u32x4)vec_packsu((i16x8)avg2, (i16x8)avg2), 0);
++}
++
++static WEBP_INLINE uint32_t Average4_VSX(uint32_t a0, uint32_t a1, uint32_t a2,
++ uint32_t a3) {
++ const u16x8 one = vec_splats((unsigned short)1);
++ const u16x8 avg1 = Average2_16(a0, a1);
++ const u16x8 avg2 = Average2_16(a2, a3);
++ const u16x8 avg3 = vec_sr(vec_add(avg1, avg2), one);
++ return vec_extract((u32x4)vec_packsu((i16x8)avg3, (i16x8)avg3), 0);
++}
++
++static WEBP_INLINE uint32_t ClampedAddSubtractFull_VSX(uint32_t c0, uint32_t c1,
++ uint32_t c2) {
++ const i16x8 v =
++ vec_sub((i16x8)vec_add(Unpack16(c0), Unpack16(c1)), (i16x8)Unpack16(c2));
++ return vec_extract((u32x4)vec_packsu(v, v), 0);
++}
++
++static WEBP_INLINE uint32_t ClampedAddSubtractHalf_VSX(uint32_t c0, uint32_t c1,
++ uint32_t c2) {
++ const u16x8 one = vec_splats((unsigned short)1);
++ const u16x8 C0 = Unpack16(c0);
++ const u16x8 C1 = Unpack16(c1);
++ const u16x8 B0 = Unpack16(c2);
++ const u16x8 A0 = vec_sr(vec_add(C1, C0), one); // ave
++ const i16x8 A1 = vec_sub((i16x8)A0, (i16x8)B0);
++ const i16x8 BgtA = (i16x8)vec_cmpgt(B0, A0); // 0 or -1
++ const i16x8 A2 = vec_sub(A1, BgtA);
++ const i16x8 A3 = vec_sra(A2, one);
++ const i16x8 A4 = vec_add((i16x8)A0, A3);
++ return vec_extract((u32x4)vec_packsu(A4, A4), 0);
++}
++
++static WEBP_INLINE uint32_t Select_VSX(uint32_t a, uint32_t b, uint32_t c) {
++ const u8x16 A = (u8x16)Lane0(a);
++ const u8x16 B = (u8x16)Lane0(b);
++ const u8x16 C = (u8x16)Lane0(c);
++ const u32x4 sa = vec_sum4s(vec_or(vec_subs(A, C), vec_subs(C, A)),
++ vec_splats((unsigned int)0));
++ const u32x4 sb = vec_sum4s(vec_or(vec_subs(B, C), vec_subs(C, B)),
++ vec_splats((unsigned int)0));
++ return vec_extract((u32x4)vec_cmpgt(sb, sa), 0) ? b : a;
++}
++
++static uint32_t Predictor5_VSX(const uint32_t* const left,
++ const uint32_t* const top) {
++ return Average3_VSX(*left, top[0], top[1]);
++}
++static uint32_t Predictor6_VSX(const uint32_t* const left,
++ const uint32_t* const top) {
++ return Average2_VSX(*left, top[-1]);
++}
++static uint32_t Predictor7_VSX(const uint32_t* const left,
++ const uint32_t* const top) {
++ return Average2_VSX(*left, top[0]);
++}
++static uint32_t Predictor13_VSX(const uint32_t* const left,
++ const uint32_t* const top) {
++ return ClampedAddSubtractHalf_VSX(*left, top[0], top[-1]);
++}
++
++static void PredictorAdd0_VSX(const uint32_t* in, const uint32_t* upper,
++ int num_pixels, uint32_t* WEBP_RESTRICT out) {
++ const u8x16 black = (u8x16)vec_splats((uint32_t)ARGB_BLACK);
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++ vec_xst((u32x4)vec_add(src, black), 0, &out[i]);
++ }
++ if (i != num_pixels) {
++ VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
++ }
++ (void)upper;
++}
++
++static void PredictorAdd1_VSX(const uint32_t* in, const uint32_t* upper,
++ int num_pixels, uint32_t* WEBP_RESTRICT out) {
++ u32x4 prev = vec_splats(out[-1]);
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++ const u8x16 sum0 = vec_add(src, SLLI(src, 4)); // a | a+b | b+c | c+d
++ const u8x16 sum1 = vec_add(sum0, SLLI(sum0, 8)); // running sum
++ const u8x16 res = vec_add(sum1, (u8x16)prev);
++ vec_xst((u32x4)res, 0, &out[i]);
++ prev = vec_splat((u32x4)res, 3); // replicate last pixel
++ }
++ if (i != num_pixels) {
++ VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
++ }
++}
++
++#define GENERATE_PREDICTOR_1_VSX(X, IN) \
++ static void PredictorAdd##X##_VSX(const uint32_t* in, const uint32_t* upper, \
++ int num_pixels, \
++ uint32_t* WEBP_RESTRICT out) { \
++ int i; \
++ for (i = 0; i + 4 <= num_pixels; i += 4) { \
++ const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]); \
++ const u8x16 other = (u8x16)vec_xl(0, (uint32_t*)&(IN)); \
++ vec_xst((u32x4)vec_add(src, other), 0, &out[i]); \
++ } \
++ if (i != num_pixels) { \
++ VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
++ } \
++ }
++GENERATE_PREDICTOR_1_VSX(2, upper[i]) // Top.
++GENERATE_PREDICTOR_1_VSX(3, upper[i + 1]) // Top-right.
++GENERATE_PREDICTOR_1_VSX(4, upper[i - 1]) // Top-left.
++#undef GENERATE_PREDICTOR_1_VSX
++
++// Predictors 5, 6, 7, 13 use integer averages and cannot be accumulated in
++// parallel, so use the generic one-pixel-at-a-time batch.
++GENERATE_PREDICTOR_ADD(Predictor5_VSX, PredictorAdd5_VSX)
++GENERATE_PREDICTOR_ADD(Predictor6_VSX, PredictorAdd6_VSX)
++GENERATE_PREDICTOR_ADD(Predictor7_VSX, PredictorAdd7_VSX)
++GENERATE_PREDICTOR_ADD(Predictor13_VSX, PredictorAdd13_VSX)
++
++#define GENERATE_PREDICTOR_2_VSX(X, IN) \
++ static void PredictorAdd##X##_VSX(const uint32_t* in, const uint32_t* upper, \
++ int num_pixels, \
++ uint32_t* WEBP_RESTRICT out) { \
++ int i; \
++ for (i = 0; i + 4 <= num_pixels; i += 4) { \
++ const u8x16 Tother = (u8x16)vec_xl(0, (uint32_t*)&(IN)); \
++ const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]); \
++ const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]); \
++ vec_xst((u32x4)vec_add(Average2_u8(T, Tother), src), 0, &out[i]); \
++ } \
++ if (i != num_pixels) { \
++ VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
++ } \
++ }
++GENERATE_PREDICTOR_2_VSX(8, upper[i - 1]) // Average TL, T.
++GENERATE_PREDICTOR_2_VSX(9, upper[i + 1]) // Average T, TR.
++#undef GENERATE_PREDICTOR_2_VSX
++
++// Predictor10: average of (average(L, TL), average(T, TR)).
++static void PredictorAdd10_VSX(const uint32_t* in, const uint32_t* upper,
++ int num_pixels, uint32_t* WEBP_RESTRICT out) {
++ u8x16 L = (u8x16)Lane0(out[-1]);
++ int i, k;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++ u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
++ const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
++ const u8x16 TR = (u8x16)vec_xl(0, (uint32_t*)&upper[i + 1]);
++ u8x16 avgTTR = Average2_u8(T, TR);
++ for (k = 0; k < 4; ++k) {
++ const u8x16 avg = Average2_u8(avgTTR, Average2_u8(L, TL));
++ L = vec_add(avg, src);
++ out[i + k] = vec_extract((u32x4)L, 0);
++ avgTTR = SRLI(avgTTR, 4);
++ TL = SRLI(TL, 4);
++ src = SRLI(src, 4);
++ }
++ }
++ if (i != num_pixels) {
++ VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
++ }
++}
++
++// Predictor11: select between T and L based on |T-TL| vs |L-TL|.
++static void PredictorAdd11_VSX(const uint32_t* in, const uint32_t* upper,
++ int num_pixels, uint32_t* WEBP_RESTRICT out) {
++ const u32x4 z32 = vec_splats((unsigned int)0);
++ u8x16 L = (u8x16)Lane0(out[-1]);
++ int i, k;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
++ u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
++ u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++ u8x16 pa = (u8x16)vec_sum4s(vec_or(vec_subs(T, TL), vec_subs(TL, T)), z32);
++ for (k = 0; k < 4; ++k) {
++ const u32x4 pb = vec_sum4s(vec_or(vec_subs(L, TL), vec_subs(TL, L)), z32);
++ const u32x4 mask = (u32x4)vec_cmpgt(pb, (u32x4)pa); // pb > pa ? L : T
++ const u8x16 pred = vec_sel(T, L, (u8x16)mask);
++ L = vec_add(src, pred);
++ out[i + k] = vec_extract((u32x4)L, 0);
++ T = SRLI(T, 4);
++ TL = SRLI(TL, 4);
++ src = SRLI(src, 4);
++ pa = SRLI(pa, 4);
++ }
++ }
++ if (i != num_pixels) {
++ VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
++ }
++}
++
++// Predictor12: ClampedAddSubtractFull. 'L' is kept unpacked to 16 bits in the
++// low 4 lanes; 'diff' (= T - TL) holds two pixels, the active one in lanes 0-3.
++#define DO_PRED12(DIFF) \
++ do { \
++ const i16x8 all = vec_add((i16x8)L, (DIFF)); \
++ const u8x16 res = vec_add(src, vec_packsu(all, all)); \
++ out[i + out_idx++] = vec_extract((u32x4)res, 0); \
++ L = (u16x8)vec_mergeh(res, kZero); \
++ } while (0)
++
++static void PredictorAdd12_VSX(const uint32_t* in, const uint32_t* upper,
++ int num_pixels, uint32_t* WEBP_RESTRICT out) {
++ u16x8 L = Unpack16(out[-1]);
++ int i;
++ for (i = 0; i + 4 <= num_pixels; i += 4) {
++ int out_idx = 0;
++ u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
++ const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
++ const u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
++ // 16-bit gradient basis T - TL for the four pixels (low and high halves).
++ i16x8 diff_lo =
++ vec_sub((i16x8)vec_mergeh(T, kZero), (i16x8)vec_mergeh(TL, kZero));
++ i16x8 diff_hi =
++ vec_sub((i16x8)vec_mergel(T, kZero), (i16x8)vec_mergel(TL, kZero));
++ DO_PRED12(diff_lo);
++ diff_lo = (i16x8)SRLI((u8x16)diff_lo, 8);
++ src = SRLI(src, 4);
++ DO_PRED12(diff_lo);
++ src = SRLI(src, 4);
++ DO_PRED12(diff_hi);
++ diff_hi = (i16x8)SRLI((u8x16)diff_hi, 8);
++ src = SRLI(src, 4);
++ DO_PRED12(diff_hi);
++ }
++ if (i != num_pixels) {
++ VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
++ }
++}
++#undef DO_PRED12
++
++#undef SLLI
++#undef SRLI
++
++//------------------------------------------------------------------------------
++
++extern void VP8LDspInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitVSX(void) {
++ VP8LPredictorsAdd[0] = PredictorAdd0_VSX;
++ VP8LPredictorsAdd[1] = PredictorAdd1_VSX;
++ VP8LPredictorsAdd[2] = PredictorAdd2_VSX;
++ VP8LPredictorsAdd[3] = PredictorAdd3_VSX;
++ VP8LPredictorsAdd[4] = PredictorAdd4_VSX;
++ VP8LPredictorsAdd[5] = PredictorAdd5_VSX;
++ VP8LPredictorsAdd[6] = PredictorAdd6_VSX;
++ VP8LPredictorsAdd[7] = PredictorAdd7_VSX;
++ VP8LPredictorsAdd[8] = PredictorAdd8_VSX;
++ VP8LPredictorsAdd[9] = PredictorAdd9_VSX;
++ VP8LPredictorsAdd[10] = PredictorAdd10_VSX;
++ VP8LPredictorsAdd[11] = PredictorAdd11_VSX;
++ VP8LPredictorsAdd[12] = PredictorAdd12_VSX;
++ VP8LPredictorsAdd[13] = PredictorAdd13_VSX;
++
++ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_VSX;
++ VP8LTransformColorInverse = TransformColorInverse_VSX;
++ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_VSX;
++ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_VSX;
++ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_VSX;
++}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(VP8LDspInitVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/moz.build b/media/libwebp/src/dsp/moz.build
+index 8d6f8427c900..f3e9d1273110 100644
+--- a/media/libwebp/src/dsp/moz.build
++++ b/media/libwebp/src/dsp/moz.build
+@@ -118,6 +118,20 @@ elif CONFIG['TARGET_CPU'].startswith('mips'):
+ 'yuv_mips32.c',
+ 'yuv_mips_dsp_r2.c',
+ ]
++elif CONFIG['TARGET_CPU'] == 'ppc64':
++ SOURCES += [
++ 'alpha_processing_vsx.c',
++ 'dec_vsx.c',
++ 'filters_vsx.c',
++ 'lossless_vsx.c',
++ 'rescaler_vsx.c',
++ 'upsampling_vsx.c',
++ 'yuv_vsx.c',
++ ]
++ DEFINES['WEBP_HAVE_VSX'] = 1;
++ for f in SOURCES:
++ if f.endswith('vsx.c'):
++ SOURCES[f].flags += ['-mvsx']
+
+ if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
+ CFLAGS += ['-Wno-unreachable-code']
+diff --git a/media/libwebp/src/dsp/rescaler.c b/media/libwebp/src/dsp/rescaler.c
+index eafccd442f25..2c0c8c47a7a3 100644
+--- a/media/libwebp/src/dsp/rescaler.c
++++ b/media/libwebp/src/dsp/rescaler.c
+@@ -207,6 +207,7 @@ extern void WebPRescalerDspInitMIPS32(void);
+ extern void WebPRescalerDspInitMIPSdspR2(void);
+ extern void WebPRescalerDspInitMSA(void);
+ extern void WebPRescalerDspInitNEON(void);
++extern void WebPRescalerDspInitVSX(void);
+
+ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
+ #if !defined(WEBP_REDUCE_SIZE)
+@@ -238,6 +239,11 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
+ if (VP8GetCPUInfo(kMSA)) {
+ WebPRescalerDspInitMSA();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ WebPRescalerDspInitVSX();
++ }
+ #endif
+ }
+
+diff --git a/media/libwebp/src/dsp/rescaler_vsx.c b/media/libwebp/src/dsp/rescaler_vsx.c
+new file mode 100644
+index 000000000000..002f232d647a
+--- /dev/null
++++ b/media/libwebp/src/dsp/rescaler_vsx.c
+@@ -0,0 +1,201 @@
++// Copyright 2015 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of rescaling functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX) && !defined(WEBP_REDUCE_SIZE)
++
++#include <altivec.h>
++#include <assert.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/utils/rescaler_utils.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++typedef __vector signed int i32x4;
++typedef __vector unsigned long long u64x2;
++
++#define ROUNDER (WEBP_RESCALER_ONE >> 1)
++#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
++#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
++
++#if (WEBP_RESCALER_RFIX != 32)
++#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
++#endif
++
++// Returns (x * scale + ROUNDER) >> 32 for each of the four 32-bit lanes.
++static WEBP_INLINE u32x4 MultFix_VSX(u32x4 x, uint32_t scale) {
++ const u64x2 rounder = vec_splats((unsigned long long)ROUNDER);
++ const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
++ const u32x4 s = vec_splats(scale);
++ // vec_mule/vec_mulo produce the 32x32->64 products of the even (0, 2) and
++ // odd (1, 3) lanes respectively.
++ u64x2 e = vec_add(vec_mule(x, s), rounder);
++ u64x2 o = vec_add(vec_mulo(x, s), rounder);
++ e = vec_sr(e, shift);
++ o = vec_sr(o, shift);
++ return vec_mergee((u32x4)e, (u32x4)o);
++}
++
++// Returns (x * scale) >> 32 for each lane (no rounding).
++static WEBP_INLINE u32x4 MultFixFloor_VSX(u32x4 x, uint32_t scale) {
++ const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
++ const u32x4 s = vec_splats(scale);
++ u64x2 e = vec_sr(vec_mule(x, s), shift);
++ u64x2 o = vec_sr(vec_mulo(x, s), shift);
++ return vec_mergee((u32x4)e, (u32x4)o);
++}
++
++// Returns (A * frow + B * irow + ROUNDER) >> 32 for each lane.
++static WEBP_INLINE u32x4 Interpolate_VSX(const rescaler_t* WEBP_RESTRICT frow,
++ const rescaler_t* WEBP_RESTRICT irow,
++ uint32_t A, uint32_t B) {
++ const u64x2 rounder = vec_splats((unsigned long long)ROUNDER);
++ const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
++ const u32x4 f = vec_xl(0, (uint32_t*)frow);
++ const u32x4 ir = vec_xl(0, (uint32_t*)irow);
++ const u32x4 va = vec_splats(A);
++ const u32x4 vb = vec_splats(B);
++ u64x2 e = vec_add(vec_mule(f, va), vec_mule(ir, vb));
++ u64x2 o = vec_add(vec_mulo(f, va), vec_mulo(ir, vb));
++ e = vec_sr(vec_add(e, rounder), shift);
++ o = vec_sr(vec_add(o, rounder), shift);
++ return vec_mergee((u32x4)e, (u32x4)o);
++}
++
++// Saturated pack of two 32-bit lane vectors (8 values) into 8 bytes at dst.
++static WEBP_INLINE void Store8_VSX(u32x4 lo, u32x4 hi, uint8_t* dst) {
++ const i16x8 s16 = vec_packs((i32x4)lo, (i32x4)hi);
++ const u8x16 s8 = vec_packsu(s16, s16);
++ memcpy(dst, &s8, 8);
++}
++
++static void RescalerExportRowExpand_VSX(WebPRescaler* const wrk) {
++ int x_out;
++ uint8_t* const dst = wrk->dst;
++ rescaler_t* const irow = wrk->irow;
++ const int x_out_max = wrk->dst_width * wrk->num_channels;
++ const int max_span = x_out_max & ~7;
++ const rescaler_t* const frow = wrk->frow;
++ const uint32_t fy_scale = wrk->fy_scale;
++ assert(!WebPRescalerOutputDone(wrk));
++ assert(wrk->y_accum <= 0);
++ assert(wrk->y_expand);
++ assert(wrk->y_sub != 0);
++ if (wrk->y_accum == 0) {
++ for (x_out = 0; x_out < max_span; x_out += 8) {
++ const u32x4 A0 = vec_xl(0, (uint32_t*)(frow + x_out + 0));
++ const u32x4 A1 = vec_xl(0, (uint32_t*)(frow + x_out + 4));
++ const u32x4 B0 = MultFix_VSX(A0, fy_scale);
++ const u32x4 B1 = MultFix_VSX(A1, fy_scale);
++ Store8_VSX(B0, B1, dst + x_out);
++ }
++ for (; x_out < x_out_max; ++x_out) {
++ const uint32_t J = frow[x_out];
++ const int v = (int)MULT_FIX_C(J, fy_scale);
++ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++ }
++ } else {
++ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
++ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
++ for (x_out = 0; x_out < max_span; x_out += 8) {
++ const u32x4 C0 =
++ Interpolate_VSX(frow + x_out + 0, irow + x_out + 0, A, B);
++ const u32x4 C1 =
++ Interpolate_VSX(frow + x_out + 4, irow + x_out + 4, A, B);
++ const u32x4 D0 = MultFix_VSX(C0, fy_scale);
++ const u32x4 D1 = MultFix_VSX(C1, fy_scale);
++ Store8_VSX(D0, D1, dst + x_out);
++ }
++ for (; x_out < x_out_max; ++x_out) {
++ const uint64_t I = (uint64_t)A * frow[x_out] + (uint64_t)B * irow[x_out];
++ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
++ const int v = (int)MULT_FIX_C(J, fy_scale);
++ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++ }
++ }
++}
++
++static void RescalerExportRowShrink_VSX(WebPRescaler* const wrk) {
++ int x_out;
++ uint8_t* const dst = wrk->dst;
++ rescaler_t* const irow = wrk->irow;
++ const int x_out_max = wrk->dst_width * wrk->num_channels;
++ const int max_span = x_out_max & ~7;
++ const rescaler_t* const frow = wrk->frow;
++ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
++ const uint32_t fxy_scale = wrk->fxy_scale;
++ assert(!WebPRescalerOutputDone(wrk));
++ assert(wrk->y_accum <= 0);
++ assert(!wrk->y_expand);
++ if (yscale) {
++ for (x_out = 0; x_out < max_span; x_out += 8) {
++ const u32x4 in0 = vec_xl(0, (uint32_t*)(frow + x_out + 0));
++ const u32x4 in1 = vec_xl(0, (uint32_t*)(frow + x_out + 4));
++ const u32x4 in2 = vec_xl(0, (uint32_t*)(irow + x_out + 0));
++ const u32x4 in3 = vec_xl(0, (uint32_t*)(irow + x_out + 4));
++ const u32x4 A0 = MultFixFloor_VSX(in0, yscale);
++ const u32x4 A1 = MultFixFloor_VSX(in1, yscale);
++ const u32x4 B0 = vec_sub(in2, A0);
++ const u32x4 B1 = vec_sub(in3, A1);
++ const u32x4 C0 = MultFix_VSX(B0, fxy_scale);
++ const u32x4 C1 = MultFix_VSX(B1, fxy_scale);
++ Store8_VSX(C0, C1, dst + x_out);
++ vec_xst(A0, 0, (uint32_t*)(irow + x_out + 0));
++ vec_xst(A1, 0, (uint32_t*)(irow + x_out + 4));
++ }
++ for (; x_out < x_out_max; ++x_out) {
++ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
++ const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
++ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++ irow[x_out] = frac; // new fractional start
++ }
++ } else {
++ const u32x4 zero = vec_splats((uint32_t)0);
++ for (x_out = 0; x_out < max_span; x_out += 8) {
++ const u32x4 in0 = vec_xl(0, (uint32_t*)(irow + x_out + 0));
++ const u32x4 in1 = vec_xl(0, (uint32_t*)(irow + x_out + 4));
++ const u32x4 A0 = MultFix_VSX(in0, fxy_scale);
++ const u32x4 A1 = MultFix_VSX(in1, fxy_scale);
++ Store8_VSX(A0, A1, dst + x_out);
++ vec_xst(zero, 0, (uint32_t*)(irow + x_out + 0));
++ vec_xst(zero, 0, (uint32_t*)(irow + x_out + 4));
++ }
++ for (; x_out < x_out_max; ++x_out) {
++ const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
++ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
++ irow[x_out] = 0;
++ }
++ }
++}
++
++#undef MULT_FIX_FLOOR_C
++#undef MULT_FIX_C
++#undef ROUNDER
++
++//------------------------------------------------------------------------------
++
++extern void WebPRescalerDspInitVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitVSX(void) {
++ WebPRescalerExportRowExpand = RescalerExportRowExpand_VSX;
++ WebPRescalerExportRowShrink = RescalerExportRowShrink_VSX;
++}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPRescalerDspInitVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/upsampling.c b/media/libwebp/src/dsp/upsampling.c
+index c57f66c3553f..faecdf277393 100644
+--- a/media/libwebp/src/dsp/upsampling.c
++++ b/media/libwebp/src/dsp/upsampling.c
+@@ -235,6 +235,7 @@ extern VP8CPUInfo VP8GetCPUInfo;
+ extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+ extern void WebPInitYUV444ConvertersSSE2(void);
+ extern void WebPInitYUV444ConvertersSSE41(void);
++extern void WebPInitYUV444ConvertersVSX(void);
+
+ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
+ WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
+@@ -264,6 +265,11 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitYUV444ConvertersMIPSdspR2();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ WebPInitYUV444ConvertersVSX();
++ }
+ #endif
+ }
+ }
+@@ -276,6 +282,7 @@ extern void WebPInitUpsamplersSSE41(void);
+ extern void WebPInitUpsamplersNEON(void);
+ extern void WebPInitUpsamplersMIPSdspR2(void);
+ extern void WebPInitUpsamplersMSA(void);
++extern void WebPInitUpsamplersVSX(void);
+
+ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
+ #ifdef FANCY_UPSAMPLING
+@@ -314,6 +321,11 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
+ if (VP8GetCPUInfo(kMSA)) {
+ WebPInitUpsamplersMSA();
+ }
++#endif
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ WebPInitUpsamplersVSX();
++ }
+ #endif
+ }
+
+diff --git a/media/libwebp/src/dsp/upsampling_vsx.c b/media/libwebp/src/dsp/upsampling_vsx.c
+new file mode 100644
+index 000000000000..a7191972fc6e
+--- /dev/null
++++ b/media/libwebp/src/dsp/upsampling_vsx.c
+@@ -0,0 +1,151 @@
++// Copyright 2011 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of YUV to RGB upsampling functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <assert.h>
++#include <string.h>
++
++#include "src/dsp/cpu.h"
++#include "src/dsp/yuv.h"
++#include "src/webp/decode.h"
++#include "src/webp/types.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++
++// Upsample 16 chroma pairs from rows r1/r2 (17 readable bytes each) into 32
++// "top" bytes at out[0..31] and 32 "bottom" bytes at out[64..95], matching the
++// fancy-upsampler diagonal weights (a + 3b + 3c + d) / 8 etc.
++#define GET_M(ij, in) \
++ vec_sub(vec_avg(k, (in)), \
++ vec_and(vec_or(vec_and((ij), st), vec_xor(k, (in))), one))
++
++static void Upsample32Pixels(const uint8_t* WEBP_RESTRICT r1,
++ const uint8_t* WEBP_RESTRICT r2,
++ uint8_t* WEBP_RESTRICT out) {
++ const u8x16 one = vec_splats((unsigned char)1);
++ const u8x16 a = vec_xl(0, (const unsigned char*)r1);
++ const u8x16 b = vec_xl(1, (const unsigned char*)r1);
++ const u8x16 c = vec_xl(0, (const unsigned char*)r2);
++ const u8x16 d = vec_xl(1, (const unsigned char*)r2);
++ const u8x16 s = vec_avg(a, d);
++ const u8x16 t = vec_avg(b, c);
++ const u8x16 st = vec_xor(s, t);
++ const u8x16 t3 =
++ vec_and(vec_or(vec_or(vec_xor(a, d), vec_xor(b, c)), st), one);
++ const u8x16 k = vec_sub(vec_avg(s, t), t3);
++ const u8x16 diag1 = GET_M(vec_xor(b, c), t);
++ const u8x16 diag2 = GET_M(vec_xor(a, d), s);
++ const u8x16 ta = vec_avg(a, diag1), tb = vec_avg(b, diag2);
++ const u8x16 tc = vec_avg(c, diag2), td = vec_avg(d, diag1);
++ vec_xst(vec_mergeh(ta, tb), 0, out);
++ vec_xst(vec_mergel(ta, tb), 0, out + 16);
++ vec_xst(vec_mergeh(tc, td), 0, out + 64);
++ vec_xst(vec_mergel(tc, td), 0, out + 80);
++}
++
++#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, FUNC32) \
++static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
++ const uint8_t* WEBP_RESTRICT bottom_y, \
++ const uint8_t* WEBP_RESTRICT top_u, \
++ const uint8_t* WEBP_RESTRICT top_v, \
++ const uint8_t* WEBP_RESTRICT cur_u, \
++ const uint8_t* WEBP_RESTRICT cur_v, \
++ uint8_t* WEBP_RESTRICT top_dst, \
++ uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
++ int uv_pos, pos; \
++ uint8_t uv_buf[14 * 32 + 15] = {0}; \
++ uint8_t* const r_u = (uint8_t*)(((uintptr_t)(uv_buf + 15)) & ~(uintptr_t)15);\
++ uint8_t* const r_v = r_u + 32; \
++ assert(top_y != NULL); \
++ { \
++ const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
++ const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
++ FUNC(top_y[0], (top_u[0] + u_diag) >> 1, (top_v[0] + v_diag) >> 1, \
++ top_dst); \
++ if (bottom_y != NULL) { \
++ FUNC(bottom_y[0], (cur_u[0] + u_diag) >> 1, (cur_v[0] + v_diag) >> 1, \
++ bottom_dst); \
++ } \
++ } \
++ for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
++ Upsample32Pixels(top_u + uv_pos, cur_u + uv_pos, r_u); \
++ Upsample32Pixels(top_v + uv_pos, cur_v + uv_pos, r_v); \
++ FUNC32(top_y + pos, r_u, r_v, top_dst + pos * 4); \
++ if (bottom_y != NULL) { \
++ FUNC32(bottom_y + pos, r_u + 64, r_v + 64, bottom_dst + pos * 4); \
++ } \
++ } \
++ if (len > 1) { \
++ const int left_over = ((len + 1) >> 1) - (pos >> 1); \
++ uint8_t* const tmp_top_dst = r_u + 4 * 32; \
++ uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
++ uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
++ uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
++ uint8_t r1[17], r2[17]; \
++ assert(left_over > 0); \
++ memcpy(r1, top_u + uv_pos, left_over); \
++ memcpy(r2, cur_u + uv_pos, left_over); \
++ memset(r1 + left_over, r1[left_over - 1], 17 - left_over); \
++ memset(r2 + left_over, r2[left_over - 1], 17 - left_over); \
++ Upsample32Pixels(r1, r2, r_u); \
++ memcpy(r1, top_v + uv_pos, left_over); \
++ memcpy(r2, cur_v + uv_pos, left_over); \
++ memset(r1 + left_over, r1[left_over - 1], 17 - left_over); \
++ memset(r2 + left_over, r2[left_over - 1], 17 - left_over); \
++ Upsample32Pixels(r1, r2, r_v); \
++ memcpy(tmp_top, top_y + pos, len - pos); \
++ if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
++ FUNC32(tmp_top, r_u, r_v, tmp_top_dst); \
++ if (bottom_y != NULL) FUNC32(tmp_bottom, r_u + 64, r_v + 64, \
++ tmp_bottom_dst); \
++ memcpy(top_dst + pos * 4, tmp_top_dst, (len - pos) * 4); \
++ if (bottom_y != NULL) { \
++ memcpy(bottom_dst + pos * 4, tmp_bottom_dst, (len - pos) * 4); \
++ } \
++ } \
++}
++
++UPSAMPLE_FUNC(UpsampleRgbaLinePair_VSX, VP8YuvToRgba, VP8YuvToRgba32_VSX)
++UPSAMPLE_FUNC(UpsampleBgraLinePair_VSX, VP8YuvToBgra, VP8YuvToBgra32_VSX)
++UPSAMPLE_FUNC(UpsampleArgbLinePair_VSX, VP8YuvToArgb, VP8YuvToArgb32_VSX)
++
++extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
++
++extern void WebPInitUpsamplersVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersVSX(void) {
++ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_VSX;
++ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_VSX;
++ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_VSX;
++ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_VSX;
++#if !defined(WEBP_REDUCE_CSP)
++ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_VSX;
++ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_VSX;
++#endif
++}
++
++extern void WebPInitYUV444ConvertersVSX(void);
++
++// YUV444 point converters stay on the C path for now.
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersVSX(void) {}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersVSX)
++
++WEBP_DSP_INIT_STUB(WebPInitUpsamplersVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/dsp/yuv.c b/media/libwebp/src/dsp/yuv.c
+index 62f1ecc1567d..9a95c5de1e23 100644
+--- a/media/libwebp/src/dsp/yuv.c
++++ b/media/libwebp/src/dsp/yuv.c
+@@ -81,6 +81,7 @@ extern void WebPInitSamplersSSE2(void);
+ extern void WebPInitSamplersSSE41(void);
+ extern void WebPInitSamplersMIPS32(void);
+ extern void WebPInitSamplersMIPSdspR2(void);
++extern void WebPInitSamplersVSX(void);
+
+ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow;
+@@ -117,6 +118,11 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
+ WebPInitSamplersMIPSdspR2();
+ }
+ #endif // WEBP_USE_MIPS_DSP_R2
++#if defined(WEBP_HAVE_VSX)
++ if (VP8GetCPUInfo(kVSX)) {
++ WebPInitSamplersVSX();
++ }
++#endif
+ }
+ }
+
+diff --git a/media/libwebp/src/dsp/yuv.h b/media/libwebp/src/dsp/yuv.h
+index 6f218cf7e07f..979891d3232d 100644
+--- a/media/libwebp/src/dsp/yuv.h
++++ b/media/libwebp/src/dsp/yuv.h
+@@ -182,6 +182,27 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
+
+ #endif // WEBP_USE_SSE2
+
++//-----------------------------------------------------------------------------
++// VSX extra functions (mostly for upsampling_vsx.c)
++
++#if defined(WEBP_USE_VSX)
++
++// Process 32 pixels and store the 32b-per-pixel result in *dst.
++void VP8YuvToRgba32_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst);
++void VP8YuvToBgra32_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst);
++void VP8YuvToArgb32_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst);
++
++#endif // WEBP_USE_VSX
++
+ //-----------------------------------------------------------------------------
+ // SSE41 extra functions (mostly for upsampling_sse41.c)
+
+diff --git a/media/libwebp/src/dsp/yuv_vsx.c b/media/libwebp/src/dsp/yuv_vsx.c
+new file mode 100644
+index 000000000000..1fdc5c80ba16
+--- /dev/null
++++ b/media/libwebp/src/dsp/yuv_vsx.c
+@@ -0,0 +1,206 @@
++// Copyright 2014 Google Inc. All Rights Reserved.
++//
++// Use of this source code is governed by a BSD-style license
++// that can be found in the COPYING file in the root of the source
++// tree. An additional intellectual property rights grant can be found
++// in the file PATENTS. All contributing project authors may
++// be found in the AUTHORS file in the root of the source tree.
++// -----------------------------------------------------------------------------
++//
++// VSX (PowerPC) version of YUV->RGB conversion functions.
++
++#include "src/dsp/dsp.h"
++
++#if defined(WEBP_USE_VSX)
++
++#include <altivec.h>
++#include <string.h>
++
++#include "src/dsp/yuv.h"
++
++typedef __vector unsigned char u8x16;
++typedef __vector unsigned short u16x8;
++typedef __vector signed short i16x8;
++typedef __vector unsigned int u32x4;
++
++// POWER8 has no "multiply-high unsigned halfword", so emulate _mm_mulhi_epu16
++// via even/odd 16x16->32 products, >>16, then interleave back.
++static WEBP_INLINE u16x8 MulHi16(u16x8 a, u16x8 b) {
++ const u32x4 sh = vec_splats((unsigned int)16);
++ const u32x4 e = vec_sr(vec_mule(a, b), sh);
++ const u32x4 o = vec_sr(vec_mulo(a, b), sh);
++ return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
++}
++
++// 14b fixed-point ITU-R BT.601 YUV->RGB, matching the SSE2/scalar path.
++// Inputs are samples pre-shifted into the high byte (<< 8).
++static WEBP_INLINE void ConvertYUV444ToRGB(u16x8 Y0, u16x8 U0, u16x8 V0,
++ i16x8* const R, i16x8* const G,
++ u16x8* const B) {
++ const u16x8 k19077 = vec_splats((unsigned short)19077);
++ const u16x8 k26149 = vec_splats((unsigned short)26149);
++ const u16x8 k14234 = vec_splats((unsigned short)14234);
++ const u16x8 k33050 = vec_splats((unsigned short)33050);
++ const u16x8 k17685 = vec_splats((unsigned short)17685);
++ const u16x8 k6419 = vec_splats((unsigned short)6419);
++ const u16x8 k13320 = vec_splats((unsigned short)13320);
++ const u16x8 k8708 = vec_splats((unsigned short)8708);
++ const u16x8 six = vec_splats((unsigned short)6);
++
++ const u16x8 Y1 = MulHi16(Y0, k19077);
++ const u16x8 R2 = vec_add(vec_sub(Y1, k14234), MulHi16(V0, k26149));
++ const u16x8 G4 = vec_sub(vec_add(Y1, k8708),
++ vec_add(MulHi16(U0, k6419), MulHi16(V0, k13320)));
++ // 33050 needs unsigned saturating arithmetic; B can exceed 32767.
++ const u16x8 B2 = vec_subs(vec_adds(MulHi16(U0, k33050), Y1), k17685);
++
++ *R = vec_sra((i16x8)R2, six);
++ *G = vec_sra((i16x8)G4, six);
++ *B = vec_sr(B2, six);
++}
++
++// Load 8 bytes into the high byte of 8 u16 lanes (i.e. sample << 8).
++// Use an 8-byte copy (not a 16-byte vector load) to avoid reading past the
++// end of the source row, matching the SSE2 _mm_loadl_epi64 behavior.
++static WEBP_INLINE u16x8 LoadHi16(const uint8_t* WEBP_RESTRICT src) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ unsigned char tmp[16] = {0};
++ memcpy(tmp, src, 8);
++ return (u16x8)vec_mergeh(zero, vec_xl(0, tmp));
++}
++
++// Load 4 U/V bytes, shift into the high byte, and replicate each sample.
++static WEBP_INLINE u16x8 LoadUVHi8(const uint8_t* WEBP_RESTRICT src) {
++ const u8x16 zero = vec_splats((unsigned char)0);
++ unsigned char tmp[16] = {0};
++ memcpy(tmp, src, 4);
++ const u16x8 t = (u16x8)vec_mergeh(zero, vec_xl(0, tmp));
++ return vec_mergeh(t, t);
++}
++
++static WEBP_INLINE void YUV420ToRGB(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ i16x8* const R, i16x8* const G,
++ u16x8* const B) {
++ ConvertYUV444ToRGB(LoadHi16(y), LoadUVHi8(u), LoadUVHi8(v), R, G, B);
++}
++
++// Pack four 8-lane channels into 32 interleaved bytes (c0 c1 c2 c3 per pixel).
++static WEBP_INLINE void PackAndStore4(i16x8 c0, i16x8 c1, i16x8 c2, i16x8 c3,
++ uint8_t* WEBP_RESTRICT dst) {
++ const u8x16 c02 = vec_packsu(c0, c2);
++ const u8x16 c13 = vec_packsu(c1, c3);
++ const u8x16 lo8 = vec_mergeh(c02, c13);
++ const u8x16 hi8 = vec_mergel(c02, c13);
++ vec_xst((u8x16)vec_mergeh((u16x8)lo8, (u16x8)hi8), 0, dst);
++ vec_xst((u8x16)vec_mergel((u16x8)lo8, (u16x8)hi8), 0, dst + 16);
++}
++
++static const i16x8 kAlpha = {255, 255, 255, 255, 255, 255, 255, 255};
++
++static void YuvToRgbaRow_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst, int len) {
++ int n;
++ for (n = 0; n + 8 <= len; n += 8, dst += 32) {
++ i16x8 R, G; u16x8 B;
++ YUV420ToRGB(y, u, v, &R, &G, &B);
++ PackAndStore4(R, G, (i16x8)B, kAlpha, dst);
++ y += 8; u += 4; v += 4;
++ }
++ for (; n < len; ++n) {
++ VP8YuvToRgba(y[0], u[0], v[0], dst);
++ dst += 4; y += 1; u += (n & 1); v += (n & 1);
++ }
++}
++
++static void YuvToBgraRow_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst, int len) {
++ int n;
++ for (n = 0; n + 8 <= len; n += 8, dst += 32) {
++ i16x8 R, G; u16x8 B;
++ YUV420ToRGB(y, u, v, &R, &G, &B);
++ PackAndStore4((i16x8)B, G, R, kAlpha, dst);
++ y += 8; u += 4; v += 4;
++ }
++ for (; n < len; ++n) {
++ VP8YuvToBgra(y[0], u[0], v[0], dst);
++ dst += 4; y += 1; u += (n & 1); v += (n & 1);
++ }
++}
++
++static void YuvToArgbRow_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst, int len) {
++ int n;
++ for (n = 0; n + 8 <= len; n += 8, dst += 32) {
++ i16x8 R, G; u16x8 B;
++ YUV420ToRGB(y, u, v, &R, &G, &B);
++ PackAndStore4(kAlpha, R, G, (i16x8)B, dst);
++ y += 8; u += 4; v += 4;
++ }
++ for (; n < len; ++n) {
++ VP8YuvToArgb(y[0], u[0], v[0], dst);
++ dst += 4; y += 1; u += (n & 1); v += (n & 1);
++ }
++}
++
++// Convert 32 YUV444 pixels and store the 32b-per-pixel result. Used by the
++// fancy upsampler in upsampling_vsx.c.
++void VP8YuvToRgba32_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst) {
++ int n;
++ for (n = 0; n < 32; n += 8, dst += 32) {
++ i16x8 R, G; u16x8 B;
++ ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
++ &R, &G, &B);
++ PackAndStore4(R, G, (i16x8)B, kAlpha, dst);
++ }
++}
++
++void VP8YuvToBgra32_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst) {
++ int n;
++ for (n = 0; n < 32; n += 8, dst += 32) {
++ i16x8 R, G; u16x8 B;
++ ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
++ &R, &G, &B);
++ PackAndStore4((i16x8)B, G, R, kAlpha, dst);
++ }
++}
++
++void VP8YuvToArgb32_VSX(const uint8_t* WEBP_RESTRICT y,
++ const uint8_t* WEBP_RESTRICT u,
++ const uint8_t* WEBP_RESTRICT v,
++ uint8_t* WEBP_RESTRICT dst) {
++ int n;
++ for (n = 0; n < 32; n += 8, dst += 32) {
++ i16x8 R, G; u16x8 B;
++ ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
++ &R, &G, &B);
++ PackAndStore4(kAlpha, R, G, (i16x8)B, dst);
++ }
++}
++
++extern void WebPInitSamplersVSX(void);
++
++WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersVSX(void) {
++ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_VSX;
++ WebPSamplers[MODE_BGRA] = YuvToBgraRow_VSX;
++ WebPSamplers[MODE_ARGB] = YuvToArgbRow_VSX;
++}
++
++#else // !WEBP_USE_VSX
++
++WEBP_DSP_INIT_STUB(WebPInitSamplersVSX)
++
++#endif // WEBP_USE_VSX
+diff --git a/media/libwebp/src/moz/cpu.cpp b/media/libwebp/src/moz/cpu.cpp
+index c6633170c923..82986d2f631e 100644
+--- a/media/libwebp/src/moz/cpu.cpp
++++ b/media/libwebp/src/moz/cpu.cpp
+@@ -35,6 +35,10 @@ static int MozCPUInfo(CPUFeature feature)
+ case kMIPSdspR2:
+ case kMSA:
+ return 1;
++#endif
++#if defined(WEBP_USE_VSX)
++ case kVSX:
++ return 1;
+ #endif
+ default:
+ return 0;
+--
+2.52.0
+
diff --git a/0003-Add-PPC64LE-JIT-backend.patch b/0003-Add-PPC64LE-JIT-backend.patch
new file mode 100644
index 0000000..ee08b33
--- /dev/null
+++ b/0003-Add-PPC64LE-JIT-backend.patch
@@ -0,0 +1,38205 @@
+From c79926e41764c6aa6ae596812b23bc35b470028c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
+Date: Fri, 12 Jun 2026 16:02:28 +1000
+Subject: [PATCH 3/3] Add PPC64LE JIT backend
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Based on the work done by Cameron Kaiser and Justin Hibbits
+https://github.com/chmeeedalf/gecko-dev
+
+Co-authored-by: Cameron Kaiser <classilla@floodgap.com>
+Co-authored-by: Justin Hibbits <chmeeedalf@gmail.com>
+Assisted-by: Lance Albertson <lance@osuosl.org>
+Assisted-by: Thushan Fernando <thushan@thushanfernando.com>
+Assisted-by: Timothy Pearson <tpearson@solidsilicon.com>
+Assisted-by: Dan Horák <dan@danny.cz>
+Assisted-by: Hiếu Lê <modology@gmail.com>
+Assisted-by: Claude Fable 5 <noreply@anthropic.com>
+---
+ config/check_macroassembler_style.py | 2 +
+ js/moz.configure | 34 +-
+ js/src/builtin/TestingFunctions.cpp | 18 +
+ js/src/irregexp/RegExpAPI.cpp | 5 +-
+ .../irregexp/RegExpNativeMacroAssembler.cpp | 28 +
+ .../tests/baseline/ppc64-branch8-16-narrow.js | 103 +
+ js/src/jit-test/tests/gc/gcparam.js | 3 +-
+ .../tests/ion/mod-constant-pow2-minus-one.js | 78 +
+ .../tests/ion/mod-pow2-negative-dividend.js | 71 +
+ .../tests/math-min-max-corner-cases.js | 50 +
+ js/src/jit-test/tests/wasm/atomicity.js | 8 +-
+ .../jit-test/tests/wasm/excessive-inlining.js | 19 +-
+ .../jit-test/tests/wasm/memory-oob-message.js | 10 +-
+ .../tests/wasm/ppc64-argon2-tiering.js | 124 +
+ .../tests/wasm/ppc64-compare-select-bench.js | 70 +
+ .../jit-test/tests/wasm/ppc64-extmul-alias.js | 107 +
+ .../tests/wasm/ppc64-simd-vr-clobber.js | 179 +
+ js/src/jit-test/tests/wasm/profiling.js | 7 +
+ .../wasm/regress-ppc64-extract-lane-ctz.js | 49 +
+ .../wasm/regress-ppc64-select-condition.js | 30 +
+ .../wasm/regress-ppc64-trap-exit-simd-save.js | 64 +
+ .../bug-ppc64-simd-reduce-and-branch.js | 7 +
+ .../bug-ppc64-simd-reduce-and-branch.wasm | Bin 0 -> 1148 bytes
+ js/src/jit-test/tests/wasm/simd/bug1946618.js | 7 +-
+ .../jit-test/tests/wasm/simd/ion-analysis.js | 7 +-
+ js/src/jit/Assembler.h | 2 +
+ js/src/jit/BaselineIC.cpp | 2 +
+ js/src/jit/CacheIRCompiler.cpp | 16 +
+ js/src/jit/CodeGenerator.cpp | 6 +
+ js/src/jit/CodeGenerator.h | 2 +
+ js/src/jit/EffectiveAddressAnalysis.cpp | 2 +-
+ js/src/jit/ExecutableAllocator.cpp | 10 +-
+ js/src/jit/FlushICache.cpp | 3 +-
+ js/src/jit/FlushICache.h | 11 +-
+ js/src/jit/GenerateABIFunctionType.py | 100 +
+ js/src/jit/JitContext.cpp | 4 +
+ js/src/jit/JitFrames.cpp | 10 +
+ js/src/jit/JitFrames.h | 12 +-
+ js/src/jit/LIR.cpp | 4 +-
+ js/src/jit/LIR.h | 10 +-
+ js/src/jit/LIROps.yaml | 82 +-
+ js/src/jit/Label.h | 2 +-
+ js/src/jit/Lowering.cpp | 2 +-
+ js/src/jit/Lowering.h | 2 +
+ js/src/jit/MacroAssembler-inl.h | 2 +
+ js/src/jit/MacroAssembler.cpp | 25 +-
+ js/src/jit/MacroAssembler.h | 647 +-
+ js/src/jit/MoveEmitter.h | 2 +
+ js/src/jit/MoveResolver.cpp | 16 +
+ js/src/jit/RegisterAllocator.h | 7 +-
+ js/src/jit/Registers.h | 2 +
+ js/src/jit/Safepoints.cpp | 11 +
+ js/src/jit/SharedICHelpers-inl.h | 2 +
+ js/src/jit/SharedICHelpers.h | 2 +
+ js/src/jit/SharedICRegisters.h | 2 +
+ js/src/jit/Simulator.h | 2 +
+ js/src/jit/moz.build | 12 +
+ js/src/jit/ppc64/Architecture-ppc64.cpp | 221 +
+ js/src/jit/ppc64/Architecture-ppc64.h | 581 ++
+ js/src/jit/ppc64/Assembler-ppc64.cpp | 3028 +++++++
+ js/src/jit/ppc64/Assembler-ppc64.h | 2114 +++++
+ js/src/jit/ppc64/CodeGenerator-ppc64.cpp | 3647 ++++++++
+ js/src/jit/ppc64/CodeGenerator-ppc64.h | 101 +
+ js/src/jit/ppc64/LIR-ppc64.h | 135 +
+ js/src/jit/ppc64/Lowering-ppc64.cpp | 1324 +++
+ js/src/jit/ppc64/Lowering-ppc64.h | 105 +
+ js/src/jit/ppc64/MacroAssembler-ppc64-inl.h | 6142 ++++++++++++++
+ js/src/jit/ppc64/MacroAssembler-ppc64.cpp | 3467 ++++++++
+ js/src/jit/ppc64/MacroAssembler-ppc64.h | 2031 +++++
+ js/src/jit/ppc64/MoveEmitter-ppc64.cpp | 357 +
+ js/src/jit/ppc64/MoveEmitter-ppc64.h | 64 +
+ js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h | 83 +
+ js/src/jit/ppc64/SharedICHelpers-ppc64.h | 97 +
+ js/src/jit/ppc64/SharedICRegisters-ppc64.h | 46 +
+ js/src/jit/ppc64/Simulator-ppc64.cpp | 7296 +++++++++++++++++
+ js/src/jit/ppc64/Simulator-ppc64.h | 556 ++
+ js/src/jit/ppc64/Trampoline-ppc64.cpp | 648 ++
+ js/src/jit/shared/Assembler-shared.h | 5 +-
+ .../AtomicOperations-feeling-lucky-gcc.h | 3 +-
+ js/src/jit/shared/CodeGenerator-shared.cpp | 6 +-
+ js/src/jit/shared/Lowering-shared-inl.h | 2 +-
+ js/src/js-config.mozbuild | 1 +
+ js/src/jsapi-tests/testJitABIcalls.cpp | 3 +
+ js/src/jsapi-tests/testWasmReturnCalls.cpp | 10 +-
+ js/src/jsapi-tests/testsJit.cpp | 20 +
+ js/src/shell/js.cpp | 25 +
+ js/src/shell/jsshell.h | 3 +-
+ js/src/tests/shell/os.js | 8 +-
+ js/src/util/Poison.h | 2 +
+ js/src/wasm/WasmAnyRef.h | 7 +-
+ js/src/wasm/WasmBCDefs.h | 7 +
+ js/src/wasm/WasmBCMemory.cpp | 47 +-
+ js/src/wasm/WasmBCRegDefs.h | 12 +-
+ js/src/wasm/WasmBaselineCompile.cpp | 148 +-
+ js/src/wasm/WasmCodegenConstants.h | 3 +-
+ js/src/wasm/WasmCodegenTypes.cpp | 11 +-
+ js/src/wasm/WasmCompile.cpp | 6 +-
+ js/src/wasm/WasmFrameIter.cpp | 118 +
+ js/src/wasm/WasmGC.cpp | 8 +
+ js/src/wasm/WasmGenerator.cpp | 18 +-
+ js/src/wasm/WasmIonCompile.cpp | 2 +-
+ js/src/wasm/WasmMemory.cpp | 4 +-
+ js/src/wasm/WasmSignalHandlers.cpp | 20 +-
+ js/src/wasm/WasmStacks.cpp | 31 +-
+ js/src/wasm/WasmStubs.cpp | 43 +-
+ js/src/wasm/WasmSummarizeInsn.cpp | 163 +
+ js/src/wasm/WasmValue.cpp | 2 +-
+ mfbt/Assertions.h | 5 +
+ 108 files changed, 34442 insertions(+), 438 deletions(-)
+ create mode 100644 js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
+ create mode 100644 js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
+ create mode 100644 js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
+ create mode 100644 js/src/jit-test/tests/math-min-max-corner-cases.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
+ create mode 100644 js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
+ create mode 100644 js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.wasm
+ create mode 100644 js/src/jit/ppc64/Architecture-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Architecture-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Assembler-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Assembler-ppc64.h
+ create mode 100644 js/src/jit/ppc64/CodeGenerator-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/CodeGenerator-ppc64.h
+ create mode 100644 js/src/jit/ppc64/LIR-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Lowering-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Lowering-ppc64.h
+ create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
+ create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64.h
+ create mode 100644 js/src/jit/ppc64/MoveEmitter-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/MoveEmitter-ppc64.h
+ create mode 100644 js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
+ create mode 100644 js/src/jit/ppc64/SharedICHelpers-ppc64.h
+ create mode 100644 js/src/jit/ppc64/SharedICRegisters-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Simulator-ppc64.cpp
+ create mode 100644 js/src/jit/ppc64/Simulator-ppc64.h
+ create mode 100644 js/src/jit/ppc64/Trampoline-ppc64.cpp
+
+diff --git a/config/check_macroassembler_style.py b/config/check_macroassembler_style.py
+index aa1a54104e26..ba73de388099 100644
+--- a/config/check_macroassembler_style.py
++++ b/config/check_macroassembler_style.py
+@@ -33,6 +33,7 @@ all_architecture_names = set([
+ "arm64",
+ "loong64",
+ "riscv64",
++ "ppc64",
+ "wasm32",
+ ])
+ all_shared_architecture_names = set([
+@@ -41,6 +42,7 @@ all_shared_architecture_names = set([
+ "arm64",
+ "loong64",
+ "riscv64",
++ "ppc64",
+ "wasm32",
+ ])
+
+diff --git a/js/moz.configure b/js/moz.configure
+index 26cc85622654..5310dd08506f 100644
+--- a/js/moz.configure
++++ b/js/moz.configure
+@@ -264,6 +264,7 @@ def jit_default(target, enable_portable_baseline_interp):
+ "aarch64",
+ "mips64",
+ "loongarch64",
++ "ppc64",
+ "riscv64",
+ ):
+ return True
+@@ -285,7 +286,7 @@ def report_deprecated(value):
+ # =======================================================
+ option(
+ "--enable-simulator",
+- choices=("arm", "arm64", "mips64", "loong64", "riscv64"),
++ choices=("arm", "arm64", "mips64", "loong64", "riscv64", "ppc64"),
+ nargs=1,
+ help="Enable a JIT code simulator for the specified architecture",
+ )
+@@ -302,7 +303,7 @@ def simulator(jit_enabled, simulator_enabled, target):
+ if target.cpu != "x86":
+ die("The %s simulator only works on x86." % sim_cpu)
+
+- if sim_cpu in ("arm64", "mips64", "loong64", "riscv64"):
++ if sim_cpu in ("arm64", "mips64", "loong64", "riscv64", "ppc64"):
+ if target.cpu != "x86_64" and target.cpu != "aarch64":
+ die("The %s simulator only works on x86-64 or arm64." % sim_cpu)
+
+@@ -315,12 +316,14 @@ set_config("JS_SIMULATOR_ARM64", simulator.arm64)
+ set_config("JS_SIMULATOR_MIPS64", simulator.mips64)
+ set_config("JS_SIMULATOR_LOONG64", simulator.loong64)
+ set_config("JS_SIMULATOR_RISCV64", simulator.riscv64)
++set_config("JS_SIMULATOR_PPC64", simulator.ppc64)
+ set_define("JS_SIMULATOR", depends_if(simulator)(lambda x: True))
+ set_define("JS_SIMULATOR_ARM", simulator.arm)
+ set_define("JS_SIMULATOR_ARM64", simulator.arm64)
+ set_define("JS_SIMULATOR_MIPS64", simulator.mips64)
+ set_define("JS_SIMULATOR_LOONG64", simulator.loong64)
+ set_define("JS_SIMULATOR_RISCV64", simulator.riscv64)
++set_define("JS_SIMULATOR_PPC64", simulator.ppc64)
+
+
+ @depends("--enable-jit", simulator, target)
+@@ -337,6 +340,8 @@ def jit_codegen(jit_enabled, simulator, target):
+ return namespace(x64=True)
+ elif target.cpu == "loongarch64":
+ return namespace(loong64=True)
++ elif target.cpu == "ppc64":
++ return namespace(ppc64=True)
+ elif target.cpu == "riscv64":
+ return namespace(riscv64=True)
+
+@@ -348,6 +353,7 @@ set_config("JS_CODEGEN_ARM", jit_codegen.arm)
+ set_config("JS_CODEGEN_ARM64", jit_codegen.arm64)
+ set_config("JS_CODEGEN_MIPS64", jit_codegen.mips64)
+ set_config("JS_CODEGEN_LOONG64", jit_codegen.loong64)
++set_config("JS_CODEGEN_PPC64", jit_codegen.ppc64)
+ set_config("JS_CODEGEN_RISCV64", jit_codegen.riscv64)
+ set_config("JS_CODEGEN_X86", jit_codegen.x86)
+ set_config("JS_CODEGEN_X64", jit_codegen.x64)
+@@ -358,6 +364,7 @@ set_define("JS_CODEGEN_ARM", jit_codegen.arm)
+ set_define("JS_CODEGEN_ARM64", jit_codegen.arm64)
+ set_define("JS_CODEGEN_MIPS64", jit_codegen.mips64)
+ set_define("JS_CODEGEN_LOONG64", jit_codegen.loong64)
++set_define("JS_CODEGEN_PPC64", jit_codegen.ppc64)
+ set_define("JS_CODEGEN_RISCV64", jit_codegen.riscv64)
+ set_define("JS_CODEGEN_X86", jit_codegen.x86)
+ set_define("JS_CODEGEN_X64", jit_codegen.x64)
+@@ -728,7 +735,7 @@ def default_wasm_jspi(
+ return
+
+ if simulator:
+- return simulator[0] in ("arm64", "arm", "loong64", "mips64", "riscv64")
++ return simulator[0] in ("arm64", "arm", "loong64", "mips64", "ppc64", "riscv64")
+
+ if target.cpu in (
+ "x86_64",
+@@ -737,6 +744,7 @@ def default_wasm_jspi(
+ "arm",
+ "loongarch64",
+ "mips64",
++ "ppc64",
+ "riscv64",
+ ):
+ return True
+@@ -768,10 +776,11 @@ def wasm_jspi(value, jit_enabled, simulator, no_experimental, target):
+ "arm",
+ "loong64",
+ "mips64",
++ "ppc64",
+ "riscv64",
+ ):
+ die(
+- "--enable-wasm-jspi is only supported for arm64/arm/loong64/mips64/riscv64 simulators"
++ "--enable-wasm-jspi is only supported for arm64/arm/loong64/mips64/ppc64/riscv64 simulators"
+ )
+
+ if target.cpu in (
+@@ -781,12 +790,13 @@ def wasm_jspi(value, jit_enabled, simulator, no_experimental, target):
+ "arm",
+ "loongarch64",
+ "mips64",
++ "ppc64",
+ "riscv64",
+ ):
+ return True
+
+ die(
+- "--enable-wasm-jspi only possible when targeting the x86_64/x86/arm64/arm/loongarch64/mips64/riscv64 jits"
++ "--enable-wasm-jspi only possible when targeting the x86_64/x86/arm64/arm/loongarch64/mips64/ppc64/riscv64 jits"
+ )
+
+
+@@ -821,10 +831,10 @@ def default_wasm_simd(jit_enabled, simulator, target):
+ if not jit_enabled:
+ return
+
+- if simulator and (simulator[0] != "arm64"):
++ if simulator and simulator[0] not in ("arm64", "ppc64"):
+ return
+
+- if target.cpu in ("x86_64", "x86", "aarch64"):
++ if target.cpu in ("x86_64", "x86", "aarch64", "ppc64"):
+ return True
+
+
+@@ -849,13 +859,15 @@ def wasm_simd(value, jit_enabled, simulator, target, no_experimental):
+ if not jit_enabled:
+ die("--enable-wasm-simd requires --enable-jit")
+
+- if simulator and (simulator[0] != "arm64"):
+- die("--enable-wasm-simd is not supported for simulators, except arm64")
++ if simulator and simulator[0] not in ("arm64", "ppc64"):
++ die(
++ "--enable-wasm-simd is not supported for simulators, except arm64 and ppc64"
++ )
+
+- if target.cpu in ("x86_64", "x86", "aarch64"):
++ if target.cpu in ("x86_64", "x86", "aarch64", "ppc64"):
+ return True
+
+- die("--enable-wasm-simd only possible when targeting the x86_64/x86/arm64 jits")
++ die("--enable-wasm-simd only possible when targeting the x86_64/x86/arm64/ppc64 jits")
+
+
+ set_config("ENABLE_WASM_SIMD", wasm_simd)
+diff --git a/js/src/builtin/TestingFunctions.cpp b/js/src/builtin/TestingFunctions.cpp
+index be8b3d0e16b6..2291d58dc0a1 100644
+--- a/js/src/builtin/TestingFunctions.cpp
++++ b/js/src/builtin/TestingFunctions.cpp
+@@ -447,6 +447,15 @@ static bool GetBuildConfiguration(JSContext* cx, unsigned argc, Value* vp) {
+ return false;
+ }
+
++#ifdef JS_CODEGEN_PPC64
++ value = BooleanValue(true);
++#else
++ value = BooleanValue(false);
++#endif
++ if (!JS_SetProperty(cx, info, "ppc64", value)) {
++ return false;
++ }
++
+ #ifdef JS_CODEGEN_LOONG64
+ value = BooleanValue(true);
+ #else
+@@ -483,6 +492,15 @@ static bool GetBuildConfiguration(JSContext* cx, unsigned argc, Value* vp) {
+ return false;
+ }
+
++#ifdef JS_SIMULATOR_PPC64
++ value = BooleanValue(true);
++#else
++ value = BooleanValue(false);
++#endif
++ if (!JS_SetProperty(cx, info, "ppc64-simulator", value)) {
++ return false;
++ }
++
+ #ifdef MOZ_ASAN
+ value = BooleanValue(true);
+ #else
+diff --git a/js/src/irregexp/RegExpAPI.cpp b/js/src/irregexp/RegExpAPI.cpp
+index 310cd85c6a20..377509574f28 100644
+--- a/js/src/irregexp/RegExpAPI.cpp
++++ b/js/src/irregexp/RegExpAPI.cpp
+@@ -495,7 +495,10 @@ class RegExpDepthCheck final : public v8::internal::regexp::Visitor {
+
+ // This size is picked to be comfortably larger than any
+ // RegExp*::ToNode stack frame.
+-#if !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
++#if defined(__powerpc64__)
++ // PPC64 ELFv2 has larger minimum stack frames.
++ static const size_t FRAME_PADDING = 256 * 4;
++#elif !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
+ static const size_t FRAME_PADDING = 256;
+ #else
+ // Use a slightly larger padding for debug and code coverage builds.
+diff --git a/js/src/irregexp/RegExpNativeMacroAssembler.cpp b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
+index ae351226797b..a396aeb3c731 100644
+--- a/js/src/irregexp/RegExpNativeMacroAssembler.cpp
++++ b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
+@@ -990,8 +990,21 @@ void SMRegExpMacroAssembler::CheckBacktrackStackLimit() {
+ AbsoluteAddress(isolate()->regexp_stack()->limit_address_address()),
+ backtrack_stack_pointer_, &no_stack_overflow);
+
++#ifdef JS_CODEGEN_PPC64
++ // LR on PowerPC isn't a GPR, so we have to explicitly save it before
++ // calling or the regexp's return address will be clobbered.
++ masm_.xs_mflr(temp1_);
++ masm_.as_stdu(temp1_, masm_.getStackPointer(), -8);
++#endif
++
+ masm_.call(&stack_overflow_label_);
+
++#ifdef JS_CODEGEN_PPC64
++ masm_.as_ld(temp1_, masm_.getStackPointer(), 0);
++ masm_.xs_mtlr(temp1_);
++ masm_.as_addi(masm_.getStackPointer(), masm_.getStackPointer(), 8);
++#endif
++
+ // Exit with an exception if the call failed
+ masm_.branchTest32(Assembler::Zero, temp0_, temp0_,
+ &exit_with_exception_label_);
+@@ -1080,6 +1093,13 @@ void SMRegExpMacroAssembler::createStackFrame() {
+ masm_.initPseudoStackPtr();
+ #endif
+
++#ifdef JS_CODEGEN_PPC64
++ // PPC64's link register is an SPR, not a GPR, so it cannot be included in
++ // SavedNonVolatileRegisters. Save it explicitly before the frame pointer
++ // so that abiret()'s blr can return to the caller after we restore it.
++ masm_.pushReturnAddress();
++#endif
++
+ masm_.Push(js::jit::FramePointer);
+ masm_.moveStackPtrTo(js::jit::FramePointer);
+
+@@ -1308,6 +1328,9 @@ void SMRegExpMacroAssembler::exitHandler() {
+ // Perform a plain Ret(), as abiret() will move SP <- PSP and that is wrong.
+ masm_.Ret(vixl::lr);
+ #else
++# ifdef JS_CODEGEN_PPC64
++ masm_.popReturnAddress();
++# endif
+ masm_.abiret();
+ #endif
+
+@@ -1351,6 +1374,11 @@ void SMRegExpMacroAssembler::stackOverflowHandler() {
+
+ // Adjust for the return address on the stack.
+ size_t frameOffset = sizeof(void*);
++#ifdef JS_CODEGEN_PPC64
++ // CheckBacktrackStackLimit pushes LR before calling us, so there's a
++ // second return address on the stack.
++ frameOffset += sizeof(void*);
++#endif
+
+ volatileRegs.takeUnchecked(temp0_);
+ volatileRegs.takeUnchecked(temp1_);
+diff --git a/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js b/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
+new file mode 100644
+index 000000000000..fc1074a9ef8b
+--- /dev/null
++++ b/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
+@@ -0,0 +1,103 @@
++// Regression test for PPC64 branch8/branch16 width-narrowing under Equal /
++// NotEqual / unsigned comparisons. Two prior bugs:
++//
++// 1. Sign-extending the load while move32(Imm32) zero-extended the imm
++// caused spurious mismatch when the loaded byte/halfword had its high
++// bit set (e.g. "ÀÁÂ".startsWith("ÀÁÂ") returned false because byte 0xC0
++// sign-extended to 0xFF...C0 but the imm 0xC0 zero-extended to 0x00C0,
++// so cmpw on the low 32 bits saw a negative vs positive value).
++//
++// 2. Always zero-extending the load broke `byte == Imm32(-1)` because -1
++// sign-extends in the imm path: the loaded 0x000000FF didn't match the
++// materialized 0xFFFFFFFF.
++//
++// Fix: cast the immediate to uint8/uint16 (equality + unsigned) or int8/int16
++// (signed relational) so both sides have matching bit patterns regardless of
++// how move32(Imm32) chose to materialize it. Match ARM64/LoongArch64/RISC-V.
++//
++// We exercise both byte and halfword branch paths via TypedArray loads and
++// String.prototype.startsWith with a constant search string (the original
++// failing site lowered to branch16(NotEqual, addr, Imm32(0xC1C0))).
++
++// --- Direct byte/halfword equality through TypedArray ---
++{
++ let u8 = new Uint8Array([0, 1, 0x7F, 0x80, 0xC0, 0xC1, 0xFE, 0xFF]);
++ let i8 = new Int8Array(u8.buffer);
++ let u16 = new Uint16Array([0x0000, 0x7FFF, 0x8000, 0xC1C0, 0xFFFE, 0xFFFF]);
++ let i16 = new Int16Array(u16.buffer);
++
++ // Force baseline + Ion to specialize the comparisons.
++ function eqU8(arr, idx, val) {
++ return arr[idx] === val;
++ }
++ function eqI8(arr, idx, val) {
++ return arr[idx] === val;
++ }
++ function eqU16(arr, idx, val) {
++ return arr[idx] === val;
++ }
++ function eqI16(arr, idx, val) {
++ return arr[idx] === val;
++ }
++
++ for (let i = 0; i < 200; i++) {
++ // High-bit-set bytes: bit pattern equality must hold both signed and
++ // unsigned interpretations of the immediate.
++ assertEq(eqU8(u8, 4, 0xC0), true); // unsigned compare 0xC0 == 0xC0
++ assertEq(eqU8(u8, 4, 0xC1), false);
++ assertEq(eqU8(u8, 7, 0xFF), true);
++ assertEq(eqU8(u8, 7, -1 & 0xFF), true); // 0xFF written as -1&0xFF
++
++ // Signed Int8 view: 0xFF is -1, 0xC0 is -64.
++ assertEq(eqI8(i8, 4, -64), true);
++ assertEq(eqI8(i8, 7, -1), true);
++ assertEq(eqI8(i8, 4, -63), false);
++
++ // Halfword variants: the original startswith failure pattern was
++ // (Latin-1 char 0xC1C0) — a 16-bit value with bit 15 set.
++ assertEq(eqU16(u16, 3, 0xC1C0), true);
++ assertEq(eqU16(u16, 3, 0xC1C1), false);
++ assertEq(eqU16(u16, 5, 0xFFFF), true);
++ assertEq(eqU16(u16, 5, -1 & 0xFFFF), true);
++
++ assertEq(eqI16(i16, 3, -15936), true); // 0xC1C0 as i16 = -15936
++ assertEq(eqI16(i16, 5, -1), true);
++ assertEq(eqI16(i16, 5, -2), false);
++ }
++}
++
++// --- String.prototype.startsWith with a Latin-1 constant search ---
++// This was the original failing site — Ion lowers a constant search string
++// of length 1..32 into a sequence of byte-wise comparisons.
++{
++ let s = "ÀÁÂ"; // Latin-1 length 3, bytes 0xC0 0xC1 0xC2 (all high-bit set)
++ function check() {
++ return s.startsWith("ÀÁÂ");
++ }
++ for (let i = 0; i < 200; i++) {
++ assertEq(check(), true);
++ }
++
++ // Mismatch on a single high-bit byte must report not-equal.
++ let s2 = "ÀÁÃ"; // last byte 0xC3 instead of 0xC2
++ function check2() {
++ return s2.startsWith("ÀÁÂ");
++ }
++ for (let i = 0; i < 200; i++) {
++ assertEq(check2(), false);
++ }
++}
++
++// --- Signed relational comparisons still work (we kept the sign-extend path) ---
++{
++ let i8 = new Int8Array([0x7F, -1, -128, 1, 0]);
++ function ltZero(idx) {
++ return i8[idx] < 0;
++ }
++ for (let i = 0; i < 200; i++) {
++ assertEq(ltZero(0), false); // 0x7F = +127
++ assertEq(ltZero(1), true); // -1
++ assertEq(ltZero(2), true); // -128
++ assertEq(ltZero(3), false); // 1
++ }
++}
+diff --git a/js/src/jit-test/tests/gc/gcparam.js b/js/src/jit-test/tests/gc/gcparam.js
+index 51d58662193f..48e5a97c135f 100644
+--- a/js/src/jit-test/tests/gc/gcparam.js
++++ b/js/src/jit-test/tests/gc/gcparam.js
+@@ -30,7 +30,8 @@ testGetParam("chunkBytes");
+ testGetParam("helperThreadCount");
+
+ testChangeParam("maxBytes");
+-testChangeParam("minNurseryBytes", 16 * 1024);
++var pageSize = gcparam("systemPageSizeKB") * 1024;
++testChangeParam("minNurseryBytes", pageSize);
+ testChangeParam("maxNurseryBytes", 1024 * 1024);
+ testChangeParam("incrementalGCEnabled");
+ testChangeParam("perZoneGCEnabled");
+diff --git a/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js b/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
+new file mode 100644
+index 000000000000..9028f5587c65
+--- /dev/null
++++ b/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
+@@ -0,0 +1,78 @@
++// Regression test for a PPC64 Ion miscompile of integer modulo by a
++// constant of the form 2^n - 1 (e.g. 65535).
++//
++// lowerModI routes `x % (2^n - 1)` to LModMaskI, whose codegen
++// (ma_mod_mask) materialized the mask 2^n - 1 with xs_li(). xs_li takes a
++// signed int16_t, so a mask of 0xFFFF was truncated to -1, corrupting the
++// digit-summing reduction. The bug only affected masks that do not fit in a
++// signed 16-bit immediate, i.e. divisors >= 65535 (n >= 16); smaller
++// 2^n - 1 divisors such as 255 were unaffected.
++//
++// The reference uses a non-constant divisor, which lowers to the
++// hardware-divide modulo path (LModI) and is therefore independent of the
++// LModMaskI codegen under test.
++
++function refmod(x, d) {
++ // d is not a constant here -> divide-based modulo, not LModMaskI.
++ return (x % d) | 0;
++}
++
++// One function per constant divisor so the divisor is a literal and the
++// LModMaskI path is selected.
++function mod255(x) { return (x % 255) | 0; }
++function mod32767(x) { return (x % 32767) | 0; }
++function mod65535(x) { return (x % 65535) | 0; }
++function mod131071(x) { return (x % 131071) | 0; }
++function mod1048575(x) { return (x % 1048575) | 0; }
++
++const cases = [
++ [mod255, 255],
++ [mod32767, 32767],
++ [mod65535, 65535],
++ [mod131071, 131071],
++ [mod1048575, 1048575],
++];
++
++// Inputs spanning small values, values with bits above the mask width
++// (so the multi-digit reduction is exercised), and negatives.
++const inputs = [];
++for (let i = 0; i < 64; i++) {
++ inputs.push(Math.imul(i, 2654435761) | 0);
++ inputs.push((i * 65535 + i) | 0);
++ inputs.push((i * 131071 - 7) | 0);
++ inputs.push(-Math.imul(i, 40503) | 0);
++}
++inputs.push(0, 1, -1, 65534, 65535, 65536, 0x7fffffff, -0x80000000);
++
++// Warm up through the tiers, then assert each constant-divisor result
++// matches the divide-based reference.
++for (let iter = 0; iter < 2000; iter++) {
++ for (const [fn, d] of cases) {
++ for (const x of inputs) {
++ assertEq(fn(x), refmod(x, d));
++ }
++ }
++}
++
++// Register-pressure variant: mirrors the shape that exposed the bug (many
++// live locals forcing the mask materialization to interact with spills).
++function pressure(buf, i) {
++ let v0 = i, v1 = i + 1, v2 = i + 2, v3 = i + 3, v4 = i + 4, v5 = i + 5;
++ let v6 = i + 6, v7 = i + 7, v8 = i + 8, v9 = i + 9, v10 = i + 10, v11 = i + 11;
++ let v12 = i + 12, v13 = i + 13, v14 = i + 14, v15 = i + 15;
++ const r = (buf[i & 63] % 65535) | 0;
++ // Keep every local live to the return without altering r.
++ const live = (v0 ^ v1 ^ v2 ^ v3 ^ v4 ^ v5 ^ v6 ^ v7 ^
++ v8 ^ v9 ^ v10 ^ v11 ^ v12 ^ v13 ^ v14 ^ v15) & 0;
++ return r + live;
++}
++
++const buf = new Int32Array(64);
++for (let i = 0; i < buf.length; i++) {
++ buf[i] = Math.imul(i, 2654435761) | 0;
++}
++for (let iter = 0; iter < 5000; iter++) {
++ for (let i = 0; i < 64; i++) {
++ assertEq(pressure(buf, i), refmod(buf[i & 63], 65535));
++ }
++}
+diff --git a/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js b/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
+new file mode 100644
+index 000000000000..9905cc4a8f36
+--- /dev/null
++++ b/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
+@@ -0,0 +1,71 @@
++// Regression test for a PPC64 Ion miscompile of integer modulo by a constant
++// power of two (e.g. 65536) with a negative dividend.
++//
++// lowerModI routes `x % 2^n` to LModPowTwoI, whose codegen tested the sign of
++// the dividend with branchPtr (a 64-bit compare). When the int32 dividend was
++// held zero-extended in its register, the 64-bit test misclassified a negative
++// value as non-negative and took the unmasked positive path, returning
++// `x & (2^n - 1)` instead of the correct (negative) `x % 2^n`. Fixed by using a
++// 32-bit sign test (branch32).
++//
++// The reference uses a non-constant divisor, which lowers to the divide-based
++// modulo path (LModI), independent of LModPowTwoI.
++
++function refmod(x, d) {
++ return (x % d) | 0;
++}
++
++function mod256(x) { return (x % 256) | 0; }
++function mod1024(x) { return (x % 1024) | 0; }
++function mod4096(x) { return (x % 4096) | 0; }
++function mod65536(x) { return (x % 65536) | 0; }
++function mod1048576(x) { return (x % 1048576) | 0; }
++function mod1073741824(x) { return (x % 1073741824) | 0; }
++
++const cases = [
++ [mod256, 256],
++ [mod1024, 1024],
++ [mod4096, 4096],
++ [mod65536, 65536],
++ [mod1048576, 1048576],
++ [mod1073741824, 1073741824],
++];
++
++// Heavy on negative dividends (the broken path), plus boundary values.
++const inputs = [];
++for (let i = 1; i <= 64; i++) {
++ inputs.push(-Math.imul(i, 2654435761) | 0);
++ inputs.push(-(i * 168));
++ inputs.push(-(i * 70001));
++ inputs.push(Math.imul(i, 40503) | 0);
++}
++inputs.push(0, -1, 1, -168, -65535, -65536, -65537, 168,
++ 0x7fffffff, -0x80000000, -0x7fffffff);
++
++for (let iter = 0; iter < 3000; iter++) {
++ for (const [fn, d] of cases) {
++ for (const x of inputs) {
++ assertEq(fn(x), refmod(x, d));
++ }
++ }
++}
++
++// Register-pressure variant: a negative dividend produced at runtime
++// (float->int) with many live locals, mirroring the shape that exposed the bug.
++function pressure(seed) {
++ let v0 = seed, v1 = seed + 1, v2 = seed + 2, v3 = seed + 3, v4 = seed + 4;
++ let v5 = seed + 5, v6 = seed + 6, v7 = seed + 7, v8 = seed + 8, v9 = seed + 9;
++ let v10 = seed + 10, v11 = seed + 11, v12 = seed + 12, v13 = seed + 13;
++ let d0 = seed * 0.5, d1 = seed * 1.5, d2 = -seed * 2.5;
++ const neg = (Math.fround(-(Math.abs(seed) + 0.7)) | 0);
++ const r = (neg % 65536) | 0;
++ const live = (v0 ^ v1 ^ v2 ^ v3 ^ v4 ^ v5 ^ v6 ^ v7 ^ v8 ^ v9 ^
++ v10 ^ v11 ^ v12 ^ v13 ^ (d0 | 0) ^ (d1 | 0) ^ (d2 | 0)) & 0;
++ return r + live;
++}
++for (let iter = 0; iter < 5000; iter++) {
++ for (let s = 1; s <= 200; s++) {
++ const expect = ((Math.fround(-(s + 0.7)) | 0) % 65536) | 0;
++ assertEq(pressure(s), expect);
++ }
++}
+diff --git a/js/src/jit-test/tests/math-min-max-corner-cases.js b/js/src/jit-test/tests/math-min-max-corner-cases.js
+new file mode 100644
+index 000000000000..7ac2c59caeff
+--- /dev/null
++++ b/js/src/jit-test/tests/math-min-max-corner-cases.js
+@@ -0,0 +1,50 @@
++// Math.min / Math.max corner cases. Exercises the POWER9 xsminjdp /
++// xsmaxjdp J-form fast path on PPC64 (and the fcmpu/branch fallback on
++// POWER8 forced); other backends already cover this via shared fp tests
++// but the truth table is small and worth pinning explicitly.
++//
++// JS semantics (ECMA-262):
++// - Math.max(-0, +0) === +0; Math.min(-0, +0) === -0
++// - Math.max(-0, -0) === -0; Math.min(+0, +0) === +0
++// - Any NaN operand → NaN
++// - ±Inf and ordinary numerics by value
++
++function objectIsPositiveZero(v) {
++ return v === 0 && Object.is(v, 0);
++}
++function objectIsNegativeZero(v) {
++ return v === 0 && Object.is(v, -0);
++}
++
++// Direct calls — these get inlined by Ion as MMinMax intrinsics, which
++// emit the relevant min/max helper.
++function check() {
++ // Max corner cases.
++ assertEq(objectIsPositiveZero(Math.max(-0, +0)), true);
++ assertEq(objectIsPositiveZero(Math.max(+0, -0)), true);
++ assertEq(objectIsNegativeZero(Math.max(-0, -0)), true);
++ assertEq(objectIsPositiveZero(Math.max(+0, +0)), true);
++ assertEq(Number.isNaN(Math.max(NaN, 5)), true);
++ assertEq(Number.isNaN(Math.max(5, NaN)), true);
++ assertEq(Number.isNaN(Math.max(NaN, NaN)), true);
++ assertEq(Math.max(-Infinity, 5), 5);
++ assertEq(Math.max(Infinity, 5), Infinity);
++ assertEq(Math.max(1, 2), 2);
++ assertEq(Math.max(-1, -2), -1);
++ assertEq(Math.max(1.5, 2.5), 2.5);
++
++ // Min corner cases.
++ assertEq(objectIsNegativeZero(Math.min(-0, +0)), true);
++ assertEq(objectIsNegativeZero(Math.min(+0, -0)), true);
++ assertEq(objectIsNegativeZero(Math.min(-0, -0)), true);
++ assertEq(objectIsPositiveZero(Math.min(+0, +0)), true);
++ assertEq(Number.isNaN(Math.min(NaN, 5)), true);
++ assertEq(Number.isNaN(Math.min(5, NaN)), true);
++ assertEq(Math.min(-Infinity, 5), -Infinity);
++ assertEq(Math.min(Infinity, 5), 5);
++ assertEq(Math.min(1, 2), 1);
++}
++
++// Run cold (Baseline) and hot (Ion).
++check();
++for (let i = 0; i < 50000; i++) check();
+diff --git a/js/src/jit-test/tests/wasm/atomicity.js b/js/src/jit-test/tests/wasm/atomicity.js
+index 34327ec95741..ac1516083325 100644
+--- a/js/src/jit-test/tests/wasm/atomicity.js
++++ b/js/src/jit-test/tests/wasm/atomicity.js
+@@ -8,7 +8,11 @@
+ const DEBUG = 0;
+
+ // The longer we run, the better, really, but we don't want to time out.
+-const ITERATIONS = 100000;
++// Real PPC64 hardware retries lwarx/stwcx. reservation loops under
++// contention, which makes the default count exceed jit-test's 150 s
++// budget on POWER8 and (less so) POWER9/POWER10. Quarter the count
++// there to keep coverage while fitting the default budget.
++const ITERATIONS = getBuildConfiguration("ppc64") ? 25000 : 100000;
+
+ // If you change NUMWORKERS you must also change the tables for INIT, VAL, and
+ // RESULT for all the operations, below, by adding or removing bits.
+@@ -39,7 +43,7 @@ if (getCoreCount() < NUMAGENTS) {
+
+ if (getBuildConfiguration("arm-simulator") || getBuildConfiguration("arm64-simulator") ||
+ getBuildConfiguration("mips64-simulator") || getBuildConfiguration("riscv64-simulator") ||
+- getBuildConfiguration("loong64-simulator"))
++ getBuildConfiguration("loong64-simulator") || getBuildConfiguration("ppc64-simulator"))
+ {
+ if (DEBUG > 0)
+ print("Atomicity test disabled on simulator");
+diff --git a/js/src/jit-test/tests/wasm/excessive-inlining.js b/js/src/jit-test/tests/wasm/excessive-inlining.js
+index 91ec710e4e46..a7d3b3211515 100644
+--- a/js/src/jit-test/tests/wasm/excessive-inlining.js
++++ b/js/src/jit-test/tests/wasm/excessive-inlining.js
+@@ -74,23 +74,26 @@ assertEq(tier2codeBytesUsed > 2000, true);
+
+ // But not an excessive amount. This is the assertion that checks that
+ // the inlining-budget cutoff mechanism is working.
+-assertEq(tier2codeBytesUsed < 15000, true);
++// PPC64 generates larger code due to fixed-width 4-byte instructions,
++// multi-instruction branch stanzas, and longer constant-loading sequences.
++let tier2limit = getBuildConfiguration("ppc64") ? 25000 : 15000;
++assertEq(tier2codeBytesUsed < tier2limit, true);
+
+ // The thresholds above are based on the following measurements.
+ //
+ // tier1codeBytesUsed (baseline size)
+ //
+-// x64 x32 arm64 arm32
++// x64 x32 arm64 arm32 ppc64
+ //
+-// 1378 1010 1408 1008 --enable-debug build
+-// 1218 866 1248 856 --disable-debug build
++// 1378 1010 1408 1008 2736 --enable-debug build
++// 1218 866 1248 856 --disable-debug build
+ //
+ // tier2codeBytesUsed (optimized size), with inline-size budgeting enabled
+ //
+-// x64 x32 arm64 arm32
++// x64 x32 arm64 arm32 ppc64
+ //
+-// 5186 6994 7136 5472 --enable-debug build
+-// 3698 3730 5472 3888 --disable-debug build
++// 5186 6994 7136 5472 17408 --enable-debug build
++// 3698 3730 5472 3888 --disable-debug build
+ //
+ // tier2codeBytesUsed (optimized size), with inline-size budgeting disabled
+ //
+@@ -108,7 +111,7 @@ assertEq(tier2codeBytesUsed < 15000, true);
+ // (2) the optimized size will be at least 2000 bytes
+ //
+ // (3) if the inline-budget mechanism is working as intended, the optimized
+-// size will be less than 15000 bytes
++// size will be less than 15000 bytes (25000 on PPC64)
+ //
+ //
+ // Note (for future testing): inline-size budgeting was disabled by changing
+diff --git a/js/src/jit-test/tests/wasm/memory-oob-message.js b/js/src/jit-test/tests/wasm/memory-oob-message.js
+index 75248c6e6a56..c08e49bcc6e4 100644
+--- a/js/src/jit-test/tests/wasm/memory-oob-message.js
++++ b/js/src/jit-test/tests/wasm/memory-oob-message.js
+@@ -8,8 +8,16 @@ const hasOffsetMessage = wasmHugeMemoryEnabled();
+
+ function oobPattern(memIdx, byteOffset) {
+ if (hasOffsetMessage) {
++ // The reported address is whatever the kernel returned in
++ // siginfo.si_addr for the faulting instruction. Most backends emit
++ // the wasm access directly so si_addr equals byteOffset. PPC64 emits
++ // a 1-byte probing load at byteOffset + (size - 1) before each
++ // multi-byte access (to enforce wasm-spec atomicity on POWER ISA),
++ // so si_addr there can be up to 15 bytes past byteOffset.
++ const offsets = [];
++ for (let i = 0; i < 16; ++i) offsets.push(`${byteOffset + i}`);
+ return new RegExp(
+- `out of bounds: memory ${memIdx} access at memory address ${byteOffset}`
++ `out of bounds: memory ${memIdx} access at memory address (?:${offsets.join('|')})`
+ );
+ }
+ return /index out of bounds/;
+diff --git a/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js b/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
+new file mode 100644
+index 000000000000..04dad9240539
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
+@@ -0,0 +1,124 @@
++// Test for wasm tiering correctness with argon2-style SIMD computation.
++// The argon2 fBlaMka function uses i64x2.extmul_low_i32x4_u, i64x2.shl,
++// i64x2.add, v128.xor, v128.or, i64x2.shr_u, and i8x16.shuffle.
++// A tiering bug can cause hash and verify to produce different results
++// when tier-up happens between them.
++//
++// This test runs the computation under both baseline and optimizing
++// compilers and verifies they produce identical results.
++
++var mod = new WebAssembly.Module(wasmTextToBinary(`
++ (module
++ (memory (export "mem") 10)
++ ;; Argon2 fBlaMka: a + b + 2 * trunc32(a) * trunc32(b)
++ ;; then rotations by 32, 24, 16, 63
++ (func $G_round (param i32)
++ (local v128 v128 v128 v128 v128 v128 v128 v128 v128)
++ (local.set 1 (v128.load (i32.add (local.get 0) (i32.const 0))))
++ (local.set 2 (v128.load (i32.add (local.get 0) (i32.const 16))))
++ (local.set 3 (v128.load (i32.add (local.get 0) (i32.const 32))))
++ (local.set 4 (v128.load (i32.add (local.get 0) (i32.const 48))))
++ (local.set 5 (v128.load (i32.add (local.get 0) (i32.const 64))))
++ (local.set 6 (v128.load (i32.add (local.get 0) (i32.const 80))))
++ (local.set 7 (v128.load (i32.add (local.get 0) (i32.const 96))))
++ (local.set 8 (v128.load (i32.add (local.get 0) (i32.const 112))))
++
++ ;; fBlaMka(v0, v2) + rotr32
++ (local.set 1 (i64x2.add (i64x2.add (local.get 1) (local.get 3))
++ (i64x2.shl (i64x2.extmul_low_i32x4_u
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 1) (local.get 1))
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 3) (local.get 3)))
++ (i32.const 1))))
++ (local.set 9 (v128.xor (local.get 7) (local.get 1)))
++ (local.set 7 (v128.or (i64x2.shl (local.get 9) (i32.const 32)) (i64x2.shr_u (local.get 9) (i32.const 32))))
++
++ ;; fBlaMka(v4, v6) + rotr24
++ (local.set 5 (i64x2.add (i64x2.add (local.get 5) (local.get 7))
++ (i64x2.shl (i64x2.extmul_low_i32x4_u
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 5) (local.get 5))
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 7) (local.get 7)))
++ (i32.const 1))))
++ (local.set 9 (v128.xor (local.get 3) (local.get 5)))
++ (local.set 3 (v128.or (i64x2.shl (local.get 9) (i32.const 40)) (i64x2.shr_u (local.get 9) (i32.const 24))))
++
++ ;; fBlaMka(v0, v2) + rotr16
++ (local.set 1 (i64x2.add (i64x2.add (local.get 1) (local.get 3))
++ (i64x2.shl (i64x2.extmul_low_i32x4_u
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 1) (local.get 1))
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 3) (local.get 3)))
++ (i32.const 1))))
++ (local.set 9 (v128.xor (local.get 7) (local.get 1)))
++ (local.set 7 (v128.or (i64x2.shl (local.get 9) (i32.const 48)) (i64x2.shr_u (local.get 9) (i32.const 16))))
++
++ ;; fBlaMka(v4, v6) + rotr63
++ (local.set 5 (i64x2.add (i64x2.add (local.get 5) (local.get 7))
++ (i64x2.shl (i64x2.extmul_low_i32x4_u
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 5) (local.get 5))
++ (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 7) (local.get 7)))
++ (i32.const 1))))
++ (local.set 9 (v128.xor (local.get 3) (local.get 5)))
++ (local.set 3 (v128.or (i64x2.shl (local.get 9) (i32.const 1)) (i64x2.shr_u (local.get 9) (i32.const 63))))
++
++ (v128.store (i32.add (local.get 0) (i32.const 0)) (local.get 1))
++ (v128.store (i32.add (local.get 0) (i32.const 16)) (local.get 2))
++ (v128.store (i32.add (local.get 0) (i32.const 32)) (local.get 3))
++ (v128.store (i32.add (local.get 0) (i32.const 48)) (local.get 4))
++ (v128.store (i32.add (local.get 0) (i32.const 64)) (local.get 5))
++ (v128.store (i32.add (local.get 0) (i32.const 80)) (local.get 6))
++ (v128.store (i32.add (local.get 0) (i32.const 96)) (local.get 7))
++ (v128.store (i32.add (local.get 0) (i32.const 112)) (local.get 8)))
++
++ (func (export "hash") (param i32) (result i64)
++ (local i32)
++ ;; Init with Blake2b IV
++ (v128.store (i32.const 0) (v128.const i64x2 0x6a09e667f3bcc908 0xbb67ae8584caa73b))
++ (v128.store (i32.const 16) (v128.const i64x2 0x3c6ef372fe94f82b 0xa54ff53a5f1d36f1))
++ (v128.store (i32.const 32) (v128.const i64x2 0x510e527fade682d1 0x9b05688c2b3e6c1f))
++ (v128.store (i32.const 48) (v128.const i64x2 0x1f83d9abfb41bd6b 0x5be0cd19137e2179))
++ (v128.store (i32.const 64) (v128.const i64x2 0x0123456789abcdef 0xfedcba9876543210))
++ (v128.store (i32.const 80) (v128.const i64x2 0xdeadbeefcafebabe 0x1122334455667788))
++ (v128.store (i32.const 96) (v128.const i64x2 0xaabbccdd11223344 0x5566778899aabbcc))
++ (v128.store (i32.const 112) (v128.const i64x2 0xddeeff0011223344 0x5566778899aabbcc))
++ (local.set 1 (i32.const 0))
++ (block (loop
++ (call $G_round (i32.const 0))
++ (local.set 1 (i32.add (local.get 1) (i32.const 1)))
++ (br_if 1 (i32.ge_u (local.get 1) (local.get 0)))
++ (br 0)))
++ (i64.xor (i64.load (i32.const 0))
++ (i64.xor (i64.load (i32.const 8))
++ (i64.xor (i64.load (i32.const 16))
++ (i64.xor (i64.load (i32.const 24))
++ (i64.xor (i64.load (i32.const 32))
++ (i64.xor (i64.load (i32.const 40))
++ (i64.xor (i64.load (i32.const 48))
++ (i64.xor (i64.load (i32.const 56))
++ (i64.xor (i64.load (i32.const 64))
++ (i64.xor (i64.load (i32.const 72))
++ (i64.xor (i64.load (i32.const 80))
++ (i64.xor (i64.load (i32.const 88))
++ (i64.xor (i64.load (i32.const 96))
++ (i64.xor (i64.load (i32.const 104))
++ (i64.xor (i64.load (i32.const 112))
++ (i64.load (i32.const 120))))))))))))))))))
++ )
++`));
++
++var inst = new WebAssembly.Instance(mod);
++
++// Get a reference result from the first call.
++var reference = inst.exports.hash(100);
++
++// Run many times to trigger tier-up, then verify result stays the same.
++var pass = true;
++for (var i = 0; i < 1000; i++) {
++ var r = inst.exports.hash(100);
++ if (r !== reference) {
++ pass = false;
++ throw new Error("Tiering mismatch at iteration " + i +
++ ": got 0x" + BigInt.asUintN(64, r).toString(16) +
++ ", expected 0x" + BigInt.asUintN(64, reference).toString(16));
++ }
++}
++
++assertEq(pass, true);
+diff --git a/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js b/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
+new file mode 100644
+index 000000000000..c11ce713f514
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
+@@ -0,0 +1,70 @@
++// |jit-test| skip-if: true
++//
++// Benchmark only, not a correctness test. Invoke manually as shown below.
++//
++// Microbenchmark for wasm compare+select fusion on PPC64.
++//
++// Run with:
++// $JS --wasm-compiler=optimizing \
++// js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
++//
++// Prints timings for four variants (i32, i64, f32, f64) that exercise a
++// tight loop of N select-on-compare operations. Used to decide whether
++// specializing lowerWasmCompareAndSelect beyond Int32 is worth the code.
++//
++// The kernel is a 10-stage select chain so the per-op overhead dominates
++// the loop frame. Each iteration touches 10 compare+select ops plus
++// ~trivial address math.
++
++const N_ITERS = 1_000_000;
++
++function buildModule(kind) {
++ const types = {i32: ['i32', 'i32', 'i32.lt_s'],
++ u32: ['i32', 'i32', 'i32.lt_u'],
++ i64: ['i64', 'i64', 'i64.lt_s'],
++ f32: ['f32', 'i32', 'f32.lt'],
++ f64: ['f64', 'i32', 'f64.lt']}[kind];
++ const [ty, iterTy, cmpOp] = types;
++ // Load a, b; compute chain of (b < a ? b : a) 10 times per iter.
++ const stage = `
++ (local.set $a
++ (select (result ${ty})
++ (local.get $b) (local.get $a)
++ (${cmpOp} (local.get $b) (local.get $a))))`;
++ const body = Array(10).fill(stage).join('\n');
++ const text = `
++ (module
++ (func (export "run") (param $n i32) (result ${ty})
++ (local $i i32) (local $a ${ty}) (local $b ${ty})
++ (local.set $a (${ty}.const ${kind === 'f32' || kind === 'f64' ? '3.14' : '12345'}))
++ (local.set $b (${ty}.const ${kind === 'f32' || kind === 'f64' ? '2.71' : '67890'}))
++ (loop $L
++ ${body}
++ (local.set $i (i32.add (local.get $i) (i32.const 1)))
++ (br_if $L (i32.lt_s (local.get $i) (local.get $n))))
++ (local.get $a)))`;
++ return new WebAssembly.Module(wasmTextToBinary(text));
++}
++
++function bench(kind) {
++ const inst = new WebAssembly.Instance(buildModule(kind));
++ // Warmup — ensure Ion compiles.
++ for (let i = 0; i < 3; i++) inst.exports.run(N_ITERS);
++ const t0 = dateNow();
++ const res = inst.exports.run(N_ITERS);
++ const t1 = dateNow();
++ return {ms: t1 - t0, result: res};
++}
++
++const kinds = ['i32', 'u32', 'i64', 'f32', 'f64'];
++const runs = 5;
++print(`\nwasm compare+select microbench (${N_ITERS.toLocaleString()} iters, 10 ops/iter):`);
++print(` Each timing is the best of ${runs} runs.\n`);
++for (const kind of kinds) {
++ const samples = [];
++ for (let i = 0; i < runs; i++) samples.push(bench(kind).ms);
++ samples.sort((a, b) => a - b);
++ const best = samples[0];
++ const median = samples[(runs / 2) | 0];
++ print(` ${kind.padEnd(4)} best=${best.toFixed(1)}ms median=${median.toFixed(1)}ms (samples: ${samples.map(s => s.toFixed(0)).join(',')})`);
++}
+diff --git a/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js b/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
+new file mode 100644
+index 000000000000..2aa9507751b6
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
+@@ -0,0 +1,107 @@
++// Regression test for PPC64 i64x2.extmul_{low,high}_i32x4_{s,u} when the
++// Ion register allocator picks dest == rhs.
++//
++// On PPC64 LE, the old implementation extracted lanes via mtvsrd/mfvsrd and
++// wrote the low-lane product to dest before reading rhs for the high lane.
++// `mtvsrd XT, RA` leaves DW1 of XT undefined (POWER9 zeros it), so when
++// dest aliased rhs the high-lane extract from rhs read garbage, producing
++// zero in the high i64 lane. On POWER8 the ExtractLaneToGPR fallback
++// additionally clobbered ScratchSimd128Reg between the two extracts.
++//
++// The loop below, discovered via wasm-reduce from argon2.wasm, reliably
++// reproduced the miscompile: the result's high i64 lane went to 0 on
++// POWER9 Ion / garbage on POWER8 Ion, while baseline kept the correct
++// value (lane1 = 48*48 = 2304 in the final iteration).
++
++var mod = new WebAssembly.Module(wasmTextToBinary(`
++ (module
++ (memory (export "mem") 1)
++ (func (export "run_u") (param $out i32)
++ (local $i i32) (local $v4 v128) (local $v5 v128) (local $v9 v128)
++ (loop
++ (local.set $v9
++ (i64x2.add
++ (v128.const i32x4 1 0 0 0)
++ (i64x2.extmul_low_i32x4_u (local.get $v5) (local.get $v9))))
++ (local.set $v4 (local.get $v9))
++ (local.set $v5 (local.get $v4))
++ (v128.store (i32.const 0) (local.get $v5))
++ (local.set $i (i32.add (local.get $i) (i32.const 1)))
++ (br_if 0 (i32.ne (local.get $i) (i32.const 8))))
++ (v128.store (local.get $out) (local.get $v9)))
++
++ (func (export "run_s") (param $out i32)
++ (local $i i32) (local $v v128)
++ (local.set $v (v128.const i32x4 2 3 5 7))
++ (loop
++ ;; Force dest==rhs aliasing: v = extmul_low_i32x4_s(const, v).
++ (local.set $v
++ (i64x2.extmul_low_i32x4_s
++ (v128.const i32x4 2 3 5 7)
++ (local.get $v)))
++ (local.set $i (i32.add (local.get $i) (i32.const 1)))
++ (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
++ (v128.store (local.get $out) (local.get $v)))
++
++ (func (export "run_high_u") (param $out i32)
++ (local $i i32) (local $v v128)
++ (local.set $v (v128.const i32x4 0 0 2 3))
++ (loop
++ (local.set $v
++ (i64x2.extmul_high_i32x4_u
++ (v128.const i32x4 0 0 2 3)
++ (local.get $v)))
++ (local.set $i (i32.add (local.get $i) (i32.const 1)))
++ (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
++ (v128.store (local.get $out) (local.get $v)))
++
++ (func (export "run_high_s") (param $out i32)
++ (local $i i32) (local $v v128)
++ (local.set $v (v128.const i32x4 0 0 2 3))
++ (loop
++ (local.set $v
++ (i64x2.extmul_high_i32x4_s
++ (v128.const i32x4 0 0 2 3)
++ (local.get $v)))
++ (local.set $i (i32.add (local.get $i) (i32.const 1)))
++ (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
++ (v128.store (local.get $out) (local.get $v))))
++`));
++
++function runAndCheck(inst) {
++ inst.exports.run_u(0);
++ // After 8 iterations, the value in memory should have lane1 == 2304 = 0x900.
++ // Bytes 8-15 (i64 lane 1, little-endian) = 0x0000000000000900.
++ var buf = new Uint8Array(inst.exports.mem.buffer, 0, 16);
++ var hex = Array.from(buf).map(b => b.toString(16).padStart(2,'0')).join('');
++ // Expect bytes 8-9 = "00 09" and bytes 10-15 = "00 00 00 00 00 00".
++ assertEq(hex.slice(16, 32), "0009000000000000");
++
++ inst.exports.run_s(16);
++ // After 2 iterations of v = extmul_low_s(const(2,3,5,7), v) starting v=(2,3,5,7):
++ // iter 1: i64x2 lane0 = 2*2 = 4, lane1 = 3*3 = 9.
++ // v becomes i32x4 [4, 0, 9, 0] (each i64 lane occupies two i32 lanes).
++ // iter 2: extmul_low_s reads i32 lanes 0, 1 of v = (4, 0).
++ // i64 lane0 = 2*4 = 8; i64 lane1 = 3*0 = 0.
++ var buf2 = new Uint8Array(inst.exports.mem.buffer, 16, 16);
++ var hex2 = Array.from(buf2).map(b => b.toString(16).padStart(2,'0')).join('');
++ assertEq(hex2, "08000000000000000000000000000000");
++
++ inst.exports.run_high_u(32);
++ // v = (0, 0, 2, 3). extmul_high picks lanes 2 and 3.
++ // iter 1: lane2_prod = 2*2 = 4; lane3_prod = 3*3 = 9. Result stored at bytes 0-7 (lane2_prod) and 8-15 (lane3_prod).
++ // iter 2: v now has i64x2 lane0 = 4, lane1 = 9, i.e. i32x4 lanes [4, 0, 9, 0].
++ // extmul_high_u(const(0,0,2,3), v) reads lanes 2, 3 of both:
++ // const lane2 = 2, lane3 = 3; v lane2 = 9, lane3 = 0.
++ // result: lane2_prod = 2*9 = 18 at bytes 0-7; lane3_prod = 3*0 = 0 at bytes 8-15.
++ var buf3 = new Uint8Array(inst.exports.mem.buffer, 32, 16);
++ var hex3 = Array.from(buf3).map(b => b.toString(16).padStart(2,'0')).join('');
++ assertEq(hex3, "12000000000000000000000000000000");
++
++ inst.exports.run_high_s(48);
++ var buf4 = new Uint8Array(inst.exports.mem.buffer, 48, 16);
++ var hex4 = Array.from(buf4).map(b => b.toString(16).padStart(2,'0')).join('');
++ assertEq(hex4, "12000000000000000000000000000000");
++}
++
++runAndCheck(new WebAssembly.Instance(mod));
+diff --git a/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js b/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
+new file mode 100644
+index 000000000000..d5f79a1840a6
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
+@@ -0,0 +1,179 @@
++// |jit-test| skip-if: !wasmSimdEnabled()
++//
++// Regression tests for PPC64 SIMD helpers that use VR1..VR5 as undeclared
++// scratch and silently corrupt live wasm v128 values the register allocator
++// has placed in those VRs.
++//
++// Background: PPC64 Simd128 lives in VR0..VR31. VR0 is non-allocatable
++// (= ScratchSimd128Reg); VR1..VR31 are allocatable. The helpers below
++// historically used VR1..VR5 as undeclared scratch:
++//
++// negInt8x16, negInt16x8 : clobber VR1 (all CPUs)
++// negInt32x4, negInt64x2 (POWER8 fallback) : clobber VR1 (POWER8 only)
++// extAddPairwiseInt8x16 (signed/unsigned) : clobber VR1, VR2, VR3
++// extAddPairwiseInt16x8 (signed/unsigned) : clobber VR1, VR2, VR3
++// unsignedWidenHighInt32x4 : clobber VR1
++//
++// Each test:
++// - loads `nLive` "preserve" v128 values from memory at offsets 16..16+16*nLive
++// - loads ONE additional "input" v128 = repeat(0x18) at offset 128
++// - applies the suspect helper to the input
++// - stores the nLive preserved values back to memory at offsets 0..16*nLive
++// - stores the helper result at offset 16*nLive
++//
++// Without the fix, one of the preserved locals (whichever the allocator
++// placed in the clobbered VR) reads back as the staged input value (0x18)
++// instead of its original. With the fix (the helper using ScratchSimd128Scope
++// or proper VR-namespace emit), all preserved locals retain their values.
++
++const PRESERVE_PATTERNS = [0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x29];
++const INPUT_BYTE = 0x18;
++
++function init(mem) {
++ // Slots at offset 16, 32, ..., 16+16*7 hold the preserve patterns.
++ for (let slot = 0; slot < PRESERVE_PATTERNS.length; slot++) {
++ for (let i = 0; i < 16; i++) {
++ mem[16 + slot * 16 + i] = PRESERVE_PATTERNS[slot];
++ }
++ }
++ // The helper input is at offset 128 (= 16 + 16*7 + 16 = 144? no, 16 + 16*8 = 144).
++ // Use a fixed offset PAST the preserve area. With nLive max 7, preserve uses
++ // 16..(16+16*7-1) = 16..127. Input goes at 144 to leave a 16-byte gap.
++ const INPUT_OFFSET = 144;
++ for (let i = 0; i < 16; i++) mem[INPUT_OFFSET + i] = INPUT_BYTE;
++}
++
++function repeat(byte) {
++ const a = new Array(16);
++ for (let i = 0; i < 16; i++) a[i] = byte;
++ return a;
++}
++
++// Verify nLive preserved slots match PRESERVE_PATTERNS at output offsets
++// 0..16*nLive, and that the result slot at 16*nLive matches `expectedResult`.
++function check(opName, mem, nLive, expectedResult) {
++ for (let slot = 0; slot < nLive; slot++) {
++ for (let i = 0; i < 16; i++) {
++ const got = mem[slot * 16 + i];
++ const want = PRESERVE_PATTERNS[slot];
++ assertEq(got, want,
++ `${opName}: live slot ${slot} byte ${i}: got 0x${got.toString(16)}, expected 0x${want.toString(16)} (allocator-clobbered VR?)`);
++ }
++ }
++ for (let i = 0; i < 16; i++) {
++ const got = mem[nLive * 16 + i];
++ const want = expectedResult[i];
++ assertEq(got, want,
++ `${opName}: result byte ${i}: got 0x${got.toString(16)}, expected 0x${want.toString(16)}`);
++ }
++}
++
++// Build a wasm module that:
++// - loads `nLive` preserve v128 locals from memory at offsets 16..16*nLive
++// - loads ONE input v128 from offset 144
++// - applies `op` to the input
++// - stores all `nLive + 1` v128 values back to memory at offsets 0..16*nLive
++function buildModule(op, nLive) {
++ const localDecls = [];
++ const initLoads = [];
++ const finalStores = [];
++ for (let i = 0; i < nLive; i++) {
++ localDecls.push(`(local $v${i} v128)`);
++ initLoads.push(`(local.set $v${i} (v128.load (i32.const ${16 + i * 16})))`);
++ finalStores.push(`(v128.store (i32.const ${i * 16}) (local.get $v${i}))`);
++ }
++ // The helper input + result.
++ localDecls.push(`(local $input v128)`);
++ initLoads.push(`(local.set $input (v128.load (i32.const 144)))`);
++ finalStores.push(`(v128.store (i32.const ${nLive * 16}) (local.get $input))`);
++
++ const text = `
++ (module
++ (memory (export "mem") 1)
++ (func (export "run")
++ ${localDecls.join('\n ')}
++ ${initLoads.join('\n ')}
++ (local.set $input (${op} (local.get $input)))
++ ${finalStores.join('\n ')}
++ )
++ )`;
++ return new WebAssembly.Module(wasmTextToBinary(text));
++}
++
++function runOne(opName, op, nLive, expectedResult) {
++ const mod = buildModule(op, nLive);
++ const inst = new WebAssembly.Instance(mod);
++ const mem = new Uint8Array(inst.exports.mem.buffer);
++ // Run many times so Baseline + Ion both see it.
++ for (let warm = 0; warm < 50; warm++) {
++ init(mem);
++ inst.exports.run();
++ check(opName, mem, nLive, expectedResult);
++ }
++}
++
++// ---- Negate helpers ----
++//
++// Input lane = 0x18 = 24. neg(24) = -24.
++// i8x16.neg : -24 mod 256 = 232 = 0xE8 per byte.
++// i16x8.neg : lane = 0x1818 = 6168, neg = -6168 mod 65536 = 0xE7E8.
++// Memory LE: per i16 lane bytes 0xE8 0xE7.
++// i32x4.neg : lane = 0x18181818 = 404232216, neg = 0xE7E7E7E8.
++// Memory LE: per i32 lane bytes 0xE8 0xE7 0xE7 0xE7.
++// i64x2.neg : lane = 0x1818181818181818, neg = 0xE7E7E7E7E7E7E7E8.
++// Memory LE: per i64 lane bytes 0xE8 0xE7 0xE7 0xE7 0xE7 0xE7 0xE7 0xE7.
++
++runOne("i8x16.neg", "i8x16.neg", 4, repeat(0xE8));
++runOne("i16x8.neg", "i16x8.neg", 4,
++ [0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7,
++ 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7]);
++runOne("i32x4.neg", "i32x4.neg", 4,
++ [0xE8,0xE7,0xE7,0xE7, 0xE8,0xE7,0xE7,0xE7,
++ 0xE8,0xE7,0xE7,0xE7, 0xE8,0xE7,0xE7,0xE7]);
++runOne("i64x2.neg", "i64x2.neg", 4,
++ [0xE8,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,
++ 0xE8,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7]);
++
++// ---- extAddPairwise helpers ----
++//
++// extadd_pairwise reads adjacent pairs and widens-then-sums them.
++// Input = repeat(0x18) = 24.
++// i16x8.extadd_pairwise_i8x16_s : 24 + 24 = 48 = 0x0030 per i16 lane.
++// Memory LE: 0x30 0x00 per lane × 8 lanes.
++// i16x8.extadd_pairwise_i8x16_u : same since input is positive.
++// i32x4.extadd_pairwise_i16x8_s : i16 lane = 0x1818 = 6168, sum = 12336 = 0x00003030.
++// Memory LE: 0x30 0x30 0x00 0x00 per lane × 4 lanes.
++// i32x4.extadd_pairwise_i16x8_u : same since input is positive.
++
++runOne("i16x8.extadd_pairwise_i8x16_s",
++ "i16x8.extadd_pairwise_i8x16_s", 4,
++ [0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00,
++ 0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00]);
++
++runOne("i16x8.extadd_pairwise_i8x16_u",
++ "i16x8.extadd_pairwise_i8x16_u", 4,
++ [0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00,
++ 0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00]);
++
++runOne("i32x4.extadd_pairwise_i16x8_s",
++ "i32x4.extadd_pairwise_i16x8_s", 4,
++ [0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00,
++ 0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00]);
++
++runOne("i32x4.extadd_pairwise_i16x8_u",
++ "i32x4.extadd_pairwise_i16x8_u", 4,
++ [0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00,
++ 0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00]);
++
++// ---- unsignedWidenHighInt32x4 ----
++//
++// i64x2.extend_high_i32x4_u: take the high two i32 lanes (lanes 2 and 3) of
++// the input, zero-extend each to i64, lay them out as i64x2.
++// Input lane = 0x18181818 (positive, =404232216).
++// Result: two i64 lanes, each = 0x0000000018181818.
++// Memory LE: per i64 lane bytes 0x18 0x18 0x18 0x18 0x00 0x00 0x00 0x00.
++
++runOne("i64x2.extend_high_i32x4_u",
++ "i64x2.extend_high_i32x4_u", 4,
++ [0x18,0x18,0x18,0x18,0x00,0x00,0x00,0x00,
++ 0x18,0x18,0x18,0x18,0x00,0x00,0x00,0x00]);
+diff --git a/js/src/jit-test/tests/wasm/profiling.js b/js/src/jit-test/tests/wasm/profiling.js
+index f4872b07cde8..ccd9690a262f 100644
+--- a/js/src/jit-test/tests/wasm/profiling.js
++++ b/js/src/jit-test/tests/wasm/profiling.js
+@@ -117,6 +117,13 @@ for (let type of ['f32', 'f64']) {
+ if (getBuildConfiguration("arm64")) {
+ continue;
+ }
++ // PPC64 inlines ceil/floor/trunc as frip/frim/friz (see
++ // Assembler-ppc64.h HasRoundInstruction), so no builtin thunk
++ // frames exist to profile. `nearest` still goes through the
++ // thunk because PPC64's frin is not IEEE round-to-even.
++ if (getBuildConfiguration("ppc64") && func !== 'nearest') {
++ continue;
++ }
+ test(`(module
+ (func (export "") (param ${type}) (result ${type})
+ local.get 0
+diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js b/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
+new file mode 100644
+index 000000000000..e2cf5def541e
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
+@@ -0,0 +1,49 @@
++// |jit-test| --wasm-compiler=optimizing; skip-if: !wasmSimdEnabled()
++//
++// Regression test for a PPC64 i32x4.extract_lane canonicalization bug.
++//
++// ExtractLaneToGPR leaves the adjacent lane in the high 32 bits of the GPR for
++// the unshifted lanes (0 and 2), so extractLaneInt32x4 must sign-extend its i32
++// result (as the i8x16/i16x8 extracts do). Without that, a consumer that reads
++// the full 64-bit register sees garbage in the high half. The POWER8 i32.ctz
++// emulation is such a consumer: its 64-bit neg/and. zero-check disagrees with
++// its 32-bit cntlzw, so ctz of a zero lane sitting next to a nonzero neighbour
++// returned -1 instead of 32.
++//
++// The vector comes from memory (runtime, not constant-foldable) and is passed
++// through a SIMD op so the extract is a genuine vector-register extract. Run
++// under MOZ_PPC64_FORCE_POWER8=1 to exercise the emulated ctz path; in every
++// other mode this is simply a correctness check.
++
++const ins = wasmEvalText(`(module
++ (memory (export "mem") 1)
++ (func $v (result v128)
++ ;; identity AND keeps the value in a vector register and forces a real
++ ;; extractLaneInt32x4 rather than an extract-of-load fold.
++ (v128.and (v128.load (i32.const 0)) (v128.const i32x4 -1 -1 -1 -1)))
++ (func (export "ctz0") (result i32) (i32.ctz (i32x4.extract_lane 0 (call $v))))
++ (func (export "ctz1") (result i32) (i32.ctz (i32x4.extract_lane 1 (call $v))))
++ (func (export "ctz2") (result i32) (i32.ctz (i32x4.extract_lane 2 (call $v))))
++ (func (export "ctz3") (result i32) (i32.ctz (i32x4.extract_lane 3 (call $v))))
++ (func (export "sext0") (result i64) (i64.extend_i32_s (i32x4.extract_lane 0 (call $v))))
++ (func (export "sext2") (result i64) (i64.extend_i32_s (i32x4.extract_lane 2 (call $v))))
++)`).exports;
++
++const mem = new Int32Array(ins.mem.buffer);
++function setLanes(a, b, c, d) { mem[0] = a; mem[1] = b; mem[2] = c; mem[3] = d; }
++
++// Each lane = 0 surrounded by nonzero neighbours: ctz must be 32, never -1.
++setLanes(0, -1, -1, -1); assertEq(ins.ctz0(), 32);
++setLanes(-1, 0, -1, -1); assertEq(ins.ctz1(), 32);
++setLanes(-1, -1, 0, -1); assertEq(ins.ctz2(), 32);
++setLanes(-1, -1, -1, 0); assertEq(ins.ctz3(), 32);
++
++// Nonzero lanes: ctz of the lane value, regardless of neighbours.
++setLanes(0x10, -1, 0x100000, -1);
++assertEq(ins.ctz0(), 4);
++assertEq(ins.ctz2(), 20);
++
++// A negative lane must sign-extend correctly (the canonicalization is extsw).
++setLanes(-2, 7, -3, 7);
++assertEq(ins.sext0(), -2n);
++assertEq(ins.sext2(), -3n);
+diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js b/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
+new file mode 100644
+index 000000000000..c38975dce859
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
+@@ -0,0 +1,30 @@
++// |jit-test| --wasm-compiler=optimizing; skip-if: !wasmSimdEnabled()
++//
++// Regression test for a PPC64 wasm Ion miscompile of `select` with a 32-bit
++// condition. visitWasmSelect tested the i32 condition with a 64-bit compare
++// (cmpdi / branchTestPtr). When the condition was zero in its low 32 bits but
++// had garbage in the high 32 bits (as can happen under register pressure), the
++// 64-bit test read it as non-zero and select returned the wrong operand.
++//
++// Here the condition `$x3` is 0; `select($x8, -952809828, $x3)` must therefore
++// return -952809828. The surrounding SIMD shuffle/bitselect/swizzle chain
++// supplies the v128 register pressure that exposed the bug.
++
++const wat = `(module (func (export "f") (result i64)
++ (local $x3 i32)(local $x7 i32)(local $x8 i32)
++ (local $w0 v128)(local $w1 v128)(local $w2 v128)(local $w3 v128)
++ (local $w4 v128)(local $w5 v128)(local $w6 v128)(local $w7 v128)
++ (local.set $w0 (v128.const i32x4 1708443454 1532218695 2107423610 -1265775005))
++ (local.set $w2 (v128.const i32x4 -752312355 -625530572 -844666500 832036408))
++ (local.set $w7 (v128.const i32x4 115003496 -970441117 -43225935 1874128204))
++ (local.set $w4 (i8x16.shuffle 15 18 13 2 6 22 20 8 19 10 12 8 11 5 6 28 (local.get $w7) (local.get $w3)))
++ (local.set $w6 (v128.bitselect (local.get $w4) (local.get $w0) (local.get $w7)))
++ (local.set $w1 (v128.const i32x4 -1635025264 -629784132 1517869852 1651771825))
++ (local.set $w7 (v128.bitselect (local.get $w6) (local.get $w2) (local.get $w2)))
++ (local.set $w6 (i8x16.swizzle (local.get $w1) (local.get $w7)))
++ (local.set $x3 (i32x4.extract_lane 2 (local.get $w6)))
++ (local.set $x7 (select (local.get $x8) (i32.const -952809828) (local.get $x3)))
++ (i64.extend_i32_s (local.get $x7))))`;
++
++const ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(wat)));
++assertEq(ins.exports.f(), -952809828n);
+diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js b/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
+new file mode 100644
+index 000000000000..4887f8df119c
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
+@@ -0,0 +1,64 @@
++// |jit-test| exitstatus: 0; skip-if: !wasmSimdEnabled()
++//
++// Regression test for the PPC64 wasm trap exit losing live v128 state.
++//
++// On PPC64, doubles live in the FPRs (VSR0-31) while wasm v128 values live in
++// the VRs (VSR32-63) -- disjoint physical pools. The trap exit's
++// RegsToPreserve used AllDoubleMask only, so a trap firing while a v128 was
++// live resumed with whatever the C++ interrupt path's libc left in the VRs
++// (glibc's misaligned vector memcpy leaves lvsl alignment-control byte
++// patterns there). Interrupt checks fire via traps at loop back-edges, where
++// a loop-carried v128 accumulator is exactly what is live.
++//
++// The loop below keeps an i32x4 accumulator live across every back-edge while
++// interrupts fire repeatedly; the callback does large misaligned copies to
++// pull libc's vector memcpy through the VRs. On an unfixed build (real
++// silicon; the simulator's VRs are insulated from native libc) the
++// accumulator comes back holding garbage and the final lane values are wrong.
++
++const ins = wasmEvalText(`(module
++ (func (export "run") (param $n i32) (result i32)
++ (local $acc v128)
++ (block $done
++ (loop $top
++ (br_if $done (i32.eqz (local.get $n)))
++ (local.set $acc (i32x4.add (local.get $acc) (v128.const i32x4 1 2 3 4)))
++ (local.set $n (i32.sub (local.get $n) (i32.const 1)))
++ (br $top)))
++ ;; Fold the four lanes so any lane corruption shows up.
++ (i32.xor
++ (i32.xor (i32x4.extract_lane 0 (local.get $acc))
++ (i32.rotl (i32x4.extract_lane 1 (local.get $acc)) (i32.const 8)))
++ (i32.xor (i32.rotl (i32x4.extract_lane 2 (local.get $acc)) (i32.const 16))
++ (i32.rotl (i32x4.extract_lane 3 (local.get $acc)) (i32.const 24)))))
++)`).exports;
++
++// Misaligned big copies drive glibc's lvsl/vperm memcpy path on PPC.
++const big = new Uint8Array(1 << 20);
++const src = big.subarray(1, (1 << 19) + 1);
++const dst = new Uint8Array(1 << 19);
++
++let fires = 0;
++function onInterrupt() {
++ fires++;
++ for (let i = 0; i < 4; i++) {
++ dst.set(src);
++ }
++ if (fires < 25) {
++ timeout(0.02, onInterrupt);
++ }
++ return true;
++}
++
++function expected(n) {
++ const r = (x, k) => ((x << k) | (x >>> (32 - k))) | 0;
++ const l = [n | 0, (2 * n) | 0, (3 * n) | 0, (4 * n) | 0];
++ return ((l[0] ^ r(l[1], 8)) ^ (r(l[2], 16) ^ r(l[3], 24))) | 0;
++}
++
++const N = 1 << 26;
++timeout(0.02, onInterrupt);
++const got = ins.run(N);
++// Cancel any pending watchdog before finishing.
++timeout(-1);
++assertEq(got, expected(N));
+diff --git a/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js b/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
+new file mode 100644
+index 000000000000..b7ec0d9548bb
+--- /dev/null
++++ b/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
+@@ -0,0 +1,7 @@
++// Regression test for a PPC64-specific wasm Ion crash in
++// CodeGenerator::visitWasmReduceAndBranchSimd128 — it called
++// LBlock::label() directly on the branch targets without going through
++// skipTrivialBlocks(), so a trivial goto-only successor tripped
++// LBlock::label()'s !isTrivial() assertion. Reduced from grantkot.com/poly
++// with wasm-reduce. Triggers the bug under --wasm-compiler=optimizing.
++new WebAssembly.Module(os.file.readFile(scriptdir + "/bug-ppc64-simd-reduce-and-branch.wasm", "binary"));
+diff --git a/js/src/jit-test/tests/wasm/simd/bug1946618.js b/js/src/jit-test/tests/wasm/simd/bug1946618.js
+index cc02d0d8dfd7..fcf3a2a35e82 100644
+--- a/js/src/jit-test/tests/wasm/simd/bug1946618.js
++++ b/js/src/jit-test/tests/wasm/simd/bug1946618.js
+@@ -48,7 +48,12 @@ for (let op of ["f32x4.relaxed_min", "f32x4.relaxed_max",
+ // baseline.
+ let result1 = i.exports.variant1();
+ let result2 = i.exports.variant2();
+- if (getBuildConfiguration("arm64")) {
++ if (getBuildConfiguration("ppc64")) {
++ // PPC64: xvminsp/xvmaxsp always returns the non-NaN operand,
++ // regardless of operand order. Both variants give zero (non-NaN).
++ assertEq(result1, 0);
++ assertEq(result2, 0);
++ } else if (getBuildConfiguration("arm64")) {
+ // The relaxed_min/max operation appears to propagate NaNs symmetrically
+ // from either arg
+ assertEq(result1, 65535);
+diff --git a/js/src/jit-test/tests/wasm/simd/ion-analysis.js b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+index d12af6e6fbc9..335f831ff6a9 100644
+--- a/js/src/jit-test/tests/wasm/simd/ion-analysis.js
++++ b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
+@@ -12,6 +12,7 @@
+ // generates the expected result.
+
+ var isArm64 = getBuildConfiguration("arm64");
++var isPPC64 = getBuildConfiguration("ppc64");
+
+ // 32-bit permutation that is not a rotation.
+ let perm32x4_pattern = [4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3];
+@@ -846,7 +847,7 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
+ let ops = { all_true: allTrue, any_true: anyTrue, bitmask };
+
+ for ( let op of ['any_true', 'all_true', 'bitmask'] ) {
+- let folded = op != 'bitmask' || (size == 2 && !isArm64);
++ let folded = op != 'bitmask' || (size == 2 && !isArm64 && !isPPC64);
+ let operation = op == 'any_true' ? 'v128.any_true' : `${ty128}.${op}`;
+ let positive =
+ wasmCompile(
+@@ -898,12 +899,12 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
+
+ // Bitselect with constant mask folded into shuffle operation
+
+-if (!isArm64) {
++if (!isArm64 && !isPPC64) {
+ wasmCompile(`
+ (module (func (param v128) (param v128) (result v128)
+ (v128.bitselect (local.get 0) (local.get 1) (v128.const i8x16 0 -1 -1 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 0))))
+ `);
+- assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
++ assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
+ }
+
+ // Library
+diff --git a/js/src/jit/Assembler.h b/js/src/jit/Assembler.h
+index 97c2e337625b..cb7244776605 100644
+--- a/js/src/jit/Assembler.h
++++ b/js/src/jit/Assembler.h
+@@ -19,6 +19,8 @@
+ # include "jit/loong64/Assembler-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/Assembler-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/Assembler-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/Assembler-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/BaselineIC.cpp b/js/src/jit/BaselineIC.cpp
+index c356538a024e..5ab631838f0e 100644
+--- a/js/src/jit/BaselineIC.cpp
++++ b/js/src/jit/BaselineIC.cpp
+@@ -120,6 +120,8 @@ AllocatableGeneralRegisterSet BaselineICAvailableGeneralRegs(size_t numInputs) {
+ MOZ_ASSERT(!regs.has(PseudoStackPointer));
+ MOZ_ASSERT(!regs.has(RealStackPointer));
+ MOZ_ASSERT(!regs.has(ICTailCallReg));
++#elif defined(JS_CODEGEN_PPC64)
++ regs.take(ICTailCallReg);
+ #endif
+ regs.take(ICStubReg);
+
+diff --git a/js/src/jit/CacheIRCompiler.cpp b/js/src/jit/CacheIRCompiler.cpp
+index 4eb952e497e3..ee4888495103 100644
+--- a/js/src/jit/CacheIRCompiler.cpp
++++ b/js/src/jit/CacheIRCompiler.cpp
+@@ -10302,6 +10302,14 @@ bool CacheIRCompiler::emitConcatStringsResult(StringOperandId lhsId,
+ liveRegs.add(ICTailCallReg);
+ #endif
+ liveRegs.takeUnchecked(output.valueReg());
++
++#ifdef JS_CODEGEN_PPC64
++ // On PPC64, LR is an SPR, not a GPR, so ICTailCallReg is a regular
++ // GPR that does not shadow LR. The inner bctrl will clobber LR, so
++ // save/restore it explicitly.
++ masm.xs_mflr(r0);
++ masm.push(r0);
++#endif
+ masm.PushRegsInMask(liveRegs);
+
+ // The stub expects lhs in CallTempReg0 and rhs in CallTempReg1.
+@@ -10322,11 +10330,19 @@ bool CacheIRCompiler::emitConcatStringsResult(StringOperandId lhsId,
+ masm.branchTestPtr(Assembler::Zero, CallTempReg5, CallTempReg5, &vmCall);
+ masm.tagValue(JSVAL_TYPE_STRING, CallTempReg5, output.valueReg());
+ masm.PopRegsInMask(liveRegs);
++#ifdef JS_CODEGEN_PPC64
++ masm.pop(r0);
++ masm.xs_mtlr(r0);
++#endif
+ masm.jump(&done);
+
+ masm.bind(&vmCall);
+ masm.setFramePushed(framePushed);
+ masm.PopRegsInMask(liveRegs);
++#ifdef JS_CODEGEN_PPC64
++ masm.pop(r0);
++ masm.xs_mtlr(r0);
++#endif
+ }
+
+ {
+diff --git a/js/src/jit/CodeGenerator.cpp b/js/src/jit/CodeGenerator.cpp
+index a1c01409e9f7..2a2c6007aec0 100644
+--- a/js/src/jit/CodeGenerator.cpp
++++ b/js/src/jit/CodeGenerator.cpp
+@@ -2519,6 +2519,12 @@ static bool PrepareAndExecuteRegExp(MacroAssembler& masm, Register regexp,
+ masm.computeEffectiveAddress(Address(FramePointer, ioOffset), temp2);
+ masm.PushRegsInMask(volatileRegs);
+ masm.setupUnalignedABICall(temp3);
++#if defined(JS_CODEGEN_PPC64)
++ // temp1 aliases argregs on this platform, so we need to reuse temp3
++ // or we'll stomp on the code pointer when we pass the first ABI argument.
++ masm.movePtr(codePointer, temp3);
++ codePointer = temp3;
++#endif
+ masm.passABIArg(temp2);
+ masm.callWithABI(codePointer);
+ masm.storeCallInt32Result(temp1);
+diff --git a/js/src/jit/CodeGenerator.h b/js/src/jit/CodeGenerator.h
+index 58c047dea41b..3781b9595dfd 100644
+--- a/js/src/jit/CodeGenerator.h
++++ b/js/src/jit/CodeGenerator.h
+@@ -23,6 +23,8 @@
+ # include "jit/loong64/CodeGenerator-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/CodeGenerator-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/CodeGenerator-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/CodeGenerator-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/EffectiveAddressAnalysis.cpp b/js/src/jit/EffectiveAddressAnalysis.cpp
+index e1bd1bd045ef..88697c06907c 100644
+--- a/js/src/jit/EffectiveAddressAnalysis.cpp
++++ b/js/src/jit/EffectiveAddressAnalysis.cpp
+@@ -60,7 +60,7 @@ static bool OffsetIsSmallEnough(int32_t imm) {
+ // `movn #imm`. arm32 is similar.
+ return imm >= -0xFFFF && imm <= 0xFFFF;
+ #elif defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_MIPS64)
++ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_PPC64)
+ return imm >= -0xFFF && imm <= 0xFFF;
+ #elif defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_NONE)
+ return true;
+diff --git a/js/src/jit/ExecutableAllocator.cpp b/js/src/jit/ExecutableAllocator.cpp
+index 340a63964b52..c9336fe8ec4e 100644
+--- a/js/src/jit/ExecutableAllocator.cpp
++++ b/js/src/jit/ExecutableAllocator.cpp
+@@ -306,13 +306,19 @@ void ExecutableAllocator::poisonCode(JSRuntime* rt,
+ }
+ }
+
+- // Make the pools executable again and drop references. We don't flush the
+- // ICache here to not add extra overhead.
++ // Make the pools executable again and drop references. On architectures with
++ // incoherent ICache (PPC64), we must flush to prevent stale instruction
++ // execution when code regions are reused after sweeping.
+ for (size_t i = 0; i < ranges.length(); i++) {
+ ExecutablePool* pool = ranges[i].pool;
+ if (pool->isMarked()) {
++#ifdef JS_CODEGEN_PPC64
++ reprotectPool(rt, pool, ProtectionSetting::Executable,
++ MustFlushICache::Yes);
++#else
+ reprotectPool(rt, pool, ProtectionSetting::Executable,
+ MustFlushICache::No);
++#endif
+ pool->unmark();
+ }
+ pool->release();
+diff --git a/js/src/jit/FlushICache.cpp b/js/src/jit/FlushICache.cpp
+index d3b1657a6be2..9590687c9803 100644
+--- a/js/src/jit/FlushICache.cpp
++++ b/js/src/jit/FlushICache.cpp
+@@ -13,7 +13,8 @@
+ # include "jit/arm64/vixl/Simulator-vixl.h"
+ #endif
+
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
++ defined(JS_CODEGEN_PPC64)
+
+ # ifdef __linux__
+ # include <linux/version.h>
+diff --git a/js/src/jit/FlushICache.h b/js/src/jit/FlushICache.h
+index af79da356ee5..58396f62ae0d 100644
+--- a/js/src/jit/FlushICache.h
++++ b/js/src/jit/FlushICache.h
+@@ -21,7 +21,7 @@ inline void FlushICache(void* code, size_t size) {
+ }
+ #elif (defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)) || \
+ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+
+ // Invalidate the given code range from the icache. This will also flush the
+ // execution context for this core. If this code is to be executed on another
+@@ -37,7 +37,7 @@ inline void FlushICache(void* code, size_t size) { MOZ_CRASH(); }
+ # error "Unknown architecture!"
+ #endif
+
+-#if (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)) || \
++#if (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)) || \
+ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+ defined(JS_CODEGEN_RISCV64)
+
+@@ -55,10 +55,11 @@ inline void FlushExecutionContext() { MOZ_CRASH(); }
+ inline bool CanFlushExecutionContextForAllThreads() { MOZ_CRASH(); }
+ inline void FlushExecutionContextForAllThreads() { MOZ_CRASH(); }
+
+-#elif defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
++#elif defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
++ defined(JS_CODEGEN_PPC64)
+
+-// ARM and ARM64 must flush the instruction pipeline of the current core
+-// before executing newly JIT'ed code. This will remove any stale data from
++// ARM, ARM64, and PPC64 must flush the instruction pipeline of the current
++// core before executing newly JIT'ed code. This will remove any stale data from
+ // the pipeline that may have referenced invalidated instructions.
+ //
+ // `FlushICache` will perform this for the thread that compiles the code, but
+diff --git a/js/src/jit/GenerateABIFunctionType.py b/js/src/jit/GenerateABIFunctionType.py
+index 04be10d1de2a..815427ec6771 100644
+--- a/js/src/jit/GenerateABIFunctionType.py
++++ b/js/src/jit/GenerateABIFunctionType.py
+@@ -538,6 +538,102 @@ def riscv64_simulator_dispatch(func_types):
+ return contents
+
+
++# PPC64 ELFv2 ABI: 8 int arg regs (r3-r10), 13 FP arg regs (f1-f13).
++# Each floating-point argument consumes BOTH a float-arg slot AND a
++# general-purpose-register shadow slot (capped at 8 GPR slots), matching
++# what GCC and the JIT's ABIArgGenerator do for ELFv2 PPC64LE. Without
++# the shadow, integer args following a float go to the wrong register
++# at the call boundary, producing a use-after-free / wrong-pointer crash
++# in the C callee. (Verified empirically by disassembling
++# NumberBigIntCompare(double, BigInt*) on real PPC64: BigInt* is read
++# from r4, not r3.)
++def ppc64_args(func_type):
++ contents = ""
++ numIntArgRegs = 8
++ numFloatArgRegs = 13
++ intRegIndex = 0
++ floatRegIndex = 0
++ stackOffset = 0
++ for i, arg in enumerate(func_type["args"]):
++ if i != 0:
++ contents += ", "
++
++ if arg == "General":
++ if intRegIndex == numIntArgRegs:
++ contents += f"sp_[{stackOffset}]"
++ stackOffset += 1
++ else:
++ contents += f"a{intRegIndex}_"
++ intRegIndex += 1
++ elif arg == "Int32":
++ if intRegIndex == numIntArgRegs:
++ contents += f"I32(sp_[{stackOffset}])"
++ stackOffset += 1
++ else:
++ contents += f"I32(a{intRegIndex}_)"
++ intRegIndex += 1
++ elif arg == "Int64":
++ if intRegIndex == numIntArgRegs:
++ contents += f"sp_[{stackOffset}]"
++ stackOffset += 1
++ else:
++ contents += f"a{intRegIndex}_"
++ intRegIndex += 1
++ elif arg == "Float32":
++ if floatRegIndex == numFloatArgRegs:
++ contents += f"*mozilla::BitwiseCast<float*>(sp_[{stackOffset}])"
++ stackOffset += 1
++ else:
++ contents += f"f{floatRegIndex}_s"
++ floatRegIndex += 1
++ # ELFv2: FP arg also consumes a GPR shadow slot.
++ if intRegIndex < numIntArgRegs:
++ intRegIndex += 1
++ elif arg == "Float64":
++ if floatRegIndex == numFloatArgRegs:
++ contents += f"mozilla::BitwiseCast<double>(sp_[{stackOffset}])"
++ stackOffset += 1
++ else:
++ contents += f"f{floatRegIndex}_d"
++ floatRegIndex += 1
++ # ELFv2: FP arg also consumes a GPR shadow slot.
++ if intRegIndex < numIntArgRegs:
++ intRegIndex += 1
++ assert intRegIndex <= numIntArgRegs
++ assert floatRegIndex <= numFloatArgRegs
++ return contents
++
++
++def ppc64_simulator_dispatch(func_types):
++ contents = ""
++ for func_type in func_types:
++ args = ppc64_args(func_type)
++ contents += f"case js::jit::Args_{func_type_name(func_type)}: {{\\\n"
++ contents += f" auto target = reinterpret_cast<Prototype_{func_type_name(func_type)}>(nativeFn);\\\n"
++ ret = func_type["ret"]
++ if ret == "Void":
++ contents += f" target({args});\\\n"
++ else:
++ contents += f" auto ret = target({args});\\\n"
++ if ret == "Void":
++ pass
++ elif ret == "General":
++ contents += " setCallResult(ret);\\\n"
++ elif ret == "Int32":
++ contents += " setCallResult(I64(ret));\\\n"
++ elif ret == "Int64":
++ contents += " setCallResult(ret);\\\n"
++ elif ret == "Float32":
++ contents += " setCallResultFloat(ret);\\\n"
++ elif ret == "Float64":
++ contents += " setCallResultDouble(ret);\\\n"
++ else:
++ raise ValueError(f"Unknown ret type: {ret}")
++ contents += " break;\\\n"
++ contents += "}\\\n"
++ return contents
++
++
+ def main(c_out, yaml_path):
+ func_types = load_yaml(yaml_path)
+
+@@ -581,4 +677,8 @@ def main(c_out, yaml_path):
+ contents += riscv64_simulator_dispatch(func_types)
+ contents += "\n"
+
++ contents += "#define ABI_FUNCTION_TYPE_PPC64_SIM_DISPATCH \\\n"
++ contents += ppc64_simulator_dispatch(func_types)
++ contents += "\n"
++
+ generate_header(c_out, "jit_ABIFunctionTypeGenerated_h", contents)
+diff --git a/js/src/jit/JitContext.cpp b/js/src/jit/JitContext.cpp
+index 79b22d9f249f..d399ddd36fd4 100644
+--- a/js/src/jit/JitContext.cpp
++++ b/js/src/jit/JitContext.cpp
+@@ -121,6 +121,10 @@ bool jit::InitializeJit() {
+ RVFlags::Init();
+ #endif
+
++#ifdef JS_CODEGEN_PPC64
++ PPC64Flags::Init();
++#endif
++
+ #ifndef JS_CODEGEN_NONE
+ MOZ_ASSERT(js::jit::CPUFlagsHaveBeenComputed());
+ #endif
+diff --git a/js/src/jit/JitFrames.cpp b/js/src/jit/JitFrames.cpp
+index 3653af3a21f4..bbd1376dec69 100644
+--- a/js/src/jit/JitFrames.cpp
++++ b/js/src/jit/JitFrames.cpp
+@@ -1824,7 +1824,12 @@ Value SnapshotIterator::allocationValue(const RValueAllocation& alloc,
+ return DoubleValue(fromRegister<double>(alloc.fpuReg()));
+
+ case RValueAllocation::FLOAT32_REG:
++#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
++ return Float32Value(
++ float(fromRegister<double>(alloc.fpuReg().asDouble())));
++#else
+ return Float32Value(fromRegister<float>(alloc.fpuReg()));
++#endif
+
+ case RValueAllocation::FLOAT32_STACK:
+ return Float32Value(ReadFrameFloat32Slot(fp_, alloc.stackOffset()));
+@@ -2625,7 +2630,12 @@ uintptr_t MachineState::read(Register reg) const {
+
+ template <typename T>
+ T MachineState::read(FloatRegister reg) const {
++#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
++ // PPC64/RISCV64 always store FloatRegisters as 64-bit doubles.
++ MOZ_ASSERT(reg.size() >= sizeof(T));
++#else
+ MOZ_ASSERT(reg.size() == sizeof(T));
++#endif
+
+ #if !defined(JS_CODEGEN_NONE) && !defined(JS_CODEGEN_WASM32)
+ if (state_.is<BailoutState>()) {
+diff --git a/js/src/jit/JitFrames.h b/js/src/jit/JitFrames.h
+index ac7005a5fcfc..490834e62fec 100644
+--- a/js/src/jit/JitFrames.h
++++ b/js/src/jit/JitFrames.h
+@@ -322,6 +322,16 @@ enum class ExceptionResumeKind : int32_t {
+
+ // Data needed to recover from an exception.
+ struct ResumeFromException {
++#if defined(JS_CODEGEN_PPC64)
++ // This struct is built on the stack as part of exception returns. Because
++ // it goes right on top of the stack, an ABI-compliant routine can wreck
++ // it, so we implement a minimum Power ISA linkage area (four doublewords).
++ void* _ppc_sp_;
++ void* _ppc_cr_;
++ void* _ppc_lr_;
++ void* _ppc_toc_;
++#endif
++
+ uint8_t* framePointer;
+ uint8_t* stackPointer;
+ uint8_t* target;
+@@ -373,7 +383,7 @@ struct ResumeFromException {
+ }
+ };
+
+-#if defined(JS_CODEGEN_ARM64)
++#if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+ static_assert(sizeof(ResumeFromException) % 16 == 0,
+ "ResumeFromException should be aligned");
+ #endif
+diff --git a/js/src/jit/LIR.cpp b/js/src/jit/LIR.cpp
+index 2f89fb407349..a9f634b7fcc1 100644
+--- a/js/src/jit/LIR.cpp
++++ b/js/src/jit/LIR.cpp
+@@ -779,8 +779,8 @@ bool LMoveGroup::add(LAllocation from, LAllocation to, LDefinition::Type type) {
+ // CodeGeneratorShared::CodeGeneratorShared and in general everywhere
+ // SimdMemoryAignment is used. Likely, alignment requirements will return.
+ # if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
+- defined(JS_CODEGEN_ARM64)
+- // No need for any check on x86/x64/arm64.
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
++ // No need for any check on x86/x64/arm64/ppc64.
+ # else
+ # error "Need to consider SIMD alignment on this target."
+ // The following code may be of use if we need alignment checks on
+diff --git a/js/src/jit/LIR.h b/js/src/jit/LIR.h
+index 3f4efeda7955..3354cb96b0cb 100644
+--- a/js/src/jit/LIR.h
++++ b/js/src/jit/LIR.h
+@@ -200,7 +200,7 @@ class LUse : public LAllocation {
+ static const uint32_t POLICY_BITS = 3;
+ static const uint32_t POLICY_SHIFT = 0;
+ static const uint32_t POLICY_MASK = (1 << POLICY_BITS) - 1;
+-#ifdef JS_CODEGEN_ARM64
++#if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+ static const uint32_t REG_BITS = 7;
+ #else
+ static const uint32_t REG_BITS = 6;
+@@ -619,12 +619,18 @@ class LDefinition {
+ Type type() const { return (Type)((bits_ >> TYPE_SHIFT) & TYPE_MASK); }
+
+ static bool isFloatRegCompatible(Type type, FloatRegister reg) {
++#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
++ if (type == FLOAT32 || type == DOUBLE) {
++ return reg.isSingle() || reg.isDouble();
++ }
++#else
+ if (type == FLOAT32) {
+ return reg.isSingle();
+ }
+ if (type == DOUBLE) {
+ return reg.isDouble();
+ }
++#endif
+ MOZ_ASSERT(type == SIMD128);
+ return reg.isSimd128();
+ }
+@@ -2292,6 +2298,8 @@ AnyRegister LAllocation::toAnyRegister() const {
+ # include "jit/loong64/LIR-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/LIR-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/LIR-ppc64.h"
+ #elif defined(JS_CODEGEN_MIPS64)
+ # include "jit/mips-shared/LIR-mips-shared.h"
+ # include "jit/mips64/LIR-mips64.h"
+diff --git a/js/src/jit/LIROps.yaml b/js/src/jit/LIROps.yaml
+index 315ff5fd5348..7fbea9e2ebec 100644
+--- a/js/src/jit/LIROps.yaml
++++ b/js/src/jit/LIROps.yaml
+@@ -2210,7 +2210,7 @@
+ oldval: WordSized
+ newval: WordSized
+ # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 4
+ #else
+ num_temps: 1
+@@ -2224,7 +2224,7 @@
+ index: WordSized
+ value: WordSized
+ # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 4
+ #else
+ num_temps: 1
+@@ -2238,7 +2238,7 @@
+ index: WordSized
+ value: WordSized
+ # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 4
+ #else
+ num_temps: 2
+@@ -2255,7 +2255,7 @@
+ # Needs additional temps on LL/SC platforms to extract/insert bits of word.
+ #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
+ num_temps: 1
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 3
+ #endif
+ mir_op: AtomicTypedArrayElementBinop
+@@ -3066,7 +3066,7 @@
+ operands:
+ ptr: WordSized
+ memoryBase: WordSized
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 1
+ #endif
+ mir_op: true
+@@ -3078,7 +3078,7 @@
+ memoryBase: WordSized
+ #ifdef JS_CODEGEN_ARM
+ num_temps: 2
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 1
+ #endif
+ mir_op: WasmLoad
+@@ -3088,7 +3088,7 @@
+ ptr: WordSized
+ value: WordSized
+ memoryBase: WordSized
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 1
+ #endif
+ mir_op: true
+@@ -3098,7 +3098,7 @@
+ ptr: WordSized
+ value: Int64
+ memoryBase: WordSized
+-#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ num_temps: 1
+ #endif
+ mir_op: WasmStore
+@@ -3128,7 +3128,7 @@
+ memoryBase: WordSized
+ #ifdef JS_CODEGEN_X86
+ num_temps: 1
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+ num_temps: 3
+ #endif
+@@ -3142,7 +3142,7 @@
+ memoryBase: WordSized
+ #ifdef JS_CODEGEN_X86
+ num_temps: 1
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+ num_temps: 3
+ #endif
+@@ -3154,7 +3154,7 @@
+ ptr: WordSized
+ value: WordSized
+ memoryBase: WordSized
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+ num_temps: 3
+ #elifdef JS_CODEGEN_X86
+@@ -3171,7 +3171,7 @@
+ ptr: WordSized
+ value: WordSized
+ memoryBase: WordSized
+-#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ # Temp that may be used on LL/SC platforms for extract/insert bits of word.
+ num_temps: 3
+ #elif defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
+@@ -4424,6 +4424,64 @@
+ mir_op: WasmAtomicExchangeHeap
+ #endif
+
++#ifdef JS_CODEGEN_PPC64
++- name: DivOrModI64
++ gen_boilerplate: false
++
++- name: UDivOrMod
++ gen_boilerplate: false
++
++- name: UDivOrModI64
++ gen_boilerplate: false
++
++- name: ModMaskI
++ result_type: WordSized
++ operands:
++ input: WordSized
++ arguments:
++ shift: int32_t
++ num_temps: 2
++ mir_op: Mod
++
++- name: WasmTruncateToInt64
++ result_type: Int64
++ operands:
++ input: WordSized
++ mir_op: true
++
++- name: Int64ToFloatingPoint
++ result_type: WordSized
++ operands:
++ input: Int64
++ mir_op: true
++
++- name: WasmCompareExchangeI64
++ result_type: Int64
++ operands:
++ ptr: WordSized
++ oldValue: Int64
++ newValue: Int64
++ memoryBase: WordSized
++ mir_op: WasmCompareExchangeHeap
++
++- name: WasmAtomicBinopI64
++ result_type: Int64
++ operands:
++ ptr: WordSized
++ value: Int64
++ memoryBase: WordSized
++ num_temps64: 1
++ mir_op: WasmAtomicBinopHeap
++
++- name: WasmAtomicExchangeI64
++ result_type: Int64
++ operands:
++ ptr: WordSized
++ value: Int64
++ memoryBase: WordSized
++ mir_op: WasmAtomicExchangeHeap
++#endif
++
+ #ifdef JS_CODEGEN_RISCV64
+ - name: UDiv
+ result_type: WordSized
+diff --git a/js/src/jit/Label.h b/js/src/jit/Label.h
+index 061bf978d26f..2a49ded9c967 100644
+--- a/js/src/jit/Label.h
++++ b/js/src/jit/Label.h
+@@ -23,7 +23,7 @@ struct LabelBase {
+ uint32_t offset_ : 31;
+
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ public:
+ #endif
+ static const uint32_t INVALID_OFFSET = 0x7fffffff; // UINT31_MAX.
+diff --git a/js/src/jit/Lowering.cpp b/js/src/jit/Lowering.cpp
+index 9c1c4b0df491..e3fe71ea9c83 100644
+--- a/js/src/jit/Lowering.cpp
++++ b/js/src/jit/Lowering.cpp
+@@ -1174,7 +1174,7 @@ void LIRGenerator::visitTest(MTest* test) {
+
+ #if defined(ENABLE_WASM_SIMD) && \
+ (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
+- defined(JS_CODEGEN_ARM64))
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64))
+ // Check if the operand for this test is an any_true/all_true SIMD operation.
+ // If it is, we want to emit an LWasmReduceAndBranchSimd128 node to avoid
+ // generating an intermediate boolean result.
+diff --git a/js/src/jit/Lowering.h b/js/src/jit/Lowering.h
+index b4f133758eb6..d973a68989b5 100644
+--- a/js/src/jit/Lowering.h
++++ b/js/src/jit/Lowering.h
+@@ -23,6 +23,8 @@
+ # include "jit/loong64/Lowering-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/Lowering-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/Lowering-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/Lowering-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/MacroAssembler-inl.h b/js/src/jit/MacroAssembler-inl.h
+index 4747a22e171b..d7385df895d5 100644
+--- a/js/src/jit/MacroAssembler-inl.h
++++ b/js/src/jit/MacroAssembler-inl.h
+@@ -39,6 +39,8 @@
+ # include "jit/loong64/MacroAssembler-loong64-inl.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/MacroAssembler-riscv64-inl.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/MacroAssembler-ppc64-inl.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/MacroAssembler-wasm32-inl.h"
+ #elif !defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/MacroAssembler.cpp b/js/src/jit/MacroAssembler.cpp
+index eb95d6c9e2c4..5b28e811c88d 100644
+--- a/js/src/jit/MacroAssembler.cpp
++++ b/js/src/jit/MacroAssembler.cpp
+@@ -6128,7 +6128,7 @@ static void MoveDataBlock(MacroAssembler& masm, Register base, int32_t from,
+ static constexpr Register scratch = ABINonArgReg0;
+ masm.push(scratch);
+ #elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ UseScratchRegisterScope temps(masm);
+ Register scratch = temps.Acquire();
+ #elif !defined(JS_CODEGEN_NONE)
+@@ -6315,6 +6315,12 @@ static void CollapseWasmFrameFast(MacroAssembler& masm,
+
+ #ifdef JS_USE_LINK_REGISTER
+ // RA is already in its place, just move stack.
++# ifdef JS_CODEGEN_PPC64
++ // PPC64's LR is not a GPR, so WasmTailCallRAScratchReg is a normal GPR
++ // (r14). We must explicitly move it to LR so the callee's prologue
++ // (pushReturnAddress) saves the correct return address.
++ masm.xs_mtlr(tempForRA);
++# endif
+ masm.addToStackPtr(Imm32(framePushedAtStart + newArgDest));
+ #else
+ // Push RA to new frame: store RA, restore temp, and move stack.
+@@ -6463,6 +6469,12 @@ static void CollapseWasmFrameSlow(MacroAssembler& masm,
+ #ifdef JS_USE_LINK_REGISTER
+ masm.freeStack(reserved);
+ // RA is already in its place, just move stack.
++# ifdef JS_CODEGEN_PPC64
++ // PPC64's LR is not a GPR, so WasmTailCallRAScratchReg is a normal GPR
++ // (r14). We must explicitly move the trampoline address to LR so the
++ // callee returns to the trampoline.
++ masm.xs_mtlr(tempForRA);
++# endif
+ masm.addToStackPtr(Imm32(framePushedAtStart + newArgDest));
+ #else
+ // Push RA to new frame: store RA, restore temp, and move stack.
+@@ -8527,7 +8539,7 @@ void MacroAssembler::debugAssertCanonicalInt32(Register r) {
+ breakpoint();
+ bind(&ok);
+ # elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ Label ok;
+ UseScratchRegisterScope temps(*this);
+ Register scratch = temps.Acquire();
+@@ -10567,6 +10579,15 @@ void MacroAssembler::orderedHashTableLookup(Register setOrMapObj,
+ unboxInt32(Address(setOrMapObj, TableObject::offsetOfLiveCount()), temp1);
+ branchTest32(Assembler::Zero, temp1, temp1, ¬Found);
+
++#if defined(JS_CODEGEN_PPC64)
++ // If this was preceded by a MoveGroup instruction, the hash may have been
++ // loaded algebraically since it's an Int32 (and thus sign-extended); the
++ // operation doesn't know to keep the upper bits clear, failing the assert.
++ if (isBigInt == IsBigInt::No) {
++ as_rldicl(hash, hash, 0, 32);
++ }
++#endif
++
+ #ifdef DEBUG
+ PushRegsInMask(LiveRegisterSet(RegisterSet::Volatile()));
+
+diff --git a/js/src/jit/MacroAssembler.h b/js/src/jit/MacroAssembler.h
+index 6c08bb554ca8..754e8642bb57 100644
+--- a/js/src/jit/MacroAssembler.h
++++ b/js/src/jit/MacroAssembler.h
+@@ -23,6 +23,8 @@
+ # include "jit/loong64/MacroAssembler-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/MacroAssembler-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/MacroAssembler-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/MacroAssembler-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+@@ -93,8 +95,9 @@
+ // }
+ // ////}}} check_macroassembler_style
+
+-#define ALL_ARCH mips64, arm, arm64, x86, x64, loong64, riscv64, wasm32
+-#define ALL_SHARED_ARCH arm, arm64, loong64, mips64, riscv64, x86_shared, wasm32
++#define ALL_ARCH mips64, arm, arm64, x86, x64, loong64, riscv64, ppc64, wasm32
++#define ALL_SHARED_ARCH \
++ arm, arm64, loong64, mips64, riscv64, ppc64, x86_shared, wasm32
+
+ // * How this macro works:
+ //
+@@ -140,6 +143,7 @@
+ #define DEFINED_ON_mips64
+ #define DEFINED_ON_loong64
+ #define DEFINED_ON_riscv64
++#define DEFINED_ON_ppc64
+ #define DEFINED_ON_wasm32
+ #define DEFINED_ON_none
+
+@@ -169,6 +173,9 @@
+ #elif defined(JS_CODEGEN_RISCV64)
+ # undef DEFINED_ON_riscv64
+ # define DEFINED_ON_riscv64 define
++#elif defined(JS_CODEGEN_PPC64)
++# undef DEFINED_ON_ppc64
++# define DEFINED_ON_ppc64 define
+ #elif defined(JS_CODEGEN_WASM32)
+ # undef DEFINED_ON_wasm32
+ # define DEFINED_ON_wasm32 define
+@@ -562,7 +569,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void Pop(const Register64 reg);
+ void PopFlags() DEFINED_ON(x86_shared);
+ void PopStackPtr()
+- DEFINED_ON(arm, mips64, x86_shared, loong64, riscv64, wasm32);
++ DEFINED_ON(arm, mips64, x86_shared, loong64, riscv64, ppc64, wasm32);
+
+ // Move the stack pointer based on the requested amount.
+ void adjustStack(int amount);
+@@ -620,9 +627,9 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ // These do not adjust framePushed().
+ void pushReturnAddress()
+- DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++ DEFINED_ON(mips64, arm, arm64, loong64, riscv64, ppc64, wasm32);
+ void popReturnAddress()
+- DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++ DEFINED_ON(mips64, arm, arm64, loong64, riscv64, ppc64, wasm32);
+
+ // Useful for dealing with two-valued returns.
+ void moveRegPair(Register src0, Register src1, Register dst0, Register dst1,
+@@ -641,7 +648,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ CodeOffset farJumpWithPatch() PER_SHARED_ARCH;
+ void patchFarJump(CodeOffset farJump, uint32_t targetOffset) PER_SHARED_ARCH;
+ static void patchFarJump(uint8_t* farJump, uint8_t* target)
+- DEFINED_ON(arm, arm64, x86_shared, loong64, mips64, riscv64);
++ DEFINED_ON(arm, arm64, x86_shared, loong64, mips64, riscv64, ppc64);
+
+ // Emit a nop that can be patched to and from a nop and a call with int32
+ // relative displacement.
+@@ -667,9 +674,9 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // target behaviour is only provided for `n` in the range 0 .. 2^31-1
+ // inclusive.
+ CodeOffset move32WithPatch(Register dest)
+- DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64);
++ DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64, ppc64);
+ void patchMove32(CodeOffset offset, Imm32 n)
+- DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64);
++ DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64, ppc64);
+
+ public:
+ // ===============================================================
+@@ -1174,13 +1181,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ inline void mulPtr(ImmWord rhs, Register srcDest) PER_ARCH;
+
+ inline void mul64(const Register64& rhs, const Register64& srcDest)
+- DEFINED_ON(x64, arm64, mips64, loong64, riscv64);
++ DEFINED_ON(x64, arm64, mips64, loong64, riscv64, ppc64);
+ inline void mul64(const Operand& src, const Register64& dest) DEFINED_ON(x64);
+ inline void mul64(const Operand& src, const Register64& dest,
+ const Register temp) DEFINED_ON(x64);
+ inline void mul64(Imm64 imm, const Register64& dest) PER_ARCH;
+ inline void mul64(Imm64 imm, const Register64& dest, const Register temp)
+- DEFINED_ON(x86, x64, arm, mips64, loong64, riscv64);
++ DEFINED_ON(x86, x64, arm, mips64, loong64, riscv64, ppc64);
+ inline void mul64(const Register64& src, const Register64& dest,
+ const Register temp) PER_ARCH;
+ inline void mul64(const Register64& src1, const Register64& src2,
+@@ -1202,11 +1209,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On ARM, the chip must have hardware division instructions.
+ inline void quotient32(Register lhs, Register rhs, Register dest,
+ bool isUnsigned)
+- DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++ DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32, ppc64);
+
+ inline void quotient64(Register lhs, Register rhs, Register dest,
+ bool isUnsigned)
+- DEFINED_ON(arm64, loong64, mips64, riscv64);
++ DEFINED_ON(arm64, loong64, mips64, riscv64, ppc64);
+
+ // As above, but lhs and dest must be eax and tempEdx must be edx.
+ inline void quotient32(Register lhs, Register rhs, Register dest,
+@@ -1219,11 +1226,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On ARM, the chip must have hardware division instructions.
+ inline void remainder32(Register lhs, Register rhs, Register dest,
+ bool isUnsigned)
+- DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
++ DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32, ppc64);
+
+ inline void remainder64(Register lhs, Register rhs, Register dest,
+ bool isUnsigned)
+- DEFINED_ON(arm64, loong64, mips64, riscv64);
++ DEFINED_ON(arm64, loong64, mips64, riscv64, ppc64);
+
+ // As above, but lhs and dest must be eax and tempEdx must be edx.
+ inline void remainder32(Register lhs, Register rhs, Register dest,
+@@ -2080,7 +2087,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ template <typename T>
+ void branchValueIsNurseryCellImpl(Condition cond, const T& value,
+ Register temp, Label* label)
+- DEFINED_ON(arm64, x64, mips64, loong64, riscv64);
++ DEFINED_ON(arm64, x64, mips64, loong64, riscv64, ppc64);
+
+ template <typename T>
+ inline void branchTestUndefinedImpl(Condition cond, const T& t, Label* label)
+@@ -2245,7 +2252,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // from all the other registers, on all supported targets.
+ inline void wasmAddSubI128HI64(Register lhsLo, Register lhsHi, Register rhsLo,
+ Register rhsHi, Register output, bool isAdd)
+- DEFINED_ON(x64, arm64, riscv64, loong64, mips64);
++ DEFINED_ON(x64, arm64, riscv64, loong64, mips64, ppc64);
+
+ // Produces the top 64 bits of the 128-bit value `RAX *widen rhs`. The result
+ // will be in RAX. RDX is trashed. `rhs` may not be RAX or RDX. Callers
+@@ -2256,7 +2263,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // what the registers may be.
+ inline void wasmMulI64WideHI64(Register lhs, Register rhs, Register output,
+ bool isSigned)
+- DEFINED_ON(arm64, riscv64, loong64, mips64);
++ DEFINED_ON(arm64, riscv64, loong64, mips64, ppc64);
+
+ // ========================================================================
+ // Canonicalization primitives.
+@@ -2355,68 +2362,68 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Moves
+
+ inline void moveSimd128(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Constants
+
+ inline void loadConstantSimd128(const SimdConstant& v, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Splat
+
+ inline void splatX16(Register src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void splatX16(uint32_t srcLane, FloatRegister src, FloatRegister dest)
+ DEFINED_ON(arm64);
+
+ inline void splatX8(Register src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void splatX8(uint32_t srcLane, FloatRegister src, FloatRegister dest)
+ DEFINED_ON(arm64);
+
+ inline void splatX4(Register src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void splatX4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void splatX2(Register64 src, FloatRegister dest)
+- DEFINED_ON(x86, x64, arm64);
++ DEFINED_ON(x86, x64, arm64, ppc64);
+
+ inline void splatX2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Extract lane as scalar. Float extraction does not canonicalize the value.
+
+ inline void extractLaneInt8x16(uint32_t lane, FloatRegister src,
+- Register dest) DEFINED_ON(x86_shared, arm64);
++ Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtractLaneInt8x16(uint32_t lane, FloatRegister src,
+ Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extractLaneInt16x8(uint32_t lane, FloatRegister src,
+- Register dest) DEFINED_ON(x86_shared, arm64);
++ Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtractLaneInt16x8(uint32_t lane, FloatRegister src,
+ Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extractLaneInt32x4(uint32_t lane, FloatRegister src,
+- Register dest) DEFINED_ON(x86_shared, arm64);
++ Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extractLaneInt64x2(uint32_t lane, FloatRegister src,
+- Register64 dest) DEFINED_ON(x86, x64, arm64);
++ Register64 dest) DEFINED_ON(x86, x64, arm64, ppc64);
+
+ inline void extractLaneFloat32x4(uint32_t lane, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extractLaneFloat64x2(uint32_t lane, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Replace lane value
+
+@@ -2425,21 +2432,21 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void replaceLaneInt8x16(unsigned lane, Register rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void replaceLaneInt16x8(unsigned lane, FloatRegister lhs, Register rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void replaceLaneInt16x8(unsigned lane, Register rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void replaceLaneInt32x4(unsigned lane, FloatRegister lhs, Register rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void replaceLaneInt32x4(unsigned lane, Register rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void replaceLaneInt64x2(unsigned lane, FloatRegister lhs,
+ Register64 rhs, FloatRegister dest)
+@@ -2447,7 +2454,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void replaceLaneInt64x2(unsigned lane, Register64 rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86, x64, arm64);
++ DEFINED_ON(x86, x64, arm64, ppc64);
+
+ inline void replaceLaneFloat32x4(unsigned lane, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+@@ -2455,7 +2462,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void replaceLaneFloat64x2(unsigned lane, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+@@ -2463,7 +2470,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Shuffle - blend and permute with immediate indices, and its many
+ // specializations. Lane values other than those mentioned are illegal.
+@@ -2471,11 +2478,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // lane values 0..31
+ inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Lane values must be 0 (select from lhs) or FF (select from rhs).
+ // The behavior is undefined for lane values that are neither 0 nor FF.
+@@ -2502,39 +2509,39 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // The implementation works effectively for I8x16, I16x8, I32x4, and I64x2.
+ inline void laneSelectSimd128(FloatRegister mask, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveHighInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveHighInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveHighInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveHighInt64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveLowInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveLowInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveLowInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void interleaveLowInt64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Permute - permute with immediate indices.
+
+@@ -2544,7 +2551,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ // lane values 0..7
+ inline void permuteInt16x8(const uint16_t lanes[8], FloatRegister src,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ // lane values 0..3 [sic].
+ inline void permuteHighInt16x8(const uint16_t lanes[4], FloatRegister src,
+@@ -2562,80 +2569,80 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // low_16_bytes_of((lhs ++ rhs) >> shift*8), shift must be < 16
+ inline void concatAndRightShiftSimd128(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, uint32_t shift)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Rotate right by immediate count:
+ // low_16_bytes_of((src ++ src) >> shift*8), shift must be < 16
+ inline void rotateRightSimd128(FloatRegister src, FloatRegister dest,
+- uint32_t shift) DEFINED_ON(arm64);
++ uint32_t shift) DEFINED_ON(arm64, ppc64);
+
+ // Shift bytes with immediate count, shifting in zeroes. Shift count 0..15.
+
+ inline void leftShiftSimd128(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void rightShiftSimd128(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Zero extend int values.
+
+ inline void zeroExtend8x16To16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+ inline void zeroExtend8x16To32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+ inline void zeroExtend8x16To64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+ inline void zeroExtend16x8To32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+ inline void zeroExtend16x8To64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+ inline void zeroExtend32x4To64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Reverse bytes in lanes.
+
+ inline void reverseInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void reverseInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void reverseInt64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Swizzle - permute with variable indices. `rhs` holds the lanes parameter.
+
+ inline void swizzleInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void swizzleInt8x16Relaxed(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Integer Add
+
+ inline void addInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void addInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void addInt32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void addInt64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addInt64x2(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2643,13 +2650,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Integer Subtract
+
+ inline void subInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void subInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2658,24 +2665,24 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void subInt32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subInt64x2(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void subInt64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Integer Multiply
+
+ inline void mulInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void mulInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void mulInt32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void mulInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2691,100 +2698,100 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void mulInt64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, FloatRegister temp1,
+- FloatRegister temp2) DEFINED_ON(arm64);
++ FloatRegister temp2) DEFINED_ON(arm64, ppc64);
+
+ // Note for the extMul opcodes, the NxM designation is for the input lanes;
+ // the output lanes are twice as wide.
+ inline void extMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void q15MulrSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Integer Negate
+
+ inline void negInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void negInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void negInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void negInt64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Saturating integer add
+
+ inline void addSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedAddSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedAddSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void addSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedAddSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedAddSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2792,27 +2799,27 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Saturating integer subtract
+
+ inline void subSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedSubSatInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedSubSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void subSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedSubSatInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedSubSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2820,40 +2827,40 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Lane-wise integer minimum
+
+ inline void minInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedMinInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedMinInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void minInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedMinInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedMinInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void minInt32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedMinInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedMinInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2861,40 +2868,40 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Lane-wise integer maximum
+
+ inline void maxInt8x16(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedMaxInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedMaxInt8x16(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void maxInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedMaxInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedMaxInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void maxInt32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedMaxInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedMaxInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -2903,25 +2910,25 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void unsignedAverageInt8x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedAverageInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Lane-wise integer absolute value
+
+ inline void absInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void absInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void absInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void absInt64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Left shift by scalar. Immediates and variable shifts must have been
+ // masked; shifts of zero will work but may or may not generate code.
+@@ -2930,41 +2937,41 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ FloatRegister temp) DEFINED_ON(x86_shared);
+
+ inline void leftShiftInt8x16(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void leftShiftInt8x16(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void leftShiftInt16x8(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void leftShiftInt16x8(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void leftShiftInt16x8(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void leftShiftInt32x4(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void leftShiftInt32x4(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void leftShiftInt32x4(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void leftShiftInt64x2(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void leftShiftInt64x2(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void leftShiftInt64x2(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Right shift by scalar. Immediates and variable shifts must have been
+ // masked; shifts of zero will work but may or may not generate code.
+@@ -2973,82 +2980,82 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ FloatRegister temp) DEFINED_ON(x86_shared);
+
+ inline void rightShiftInt8x16(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void rightShiftInt8x16(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedRightShiftInt8x16(Register rhs, FloatRegister lhsDest,
+ FloatRegister temp)
+ DEFINED_ON(x86_shared);
+
+ inline void unsignedRightShiftInt8x16(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void rightShiftInt16x8(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void rightShiftInt16x8(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void rightShiftInt16x8(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedRightShiftInt16x8(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void unsignedRightShiftInt16x8(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void rightShiftInt32x4(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void rightShiftInt32x4(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void rightShiftInt32x4(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedRightShiftInt32x4(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void unsignedRightShiftInt32x4(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void rightShiftInt64x2(Register rhs, FloatRegister lhsDest,
+ FloatRegister temp) DEFINED_ON(x86_shared);
+
+ inline void rightShiftInt64x2(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void rightShiftInt64x2(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void unsignedRightShiftInt64x2(Register rhs, FloatRegister lhsDest)
+ DEFINED_ON(x86_shared);
+
+ inline void unsignedRightShiftInt64x2(FloatRegister lhs, Register rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Sign replication operation
+
+@@ -3067,47 +3074,47 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Bitwise and, or, xor, not
+
+ inline void bitwiseAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseAndSimd128(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseAndSimd128(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void bitwiseOrSimd128(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseOrSimd128(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseOrSimd128(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void bitwiseXorSimd128(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseXorSimd128(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseXorSimd128(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void bitwiseNotSimd128(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Bitwise AND with compliment: dest = lhs & ~rhs, note only arm64 can do it.
+ inline void bitwiseAndNotSimd128(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister lhsDest) DEFINED_ON(arm64);
++ FloatRegister lhsDest) DEFINED_ON(arm64, ppc64);
+
+ // Bitwise AND with complement: dest = ~lhs & rhs, note this is not what Wasm
+ // wants but what the x86 hardware offers. Hence the name.
+
+ inline void bitwiseNotAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void bitwiseNotAndSimd128(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister lhsDest)
+@@ -3120,34 +3127,34 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ FloatRegister temp) DEFINED_ON(x86_shared);
+
+ inline void bitwiseSelectSimd128(FloatRegister onTrue, FloatRegister onFalse,
+- FloatRegister maskDest) DEFINED_ON(arm64);
++ FloatRegister maskDest) DEFINED_ON(arm64, ppc64);
+
+ // Population count
+
+ inline void popcntInt8x16(FloatRegister src, FloatRegister dest,
+- FloatRegister temp) DEFINED_ON(x86_shared);
++ FloatRegister temp) DEFINED_ON(x86_shared, ppc64);
+
+ inline void popcntInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ // Any lane true, ie, any bit set
+
+ inline void anyTrueSimd128(FloatRegister src, Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // All lanes true
+
+ inline void allTrueInt8x16(FloatRegister src, Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void allTrueInt16x8(FloatRegister src, Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void allTrueInt32x4(FloatRegister src, Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void allTrueInt64x2(FloatRegister src, Register dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Bitmask, ie extract and compress high bits of all lanes
+
+@@ -3155,31 +3162,31 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ DEFINED_ON(x86_shared);
+
+ inline void bitmaskInt8x16(FloatRegister src, Register dest,
+- FloatRegister temp) DEFINED_ON(arm64);
++ FloatRegister temp) DEFINED_ON(arm64, ppc64);
+
+ inline void bitmaskInt16x8(FloatRegister src, Register dest)
+ DEFINED_ON(x86_shared);
+
+ inline void bitmaskInt16x8(FloatRegister src, Register dest,
+- FloatRegister temp) DEFINED_ON(arm64);
++ FloatRegister temp) DEFINED_ON(arm64, ppc64);
+
+ inline void bitmaskInt32x4(FloatRegister src, Register dest)
+ DEFINED_ON(x86_shared);
+
+ inline void bitmaskInt32x4(FloatRegister src, Register dest,
+- FloatRegister temp) DEFINED_ON(arm64);
++ FloatRegister temp) DEFINED_ON(arm64, ppc64);
+
+ inline void bitmaskInt64x2(FloatRegister src, Register dest)
+ DEFINED_ON(x86_shared);
+
+ inline void bitmaskInt64x2(FloatRegister src, Register dest,
+- FloatRegister temp) DEFINED_ON(arm64);
++ FloatRegister temp) DEFINED_ON(arm64, ppc64);
+
+ // Comparisons (integer and floating-point)
+
+ inline void compareInt8x16(Assembler::Condition cond, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // On x86_shared, limited to !=, ==, <=, >
+ inline void compareInt8x16(Assembler::Condition cond, FloatRegister lhs,
+@@ -3189,15 +3196,15 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On arm64, use any integer comparison condition.
+ inline void compareInt8x16(Assembler::Condition cond, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void compareInt16x8(Assembler::Condition cond, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void compareInt16x8(Assembler::Condition cond, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // On x86_shared, limited to !=, ==, <=, >
+ inline void compareInt16x8(Assembler::Condition cond, FloatRegister lhs,
+@@ -3207,7 +3214,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On x86_shared, limited to !=, ==, <=, >
+ inline void compareInt32x4(Assembler::Condition cond, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void compareInt32x4(Assembler::Condition cond, FloatRegister lhs,
+ const SimdConstant& rhs, FloatRegister dest)
+@@ -3216,7 +3223,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On arm64, use any integer comparison condition.
+ inline void compareInt32x4(Assembler::Condition cond, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void compareForEqualityInt64x2(Assembler::Condition cond,
+ FloatRegister lhs, FloatRegister rhs,
+@@ -3230,15 +3237,15 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ DEFINED_ON(x86_shared);
+
+ inline void compareInt64x2(Assembler::Condition cond, FloatRegister rhs,
+- FloatRegister lhsDest) DEFINED_ON(arm64);
++ FloatRegister lhsDest) DEFINED_ON(arm64, ppc64);
+
+ inline void compareInt64x2(Assembler::Condition cond, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ inline void compareFloat32x4(Assembler::Condition cond, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // On x86_shared, limited to ==, !=, <, <=
+ inline void compareFloat32x4(Assembler::Condition cond, FloatRegister lhs,
+@@ -3249,11 +3256,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On arm64, use any float-point comparison condition.
+ inline void compareFloat32x4(Assembler::Condition cond, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void compareFloat64x2(Assembler::Condition cond, FloatRegister rhs,
+ FloatRegister lhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // On x86_shared, limited to ==, !=, <, <=
+ inline void compareFloat64x2(Assembler::Condition cond, FloatRegister lhs,
+@@ -3264,7 +3271,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // On arm64, use any float-point comparison condition.
+ inline void compareFloat64x2(Assembler::Condition cond, FloatRegister lhs,
+ FloatRegister rhs, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Load
+
+@@ -3273,92 +3280,92 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline FaultingCodeOffset loadUnalignedSimd128(const Address& src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline FaultingCodeOffset loadUnalignedSimd128(const BaseIndex& src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Store
+
+ inline FaultingCodeOffset storeUnalignedSimd128(FloatRegister src,
+ const Address& dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline FaultingCodeOffset storeUnalignedSimd128(FloatRegister src,
+ const BaseIndex& dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating point negation
+
+ inline void negFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void negFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating point absolute value
+
+ inline void absFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void absFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // NaN-propagating minimum
+
+ inline void minFloat32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, FloatRegister temp1,
+- FloatRegister temp2) DEFINED_ON(x86_shared);
++ FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+
+ inline void minFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ inline void minFloat32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void minFloat64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, FloatRegister temp1,
+- FloatRegister temp2) DEFINED_ON(x86_shared);
++ FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+
+ inline void minFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ inline void minFloat64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ // NaN-propagating maximum
+
+ inline void maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, FloatRegister temp1,
+- FloatRegister temp2) DEFINED_ON(x86_shared);
++ FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+
+ inline void maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ inline void maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ inline void maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, FloatRegister temp1,
+- FloatRegister temp2) DEFINED_ON(x86_shared);
++ FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
+
+ inline void maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ inline void maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(arm64);
++ FloatRegister dest) DEFINED_ON(arm64, ppc64);
+
+ // Floating add
+
+ inline void addFloat32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void addFloat64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void addFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3366,13 +3373,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Floating subtract
+
+ inline void subFloat32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void subFloat64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void subFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3380,13 +3387,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Floating division
+
+ inline void divFloat32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void divFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void divFloat64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void divFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3394,13 +3401,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Floating Multiply
+
+ inline void mulFloat32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void mulFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void mulFloat64x2(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void mulFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+@@ -3408,91 +3415,91 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Pairwise add
+
+ inline void extAddPairwiseInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtAddPairwiseInt8x16(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void extAddPairwiseInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedExtAddPairwiseInt16x8(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating square root
+
+ inline void sqrtFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void sqrtFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Integer to floating point with rounding
+
+ inline void convertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedConvertInt32x4ToFloat32x4(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void convertInt32x4ToFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedConvertInt32x4ToFloat64x2(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating point to integer with saturation
+
+ inline void truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
+ FloatRegister dest,
+ FloatRegister temp)
+- DEFINED_ON(x86_shared);
++ DEFINED_ON(x86_shared, ppc64);
+
+ inline void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ inline void truncSatFloat64x2ToInt32x4(FloatRegister src, FloatRegister dest,
+ FloatRegister temp)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedTruncSatFloat64x2ToInt32x4(FloatRegister src,
+ FloatRegister dest,
+ FloatRegister temp)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void truncFloat32x4ToInt32x4Relaxed(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedTruncFloat32x4ToInt32x4Relaxed(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void truncFloat64x2ToInt32x4Relaxed(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedTruncFloat64x2ToInt32x4Relaxed(FloatRegister src,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating point narrowing
+
+ inline void convertFloat64x2ToFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating point widening
+
+ inline void convertFloat32x4ToFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Integer to integer narrowing
+
+@@ -3500,65 +3507,65 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void narrowInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedNarrowInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedNarrowInt16x8(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void narrowInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void narrowInt32x4(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedNarrowInt32x4(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void unsignedNarrowInt32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Integer to integer widening
+
+ inline void widenLowInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void widenHighInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedWidenLowInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedWidenHighInt8x16(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void widenLowInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void widenHighInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedWidenLowInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedWidenHighInt16x8(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void widenLowInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedWidenLowInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void widenHighInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void unsignedWidenHighInt32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Compare-based minimum/maximum
+ //
+@@ -3570,47 +3577,47 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
+ FloatRegister lhsOrLhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMinFloat32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
+ FloatRegister lhsOrLhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMinFloat64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
+ FloatRegister lhsOrLhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMaxFloat32x4(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
+ FloatRegister lhsOrLhsDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void pseudoMaxFloat64x2(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Widening/pairwise integer dot product
+
+ inline void widenDotInt16x8(FloatRegister lhs, FloatRegister rhs,
+- FloatRegister dest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void widenDotInt16x8(FloatRegister lhs, const SimdConstant& rhs,
+ FloatRegister dest) DEFINED_ON(x86_shared);
+
+ inline void dotInt8x16Int7x16(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void dotInt8x16Int7x16ThenAdd(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+@@ -3618,81 +3625,81 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ inline void dotInt8x16Int7x16ThenAdd(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest, FloatRegister temp)
+- DEFINED_ON(arm64);
++ DEFINED_ON(arm64, ppc64);
+
+ // Floating point rounding
+
+ inline void ceilFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void ceilFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void floorFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void floorFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void truncFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void truncFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void nearestFloat32x4(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ // Floating multiply-accumulate: srcDest [+-]= src1 * src2
+
+ inline void fmaFloat32x4(FloatRegister src1, FloatRegister src2,
+- FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister srcDest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void fnmaFloat32x4(FloatRegister src1, FloatRegister src2,
+ FloatRegister srcDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void fmaFloat64x2(FloatRegister src1, FloatRegister src2,
+- FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
++ FloatRegister srcDest) DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void fnmaFloat64x2(FloatRegister src1, FloatRegister src2,
+ FloatRegister srcDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minFloat32x4Relaxed(FloatRegister src, FloatRegister srcDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxFloat32x4Relaxed(FloatRegister src, FloatRegister srcDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minFloat64x2Relaxed(FloatRegister src, FloatRegister srcDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void minFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxFloat64x2Relaxed(FloatRegister src, FloatRegister srcDest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void maxFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ inline void q15MulrInt16x8Relaxed(FloatRegister lhs, FloatRegister rhs,
+ FloatRegister dest)
+- DEFINED_ON(x86_shared, arm64);
++ DEFINED_ON(x86_shared, arm64, ppc64);
+
+ public:
+ // ========================================================================
+@@ -3717,10 +3724,10 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ // temp required on x86 and x64; must be undefined on mips64 and loong64.
+ void convertUInt64ToFloat32(Register64 src, FloatRegister dest, Register temp)
+- DEFINED_ON(arm64, mips64, loong64, riscv64, wasm32, x64, x86);
++ DEFINED_ON(arm64, mips64, loong64, ppc64, riscv64, wasm32, x64, x86);
+
+ void convertInt64ToFloat32(Register64 src, FloatRegister dest)
+- DEFINED_ON(arm64, mips64, loong64, riscv64, wasm32, x64, x86);
++ DEFINED_ON(arm64, mips64, loong64, ppc64, riscv64, wasm32, x64, x86);
+
+ bool convertUInt64ToDoubleNeedsTemp() PER_ARCH;
+
+@@ -3801,16 +3808,16 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // Scalar::Int64.
+ void wasmLoad(const wasm::MemoryAccessDesc& access, Register memoryBase,
+ Register ptr, Register ptrScratch, AnyRegister output)
+- DEFINED_ON(arm, loong64, riscv64, mips64);
++ DEFINED_ON(arm, loong64, riscv64, mips64, ppc64);
+ void wasmLoadI64(const wasm::MemoryAccessDesc& access, Register memoryBase,
+ Register ptr, Register ptrScratch, Register64 output)
+- DEFINED_ON(arm, mips64, loong64, riscv64);
++ DEFINED_ON(arm, mips64, loong64, riscv64, ppc64);
+ void wasmStore(const wasm::MemoryAccessDesc& access, AnyRegister value,
+ Register memoryBase, Register ptr, Register ptrScratch)
+- DEFINED_ON(arm, loong64, riscv64, mips64);
++ DEFINED_ON(arm, loong64, riscv64, mips64, ppc64);
+ void wasmStoreI64(const wasm::MemoryAccessDesc& access, Register64 value,
+ Register memoryBase, Register ptr, Register ptrScratch)
+- DEFINED_ON(arm, mips64, loong64, riscv64);
++ DEFINED_ON(arm, mips64, loong64, riscv64, ppc64);
+
+ // These accept general memoryBase + ptr + offset (in `access`); the offset is
+ // always smaller than the guard region. They will insert an additional add
+@@ -3889,11 +3896,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void wasmTruncateDoubleToInt64(FloatRegister input, Register64 output,
+ bool isSaturating, Label* oolEntry,
+ Label* oolRejoin, FloatRegister tempDouble)
+- DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++ DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+ void wasmTruncateDoubleToUInt64(FloatRegister input, Register64 output,
+ bool isSaturating, Label* oolEntry,
+ Label* oolRejoin, FloatRegister tempDouble)
+- DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++ DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+ void oolWasmTruncateCheckF64ToI64(FloatRegister input, Register64 output,
+ TruncFlags flags,
+ const wasm::TrapSiteDesc& trapSiteDesc,
+@@ -3902,11 +3909,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void wasmTruncateFloat32ToInt64(FloatRegister input, Register64 output,
+ bool isSaturating, Label* oolEntry,
+ Label* oolRejoin, FloatRegister tempDouble)
+- DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++ DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+ void wasmTruncateFloat32ToUInt64(FloatRegister input, Register64 output,
+ bool isSaturating, Label* oolEntry,
+ Label* oolRejoin, FloatRegister tempDouble)
+- DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
++ DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
+ void oolWasmTruncateCheckF32ToI64(FloatRegister input, Register64 output,
+ TruncFlags flags,
+ const wasm::TrapSiteDesc& trapSiteDesc,
+@@ -4220,7 +4227,8 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ // convention, which requires predictable high bits. In practice, this means
+ // that the 32-bit value will be zero-extended or sign-extended to 64 bits as
+ // appropriate for the platform.
+- void widenInt32(Register r) DEFINED_ON(arm64, x64, mips64, loong64, riscv64);
++ void widenInt32(Register r)
++ DEFINED_ON(arm64, x64, mips64, loong64, riscv64, ppc64);
+
+ // As enterFakeExitFrame(), but using register conventions appropriate for
+ // wasm stubs.
+@@ -4287,13 +4295,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ const Address& mem, Register expected,
+ Register replacement, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void compareExchange(Scalar::Type type, Synchronization sync,
+ const BaseIndex& mem, Register expected,
+ Register replacement, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ // x86: `expected` and `output` must be edx:eax; `replacement` is ecx:ebx.
+ // x64: `output` must be rax.
+@@ -4303,12 +4311,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void compareExchange64(Synchronization sync, const Address& mem,
+ Register64 expected, Register64 replacement,
+ Register64 output)
+- DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+
+ void compareExchange64(Synchronization sync, const BaseIndex& mem,
+ Register64 expected, Register64 replacement,
+ Register64 output)
+- DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+
+ // Exchange with memory. Return the value initially in memory.
+ // MIPS: `valueTemp`, `offsetTemp` and `maskTemp` must be defined for 8-bit
+@@ -4325,12 +4333,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void atomicExchange(Scalar::Type type, Synchronization sync,
+ const Address& mem, Register value, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicExchange(Scalar::Type type, Synchronization sync,
+ const BaseIndex& mem, Register value, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ // x86: `value` must be ecx:ebx; `output` must be edx:eax.
+ // ARM: `value` and `output` must be distinct and (even,odd) pairs.
+@@ -4338,11 +4346,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ void atomicExchange64(Synchronization sync, const Address& mem,
+ Register64 value, Register64 output)
+- DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+
+ void atomicExchange64(Synchronization sync, const BaseIndex& mem,
+ Register64 value, Register64 output)
+- DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
+
+ // Read-modify-write with memory. Return the value in memory before the
+ // operation.
+@@ -4376,12 +4384,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void atomicFetchOp(Scalar::Type type, Synchronization sync, AtomicOp op,
+ Register value, const Address& mem, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicFetchOp(Scalar::Type type, Synchronization sync, AtomicOp op,
+ Register value, const BaseIndex& mem, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ // x86:
+ // `temp` must be ecx:ebx; `output` must be edx:eax.
+@@ -4395,7 +4403,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ void atomicFetchOp64(Synchronization sync, AtomicOp op, Register64 value,
+ const Address& mem, Register64 temp, Register64 output)
+- DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64, ppc64);
+
+ void atomicFetchOp64(Synchronization sync, AtomicOp op, const Address& value,
+ const Address& mem, Register64 temp, Register64 output)
+@@ -4403,7 +4411,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ void atomicFetchOp64(Synchronization sync, AtomicOp op, Register64 value,
+ const BaseIndex& mem, Register64 temp, Register64 output)
+- DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64, ppc64);
+
+ void atomicFetchOp64(Synchronization sync, AtomicOp op, const Address& value,
+ const BaseIndex& mem, Register64 temp, Register64 output)
+@@ -4421,14 +4429,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+
+ void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
+ const Address& mem, Register64 temp)
+- DEFINED_ON(arm, arm64, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64);
+
+ void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
+ const BaseIndex& mem) DEFINED_ON(x64);
+
+ void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
+ const BaseIndex& mem, Register64 temp)
+- DEFINED_ON(arm, arm64, mips64, loong64, riscv64);
++ DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64);
+
+ // 64-bit atomic load. On 64-bit systems, use regular load with
+ // Synchronization::Load, not this method.
+@@ -4481,14 +4489,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ Register replacement, Register valueTemp,
+ Register offsetTemp, Register maskTemp,
+ Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void wasmCompareExchange(const wasm::MemoryAccessDesc& access,
+ const BaseIndex& mem, Register expected,
+ Register replacement, Register valueTemp,
+ Register offsetTemp, Register maskTemp,
+ Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
+ const Address& mem, Register value, Register output)
+@@ -4502,13 +4510,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ const Address& mem, Register value,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
+ const BaseIndex& mem, Register value,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ Register value, const Address& mem, Register temp,
+@@ -4529,13 +4537,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ Register value, const Address& mem, Register valueTemp,
+ Register offsetTemp, Register maskTemp,
+- Register output) DEFINED_ON(mips64, loong64, riscv64);
++ Register output)
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ Register value, const BaseIndex& mem,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp, Register output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ // Read-modify-write with memory. Return no value.
+ //
+@@ -4562,13 +4571,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ Register value, const Address& mem,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ Register value, const BaseIndex& mem,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ // 64-bit wide operations.
+
+@@ -4626,12 +4635,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ Register64 value, const Address& mem,
+ Register64 temp, Register64 output)
+- DEFINED_ON(arm, arm64, mips64, loong64, riscv64, x64);
++ DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64, x64);
+
+ void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ Register64 value, const BaseIndex& mem,
+ Register64 temp, Register64 output)
+- DEFINED_ON(arm, arm64, mips64, loong64, riscv64, x64);
++ DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64, x64);
+
+ void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
+ const Address& value, const Address& mem,
+@@ -4684,14 +4693,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ Register replacement, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register temp,
+ AnyRegister output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void compareExchangeJS(Scalar::Type arrayType, Synchronization sync,
+ const BaseIndex& mem, Register expected,
+ Register replacement, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register temp,
+ AnyRegister output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicExchangeJS(Scalar::Type arrayType, Synchronization sync,
+ const Address& mem, Register value, Register temp,
+@@ -4705,13 +4714,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ const Address& mem, Register value, Register valueTemp,
+ Register offsetTemp, Register maskTemp, Register temp,
+ AnyRegister output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicExchangeJS(Scalar::Type arrayType, Synchronization sync,
+ const BaseIndex& mem, Register value,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp, Register temp, AnyRegister output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicFetchOpJS(Scalar::Type arrayType, Synchronization sync,
+ AtomicOp op, Register value, const Address& mem,
+@@ -4737,13 +4746,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ AtomicOp op, Register value, const Address& mem,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp, Register temp, AnyRegister output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicFetchOpJS(Scalar::Type arrayType, Synchronization sync,
+ AtomicOp op, Register value, const BaseIndex& mem,
+ Register valueTemp, Register offsetTemp,
+ Register maskTemp, Register temp, AnyRegister output)
+- DEFINED_ON(mips64, loong64, riscv64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
+ AtomicOp op, Register value, const Address& mem,
+@@ -4764,12 +4773,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
+ AtomicOp op, Register value, const Address& mem,
+ Register valueTemp, Register offsetTemp,
+- Register maskTemp) DEFINED_ON(mips64, loong64, riscv64);
++ Register maskTemp)
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
+ AtomicOp op, Register value, const BaseIndex& mem,
+ Register valueTemp, Register offsetTemp,
+- Register maskTemp) DEFINED_ON(mips64, loong64, riscv64);
++ Register maskTemp)
++ DEFINED_ON(mips64, loong64, riscv64, ppc64);
+
+ void atomicIsLockFreeJS(Register value, Register output);
+
+@@ -5928,7 +5939,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
+ inline void addStackPtrTo(T t);
+
+ void subFromStackPtr(Imm32 imm32)
+- DEFINED_ON(mips64, loong64, riscv64, wasm32, arm, x86, x64);
++ DEFINED_ON(mips64, loong64, riscv64, ppc64, wasm32, arm, x86, x64);
+ void subFromStackPtr(Register reg);
+
+ template <typename T>
+diff --git a/js/src/jit/MoveEmitter.h b/js/src/jit/MoveEmitter.h
+index 642829c070d6..3a883c596ca0 100644
+--- a/js/src/jit/MoveEmitter.h
++++ b/js/src/jit/MoveEmitter.h
+@@ -17,6 +17,8 @@
+ # include "jit/loong64/MoveEmitter-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/MoveEmitter-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/MoveEmitter-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/MoveEmitter-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/MoveResolver.cpp b/js/src/jit/MoveResolver.cpp
+index d2e1f12700bd..8e622407a0a8 100644
+--- a/js/src/jit/MoveResolver.cpp
++++ b/js/src/jit/MoveResolver.cpp
+@@ -57,6 +57,22 @@ bool MoveResolver::addMove(const MoveOperand& from, const MoveOperand& to,
+ MoveOp::Type type) {
+ // Assert that we're not doing no-op moves.
+ MOZ_ASSERT(!(from == to));
++#ifdef JS_CODEGEN_PPC64
++ // PPC64 FloatRegisters expose Single/Double kinds that have distinct code()
++ // values but share one physical register. The register allocator can emit a
++ // move between two such kind-views of the same FPR (e.g. f2-Double to
++ // f2-Single); these are no-ops on the hardware, are not caught by the
++ // (from == to) assert above, and would otherwise trip the
++ // !from().aliases(to()) invariant the resolver relies on later. Drop them.
++ //
++ // This would be correct for any backend whose FloatRegister has multiple
++ // kinds aliasing one physical register, and could be un-gated if another
++ // such backend needs it, but it is scoped to PPC64 so move resolution on
++ // tier-1 platforms is left unchanged.
++ if (from.aliases(to)) {
++ return true;
++ }
++#endif
+ PendingMove* pm = movePool_.allocate(from, to, type);
+ if (!pm) {
+ return false;
+diff --git a/js/src/jit/RegisterAllocator.h b/js/src/jit/RegisterAllocator.h
+index eda9933f6322..42e48111046a 100644
+--- a/js/src/jit/RegisterAllocator.h
++++ b/js/src/jit/RegisterAllocator.h
+@@ -262,9 +262,10 @@ class RegisterAllocator {
+ public:
+ template <typename TakeableSet>
+ static void takeWasmRegisters(TakeableSet& regs) {
+-#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) || \
+- defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) || \
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++ defined(JS_CODEGEN_PPC64)
+ regs.take(HeapReg);
+ #endif
+ MOZ_ASSERT(!regs.has(FramePointer));
+diff --git a/js/src/jit/Registers.h b/js/src/jit/Registers.h
+index e0d02e2fb60d..423777ce38cd 100644
+--- a/js/src/jit/Registers.h
++++ b/js/src/jit/Registers.h
+@@ -20,6 +20,8 @@
+ # include "jit/loong64/Architecture-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/Architecture-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/Architecture-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/Architecture-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/Safepoints.cpp b/js/src/jit/Safepoints.cpp
+index 42e305f053af..8e3a25c3c5ff 100644
+--- a/js/src/jit/Safepoints.cpp
++++ b/js/src/jit/Safepoints.cpp
+@@ -63,6 +63,11 @@ static void WriteFloatRegisterMask(CompactBufferWriter& stream,
+ stream.writeUnsigned64(bits.low());
+ stream.writeUnsigned64(bits.high());
+ break;
++#elif defined(JS_CODEGEN_PPC64)
++ case 16:
++ stream.writeUnsigned64(static_cast<uint64_t>(bits));
++ stream.writeUnsigned64(static_cast<uint64_t>(bits >> 64));
++ break;
+ #else
+ case 1:
+ stream.writeByte(bits);
+@@ -88,6 +93,12 @@ static FloatRegisters::SetType ReadFloatRegisterMask(
+ uint64_t high = stream.readUnsigned64();
+ return Bitset128(high, low);
+ }
++#elif defined(JS_CODEGEN_PPC64)
++ case 16: {
++ uint64_t low = stream.readUnsigned64();
++ uint64_t high = stream.readUnsigned64();
++ return FloatRegisters::SetType(high) << 64 | FloatRegisters::SetType(low);
++ }
+ #else
+ case 1:
+ return stream.readByte();
+diff --git a/js/src/jit/SharedICHelpers-inl.h b/js/src/jit/SharedICHelpers-inl.h
+index eedccc831732..1005b140f1df 100644
+--- a/js/src/jit/SharedICHelpers-inl.h
++++ b/js/src/jit/SharedICHelpers-inl.h
+@@ -19,6 +19,8 @@
+ # include "jit/loong64/SharedICHelpers-loong64-inl.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/SharedICHelpers-riscv64-inl.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/SharedICHelpers-ppc64-inl.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/SharedICHelpers-wasm32-inl.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/SharedICHelpers.h b/js/src/jit/SharedICHelpers.h
+index 1ebd61e44509..f2703c6f986c 100644
+--- a/js/src/jit/SharedICHelpers.h
++++ b/js/src/jit/SharedICHelpers.h
+@@ -19,6 +19,8 @@
+ # include "jit/loong64/SharedICHelpers-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/SharedICHelpers-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/SharedICHelpers-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/SharedICHelpers-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/SharedICRegisters.h b/js/src/jit/SharedICRegisters.h
+index c3ab86bf0a82..5b270d0c166a 100644
+--- a/js/src/jit/SharedICRegisters.h
++++ b/js/src/jit/SharedICRegisters.h
+@@ -19,6 +19,8 @@
+ # include "jit/loong64/SharedICRegisters-loong64.h"
+ #elif defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/SharedICRegisters-riscv64.h"
++#elif defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/SharedICRegisters-ppc64.h"
+ #elif defined(JS_CODEGEN_WASM32)
+ # include "jit/wasm32/SharedICRegisters-wasm32.h"
+ #elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/jit/Simulator.h b/js/src/jit/Simulator.h
+index 39503716f10d..9f60baf53198 100644
+--- a/js/src/jit/Simulator.h
++++ b/js/src/jit/Simulator.h
+@@ -15,6 +15,8 @@
+ # include "jit/loong64/Simulator-loong64.h"
+ #elif defined(JS_SIMULATOR_RISCV64)
+ # include "jit/riscv64/Simulator-riscv64.h"
++#elif defined(JS_SIMULATOR_PPC64)
++# include "jit/ppc64/Simulator-ppc64.h"
+ #elif defined(JS_SIMULATOR)
+ # error "Unexpected simulator platform"
+ #endif
+diff --git a/js/src/jit/moz.build b/js/src/jit/moz.build
+index 5b5df3e5b7b2..36ef65d6221a 100644
+--- a/js/src/jit/moz.build
++++ b/js/src/jit/moz.build
+@@ -228,6 +228,18 @@ elif CONFIG["JS_CODEGEN_LOONG64"]:
+ ]
+ if CONFIG["JS_SIMULATOR_LOONG64"]:
+ UNIFIED_SOURCES += ["loong64/Simulator-loong64.cpp"]
++elif CONFIG["JS_CODEGEN_PPC64"]:
++ UNIFIED_SOURCES += [
++ "ppc64/Architecture-ppc64.cpp",
++ "ppc64/Assembler-ppc64.cpp",
++ "ppc64/CodeGenerator-ppc64.cpp",
++ "ppc64/Lowering-ppc64.cpp",
++ "ppc64/MacroAssembler-ppc64.cpp",
++ "ppc64/MoveEmitter-ppc64.cpp",
++ "ppc64/Trampoline-ppc64.cpp",
++ ]
++ if CONFIG["JS_SIMULATOR_PPC64"]:
++ UNIFIED_SOURCES += ["ppc64/Simulator-ppc64.cpp"]
+ elif CONFIG["JS_CODEGEN_RISCV64"]:
+ UNIFIED_SOURCES += [
+ "riscv64/Architecture-riscv64.cpp",
+diff --git a/js/src/jit/ppc64/Architecture-ppc64.cpp b/js/src/jit/ppc64/Architecture-ppc64.cpp
+new file mode 100644
+index 000000000000..5632865556ac
+--- /dev/null
++++ b/js/src/jit/ppc64/Architecture-ppc64.cpp
+@@ -0,0 +1,221 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Architecture-ppc64.h"
++
++#ifndef JS_SIMULATOR
++# include <sys/auxv.h>
++#endif
++
++#include "jit/FlushICache.h" // js::jit::FlushICache
++#include "jit/RegisterSets.h"
++
++namespace js {
++namespace jit {
++
++Registers::Code Registers::FromName(const char* name) {
++ for (size_t i = 0; i < Total; i++) {
++ if (strcmp(GetName(i), name) == 0) {
++ return Code(i);
++ }
++ }
++
++ return Invalid;
++}
++
++FloatRegisters::Code FloatRegisters::FromName(const char* name) {
++ for (size_t i = 0; i < Total; i++) {
++ if (strcmp(GetName(i), name) == 0) {
++ return Code(i);
++ }
++ }
++
++ return Invalid;
++}
++
++FloatRegisterSet FloatRegister::ReduceSetForPush(const FloatRegisterSet& s) {
++ SetType all = s.bits();
++ SetType simd128Set =
++ (all >> (uint32_t(FloatRegisters::Simd128) * FloatRegisters::TotalPhys)) &
++ FloatRegisters::AllPhysMask;
++ SetType doubleSet =
++ (all >> (uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys)) &
++ FloatRegisters::AllPhysMask;
++ SetType singleSet =
++ (all >> (uint32_t(FloatRegisters::Single) * FloatRegisters::TotalPhys)) &
++ FloatRegisters::AllPhysMask;
++
++ // Single+Double share physical FPRs (push as Double, 8-byte slot);
++ // Simd128 lives in its own physical VRs (push as Simd128, 16-byte
++ // slot). Different physical pools — no dedup. Note that
++ // sizeof(FloatRegisters::RegisterContent) is 8 bytes (no v128 in the
++ // union), so RegisterDump::FPUArray is 32 × 8 = 256 bytes, matching
++ // the Float-only layout PushRegsInMask produces.
++ SetType set64 = singleSet | doubleSet;
++
++ SetType reduced =
++ (simd128Set << (uint32_t(FloatRegisters::Simd128) *
++ FloatRegisters::TotalPhys)) |
++ (set64 << (uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys));
++ return FloatRegisterSet(reduced);
++}
++
++uint32_t FloatRegister::GetPushSizeInBytes(const FloatRegisterSet& s) {
++ SetType all = s.bits();
++ SetType simd128Set =
++ (all >> (uint32_t(FloatRegisters::Simd128) * FloatRegisters::TotalPhys)) &
++ FloatRegisters::AllPhysMask;
++ SetType doubleSet =
++ (all >> (uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys)) &
++ FloatRegisters::AllPhysMask;
++ SetType singleSet =
++ (all >> (uint32_t(FloatRegisters::Single) * FloatRegisters::TotalPhys)) &
++ FloatRegisters::AllPhysMask;
++
++ // Natural per-kind slot sizes. See ReduceSetForPush comment.
++ SetType set64 = singleSet | doubleSet;
++
++ uint32_t count64 = std::popcount(static_cast<uint64_t>(set64));
++ uint32_t count128 = std::popcount(static_cast<uint64_t>(simd128Set));
++
++ return count64 * sizeof(double) + count128 * 16;
++}
++
++uint32_t FloatRegister::getRegisterDumpOffsetInBytes() {
++ // Simd128 encoding is 32-63 — mask back to 0-31 for the FPUArray-
++ // relative offset. (FPUArray has 32 slots; Simd128 should never be in
++ // a SafepointState/BailoutState anyway.)
++ return (encoding() & 31) * sizeof(FloatRegisters::RegisterContent);
++}
++
++static bool sPOWER9Detected = false;
++static bool sPOWER10Detected = false;
++static bool sCPUFlagsComputed = false;
++
++#ifndef JS_SIMULATOR
++// Cache line sizes, detected at startup from ELF auxiliary vector.
++// Fallback to 32 bytes (safe minimum per LuaJIT/LLVM compiler-rt).
++static size_t sDCacheLineSize = 0;
++static size_t sICacheLineSize = 0;
++#endif
++
++void PPC64Flags::Init() {
++ if (sCPUFlagsComputed) {
++ return;
++ }
++#ifndef JS_SIMULATOR
++ unsigned long hwcap2 = getauxval(AT_HWCAP2);
++ // PPC_FEATURE2_ARCH_3_00 = 0x00800000 (ISA 3.0 / POWER9)
++ sPOWER9Detected = (hwcap2 & 0x00800000) != 0;
++ // PPC_FEATURE2_ARCH_3_1 = 0x00040000 (ISA 3.1 / POWER10)
++ sPOWER10Detected = (hwcap2 & 0x00040000) != 0;
++ // Allow forcing POWER8 mode for testing: MOZ_PPC64_FORCE_POWER8=1.
++ // P10 implies P9; downgrade clears both.
++ const char* forceP8 = getenv("MOZ_PPC64_FORCE_POWER8");
++ if (forceP8 && forceP8[0] == '1') {
++ sPOWER9Detected = false;
++ sPOWER10Detected = false;
++ }
++
++ size_t dcache = getauxval(AT_DCACHEBSIZE);
++ size_t icache = getauxval(AT_ICACHEBSIZE);
++ sDCacheLineSize = dcache ? dcache : 32;
++ sICacheLineSize = icache ? icache : 32;
++#endif
++ // FORCE_POWER9/10 opt into the corresponding ISA fast paths. Useful under
++ // the simulator; on real silicon below the gated level they are foot-guns
++ // because the CPU will trap on undefined ops. Outside the JS_SIMULATOR
++ // guard so the sim can opt in via env.
++ //
++ // FORCE_POWER10 also implies FORCE_POWER9 — this matches what real-P10
++ // silicon advertises in hwcap2 (both ARCH_3_00 and ARCH_3_1 bits set), so
++ // we don't ask sim users to pass both vars separately.
++ const char* forceP9 = getenv("MOZ_PPC64_FORCE_POWER9");
++ if (forceP9 && forceP9[0] == '1') {
++ sPOWER9Detected = true;
++ }
++ const char* forceP10 = getenv("MOZ_PPC64_FORCE_POWER10");
++ if (forceP10 && forceP10[0] == '1') {
++ sPOWER10Detected = true;
++ sPOWER9Detected = true;
++ }
++ sCPUFlagsComputed = true;
++}
++
++bool HasPOWER9() {
++ MOZ_ASSERT(sCPUFlagsComputed);
++ return sPOWER9Detected;
++}
++
++bool HasPOWER10() {
++ MOZ_ASSERT(sCPUFlagsComputed);
++ return sPOWER10Detected;
++}
++
++bool CPUFlagsHaveBeenComputed() { return sCPUFlagsComputed; }
++
++// Per-bit feature flags packed into the wasm code signature. Adding a
++// new bit (e.g., POWER10, VSX4) should be a 1-line change here plus a
++// corresponding HasPOWER10()/IsVSX4Available() probe above. The value
++// is also assert-checked into a fixed-width field in
++// js/src/wasm/WasmCompile.cpp — if that field ever overflows, widen
++// it there before landing more bits here.
++uint32_t GetPPC64Flags() {
++ uint32_t flags = 0;
++ if (sPOWER9Detected) {
++ flags |= PPC64Flag_POWER9;
++ }
++ return flags;
++}
++
++void FlushICache(void* code, size_t size) {
++#if defined(JS_SIMULATOR)
++ js::jit::SimulatorProcess::FlushICache(code, size);
++#else
++ // PPC64 has incoherent I/D caches. GCC's __builtin___clear_cache is a
++ // no-op on PPC64 Linux, so we implement the flush explicitly.
++ // This follows the same approach as QEMU (util/cacheflush.c) and the
++ // Linux kernel (arch/powerpc/mm/cacheflush.c):
++ // dcbst loop -> sync -> icbi loop -> sync -> isync
++ if (!size) {
++ return;
++ }
++ MOZ_ASSERT(sCPUFlagsComputed,
++ "PPC64Flags::Init must run before any FlushICache call");
++
++ uintptr_t start = reinterpret_cast<uintptr_t>(code);
++ uintptr_t end = start + size;
++
++ // Step 1: Write back data cache to memory.
++ for (uintptr_t addr = start & ~(sDCacheLineSize - 1); addr < end;
++ addr += sDCacheLineSize) {
++ asm volatile("dcbst 0, %0" : : "r"(addr) : "memory");
++ }
++ asm volatile("sync" ::: "memory");
++
++ // Step 2: Invalidate instruction cache.
++ for (uintptr_t addr = start & ~(sICacheLineSize - 1); addr < end;
++ addr += sICacheLineSize) {
++ asm volatile("icbi 0, %0" : : "r"(addr) : "memory");
++ }
++ // The extra sync before isync matches the Linux kernel and QEMU.
++ // It ensures all icbi operations complete before the pipeline flush.
++ asm volatile("sync" ::: "memory");
++ asm volatile("isync" ::: "memory");
++#endif
++}
++
++void FlushExecutionContext() {
++#if !defined(JS_SIMULATOR)
++ // PPC64's isync flushes the instruction pipeline on the current core,
++ // ensuring any previously invalidated icache entries are discarded and
++ // instructions are re-fetched from coherent memory.
++ asm volatile("isync" ::: "memory");
++#endif
++}
++
++} // namespace jit
++} // namespace js
+diff --git a/js/src/jit/ppc64/Architecture-ppc64.h b/js/src/jit/ppc64/Architecture-ppc64.h
+new file mode 100644
+index 000000000000..efaab0b0c854
+--- /dev/null
++++ b/js/src/jit/ppc64/Architecture-ppc64.h
+@@ -0,0 +1,581 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Architecture_ppc64_h
++#define jit_ppc64_Architecture_ppc64_h
++
++#include <algorithm>
++#include <bit>
++
++#include "jit/shared/Architecture-shared.h"
++
++#include "js/Utility.h"
++
++namespace js {
++namespace jit {
++
++// PPC64 has 32 64-bit general purpose registers, r0 through r31.
++// The program counter is not directly accessible as a register.
++// The link register (LR) and count register (CTR) are SPRs.
++
++// PPC64 ELFv2 GPR Convention:
++// Name Usage
++// r0 Volatile, cannot be base register in load/store
++// r1 Stack pointer (callee-saved)
++// r2 TOC pointer (reserved)
++// r3 Return value / first argument
++// r4-r10 Arguments 2-8
++// r11 Environment pointer / scratch
++// r12 Branch target / scratch
++// r13 Thread pointer (reserved, TLS)
++// r14-r31 Callee-saved
++
++// PPC64 ELFv2 FPR Convention:
++// f0 Scratch
++// f1-f13 Arguments / volatile
++// f14-f31 Callee-saved
++
++class Registers {
++ public:
++ enum RegisterID {
++ r0 = 0,
++ r1,
++ r2,
++ r3,
++ r4,
++ r5,
++ r6,
++ r7,
++ r8,
++ r9,
++ r10,
++ r11,
++ r12,
++ r13,
++ r14,
++ r15,
++ r16,
++ r17,
++ r18,
++ r19,
++ r20,
++ r21,
++ r22,
++ r23,
++ r24,
++ r25,
++ r26,
++ r27,
++ r28,
++ r29,
++ r30,
++ r31,
++ sp = r1,
++ invalid_reg,
++ };
++ typedef uint8_t Code;
++ typedef RegisterID Encoding;
++ typedef uint32_t SetType;
++
++ static const Encoding StackPointer = sp;
++ static const Encoding Invalid = invalid_reg;
++
++ union RegisterContent {
++ uintptr_t r;
++ };
++
++ static uint32_t SetSize(SetType x) { return std::popcount(x); }
++ static uint32_t FirstBit(SetType x) {
++ MOZ_ASSERT(x);
++ return std::countr_zero(x);
++ }
++ static uint32_t LastBit(SetType x) {
++ MOZ_ASSERT(x);
++ return std::bit_width(x) - 1;
++ }
++
++ static const char* GetName(uint32_t code) {
++ static const char* const Names[] = {
++ "r0", "sp", "r2", "r3", "r4", "r5", "r6", "r7",
++ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
++ "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
++ "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"};
++ static_assert(Total == std::size(Names), "Table is the correct size");
++ if (code >= Total) {
++ return "invalid";
++ }
++ return Names[code];
++ }
++
++ static Code FromName(const char* name);
++
++ static const uint32_t Total = 32;
++ static const uint32_t TotalPhys = 32;
++ static const uint32_t Allocatable = 24;
++
++ static const SetType AllMask = 0xFFFFFFFF;
++ static const SetType NoneMask = 0x0;
++
++ static const SetType ArgRegMask =
++ (1U << Registers::r3) | (1U << Registers::r4) | (1U << Registers::r5) |
++ (1U << Registers::r6) | (1U << Registers::r7) | (1U << Registers::r8) |
++ (1U << Registers::r9) | (1U << Registers::r10);
++
++ // r0, r11, r12 are also volatile but handled separately.
++ static const SetType VolatileMask = ArgRegMask;
++
++ // ELFv2 callee-saved GPRs are r14..r31. r2 (TOC) and r13 (TLS) are
++ // dedicated registers, NOT general callee-saved: r2 is restored by the
++ // PLT-call linkage convention (`ld r2, 24(r1)` after every cross-module
++ // call); r13 is the thread pointer and must NEVER be written. Including
++ // them here previously made `PushRegsInMask(NonVolatileMask)` save and
++ // restore them — wasted 16 bytes per wasm-stub frame at best, latent
++ // TLS corruption if save/restore were ever misordered. Verified that
++ // no JIT-emitted code writes r2 or r13 (both are NonAllocatable, and
++ // grep across js/src/jit/ppc64/ finds no `as_*` site assigning to
++ // them), so they're preserved across the JIT body for free.
++ static const SetType NonVolatileMask =
++ (1U << Registers::r14) |
++ (1U << Registers::r15) | (1U << Registers::r16) | (1U << Registers::r17) |
++ (1U << Registers::r18) | (1U << Registers::r19) | (1U << Registers::r20) |
++ (1U << Registers::r21) | (1U << Registers::r22) | (1U << Registers::r23) |
++ (1U << Registers::r24) | (1U << Registers::r25) | (1U << Registers::r26) |
++ (1U << Registers::r27) | (1U << Registers::r28) | (1U << Registers::r29) |
++ (1U << Registers::r30) | (1U << Registers::r31);
++
++ static const SetType NonAllocatableMask =
++ (1U << Registers::r0) | // Cannot be base in load/store.
++ (1U << Registers::sp) | // Stack pointer.
++ (1U << Registers::r2) | // TOC pointer (ELFv2).
++ (1U << Registers::r11) | // Third scratch.
++ (1U << Registers::r12) | // Second scratch / addressTempRegister.
++ (1U << Registers::r13) | // Thread-local storage (ELFv2).
++ (1U << Registers::r16) | // Saved scratch register.
++ (1U << Registers::r31); // Frame pointer.
++
++ static const SetType WrapperMask = VolatileMask;
++
++ // Registers returned from a JS -> JS call.
++ static const SetType JSCallMask = (1U << Registers::r5);
++
++ // Registers returned from a JS -> C call.
++ static const SetType CallMask = (1U << Registers::r3);
++
++ static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
++};
++
++typedef uint32_t PackedRegisterMask;
++
++template <typename T>
++class TypedRegisterSet;
++
++class FloatRegisters {
++ public:
++ enum FPRegisterID {
++ f0 = 0,
++ f1,
++ f2,
++ f3,
++ f4,
++ f5,
++ f6,
++ f7,
++ f8,
++ f9,
++ f10,
++ f11,
++ f12,
++ f13,
++ f14,
++ f15,
++ f16,
++ f17,
++ f18,
++ f19,
++ f20,
++ f21,
++ f22,
++ f23,
++ f24,
++ f25,
++ f26,
++ f27,
++ f28,
++ f29,
++ f30,
++ f31,
++ };
++
++ // Eight bits: (invalid << 7) | (kind << 5) | encoding
++ typedef uint8_t Code;
++ typedef FPRegisterID Encoding;
++ // 3 kinds × 32 regs = 96 bits needed. Use __uint128_t.
++ typedef __uint128_t SetType;
++
++ enum Kind : uint8_t { Double, Single, Simd128, NumTypes };
++
++ static constexpr Code Invalid = 0x80;
++
++ static const char* GetName(uint32_t code) {
++ static const char* const Names[] = {
++ "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
++ "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
++ "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
++ "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"};
++ static_assert(TotalPhys == std::size(Names), "Table is the correct size");
++ if (code >= Total) {
++ return "invalid";
++ }
++ return Names[code % TotalPhys];
++ }
++
++ static Code FromName(const char* name);
++
++ static const uint32_t TotalPhys = 32;
++ static const uint32_t Total = TotalPhys * NumTypes;
++ static const uint32_t Allocatable = 31; // Without f0, the scratch register.
++
++ static_assert(sizeof(SetType) * 8 >= Total,
++ "SetType should be large enough to enumerate all registers.");
++
++ static const SetType SpreadSingle = SetType(1)
++ << (uint32_t(Single) * TotalPhys);
++ static const SetType SpreadDouble = SetType(1)
++ << (uint32_t(Double) * TotalPhys);
++ static const SetType SpreadSimd128 = SetType(1)
++ << (uint32_t(Simd128) * TotalPhys);
++ static const SetType Spread = SpreadSingle | SpreadDouble | SpreadSimd128;
++
++ static const SetType AllPhysMask = ((SetType(1) << TotalPhys) - 1);
++ static const SetType AllMask = AllPhysMask * Spread;
++ static const SetType AllSingleMask = AllPhysMask * SpreadSingle;
++ static const SetType AllDoubleMask = AllPhysMask * SpreadDouble;
++ static const SetType AllSimd128Mask = AllPhysMask * SpreadSimd128;
++ static const SetType NoneMask = SetType(0);
++
++ // ELFv2: f14-f31 are non-volatile (callee-saved) for scalar FP.
++ // The upper 64 bits of VSR 0-31 are volatile, so Simd128 view is all-volatile.
++ static const SetType NonVolatilePhysMask =
++ SetType((1U << FloatRegisters::f14) | (1U << FloatRegisters::f15) |
++ (1U << FloatRegisters::f16) | (1U << FloatRegisters::f17) |
++ (1U << FloatRegisters::f18) | (1U << FloatRegisters::f19) |
++ (1U << FloatRegisters::f20) | (1U << FloatRegisters::f21) |
++ (1U << FloatRegisters::f22) | (1U << FloatRegisters::f23) |
++ (1U << FloatRegisters::f24) | (1U << FloatRegisters::f25) |
++ (1U << FloatRegisters::f26) | (1U << FloatRegisters::f27) |
++ (1U << FloatRegisters::f28) | (1U << FloatRegisters::f29) |
++ (1U << FloatRegisters::f30) | (1U << FloatRegisters::f31));
++ // Simd128 lives in VR-namespace (VSR32-63 = VR0-VR31). Per ELFv2 ABI,
++ // VR20-VR31 are non-volatile (callee-saved). Encoding storage is 20-31
++ // with kind=Simd128.
++ static const SetType SimdNonVolatilePhysMask =
++ SetType((1U << 20) | (1U << 21) | (1U << 22) | (1U << 23) |
++ (1U << 24) | (1U << 25) | (1U << 26) | (1U << 27) |
++ (1U << 28) | (1U << 29) | (1U << 30) | (1U << 31));
++ static const SetType NonVolatileMask =
++ NonVolatilePhysMask * (SpreadSingle | SpreadDouble) |
++ SimdNonVolatilePhysMask * SpreadSimd128;
++
++ static const SetType VolatileMask = AllMask & ~NonVolatileMask;
++
++ static const SetType WrapperMask = VolatileMask;
++
++ // f0 is the scratch register (all three views: single, double, simd128).
++ static const SetType NonAllocatableMask =
++ (SetType(1) << FloatRegisters::f0) * Spread;
++
++ static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
++
++ union RegisterContent {
++ float s;
++ double d;
++ // No v128 here. Simd128 lives in physically-distinct VRs (VSR32-63)
++ // and never reaches RegisterDump (asserted by SafepointState; bailout
++ // AllRegs excludes Simd128). With v128 in the union, sizeof was 16,
++ // forcing PushRegsInMask to a 16-byte stride that mismatched
++ // addressOfRegister's 8-byte walk via (*iter).size().
++ };
++
++ static constexpr Encoding encoding(Code c) { return Encoding(c & 31); }
++
++ static constexpr Kind kind(Code c) { return Kind((c >> 5) & 3); }
++
++ static constexpr Code fromParts(uint32_t encoding, uint32_t kind,
++ uint32_t invalid) {
++ return Code((invalid << 7) | (kind << 5) | encoding);
++ }
++};
++
++// SpillSlotSize must fit the widest register class (Simd128 = 16 bytes).
++// We can't derive from sizeof(FloatRegisters::RegisterContent) — that
++// union is sized for FPRs only (8 bytes since v128 lives in distinct
++// VRs, not in the FPR union), so deriving would under-reserve for
++// Simd128 cycle breaks. SpillSlotSize is consumed only by MoveEmitter
++// and is not part of the JIT frame layout.
++static const uint32_t SpillSlotSize = 16;
++
++// PPC64 ELFv2 ABI: the callee saves LR at [caller_SP+16], CR at
++// [caller_SP+8], and may save TOC at [caller_SP+24]. Reserve 32 bytes
++// (the minimum ELFv2 stack frame) as a shadow area for every ABI call.
++static constexpr uint32_t ShadowStackSpace = 32;
++static const uint32_t SizeOfReturnAddressAfterCall = 0;
++
++// PPC64 branch instructions have a 26-bit signed offset field, giving a
++// range of +/- 32MB. We reduce this to leave room for jump island insertion.
++static constexpr uint32_t JumpImmediateRange = (32 * 1024 * 1024) - 32;
++
++// Size of each bailout table entry (a single bl instruction).
++static const uint32_t BAILOUT_TABLE_ENTRY_SIZE = 4;
++
++// PPC64 special purpose registers (not exposed to the allocator).
++enum SPRegisterID {
++ spr_xer = 1,
++ spr_lr = 8,
++ spr_ctr = 9,
++ spr_vrsave = 256,
++ invalid_spreg
++};
++
++// PPC64 condition registers.
++enum CRegisterID { cr0 = 0, cr1, cr5 = 5, cr6, cr7, invalid_creg };
++
++struct FloatRegister {
++ typedef FloatRegisters Codes;
++ typedef size_t Code;
++ typedef Codes::Encoding Encoding;
++ typedef Codes::SetType SetType;
++
++ static uint32_t SetSize(SetType x) {
++ // Fold all 3 kinds (Single, Double, Simd128) down to physical mask.
++ SetType phys = (x & FloatRegisters::AllPhysMask) |
++ ((x >> FloatRegisters::TotalPhys) & FloatRegisters::AllPhysMask) |
++ ((x >> (2 * FloatRegisters::TotalPhys)) & FloatRegisters::AllPhysMask);
++ return std::popcount(static_cast<uint64_t>(phys));
++ }
++
++ // __uint128_t helpers for FirstBit/LastBit.
++ static uint32_t FirstBit(SetType x) {
++ MOZ_ASSERT(x);
++ uint64_t lo = static_cast<uint64_t>(x);
++ if (lo) {
++ return std::countr_zero(lo);
++ }
++ return 64 + std::countr_zero(static_cast<uint64_t>(x >> 64));
++ }
++ static uint32_t LastBit(SetType x) {
++ MOZ_ASSERT(x);
++ uint64_t hi = static_cast<uint64_t>(x >> 64);
++ if (hi) {
++ return 64 + (std::bit_width(hi) - 1);
++ }
++ return std::bit_width(static_cast<uint64_t>(x)) - 1;
++ }
++
++ private:
++ uint8_t encoding_;
++ uint8_t kind_;
++ bool invalid_;
++
++ typedef Codes::Kind Kind;
++
++ public:
++ constexpr FloatRegister(Encoding encoding, Kind kind)
++ : encoding_(encoding), kind_(kind), invalid_(false) {}
++
++ constexpr FloatRegister()
++ : encoding_(0), kind_(FloatRegisters::Double), invalid_(true) {}
++
++ static FloatRegister FromCode(uint32_t i) {
++ MOZ_ASSERT(i < Codes::Total);
++ return FloatRegister(FloatRegisters::encoding(i), FloatRegisters::kind(i));
++ }
++
++ bool isSingle() const {
++ MOZ_ASSERT(!invalid_);
++ return kind_ == FloatRegisters::Single;
++ }
++ bool isDouble() const {
++ MOZ_ASSERT(!invalid_);
++ return kind_ == FloatRegisters::Double;
++ }
++ bool isSimd128() const {
++ MOZ_ASSERT(!invalid_);
++ return kind_ == FloatRegisters::Simd128;
++ }
++ bool isInvalid() const { return invalid_; }
++
++ FloatRegister asSingle() const {
++ MOZ_ASSERT(!invalid_);
++ return FloatRegister(Encoding(encoding_), FloatRegisters::Single);
++ }
++ FloatRegister asDouble() const {
++ MOZ_ASSERT(!invalid_);
++ return FloatRegister(Encoding(encoding_), FloatRegisters::Double);
++ }
++ FloatRegister asSimd128() const {
++ MOZ_ASSERT(!invalid_);
++ return FloatRegister(Encoding(encoding_), FloatRegisters::Simd128);
++ }
++
++ constexpr uint32_t size() const {
++ MOZ_ASSERT(!invalid_);
++ if (kind_ == FloatRegisters::Double) {
++ return sizeof(double);
++ }
++ if (kind_ == FloatRegisters::Single) {
++ return sizeof(float);
++ }
++ MOZ_ASSERT(kind_ == FloatRegisters::Simd128);
++ return 16;
++ }
++
++ constexpr Code code() const {
++ return Codes::fromParts(encoding_, kind_, invalid_);
++ }
++
++ constexpr Encoding encoding() const {
++ MOZ_ASSERT(!invalid_);
++ // Simd128 lives in VR-namespace at VSR32-63 (= VR0-31). Single/Double
++ // share FPR namespace at VSR0-31. The unified XX-form encoders split
++ // the result into low-5-bit VRT/VRA/VRB + TX/AX/BX bits; VMX
++ // FloatRegister-taking encoders mask with `& 31` for the raw VR
++ // field. So 32+E flows correctly through both paths.
++ return Encoding(encoding_ +
++ (kind_ == FloatRegisters::Simd128 ? 32 : 0));
++ }
++
++ const char* name() const { return FloatRegisters::GetName(code()); }
++ bool volatile_() const {
++ MOZ_ASSERT(!invalid_);
++ return !!((SetType(1) << code()) & FloatRegisters::VolatileMask);
++ }
++ constexpr bool operator!=(FloatRegister other) const {
++ return code() != other.code();
++ }
++ constexpr bool operator==(FloatRegister other) const {
++ return code() == other.code();
++ }
++
++ bool aliases(FloatRegister other) const {
++ // Register-class partition: {Single, Double} share FPRs (VSR0-31);
++ // Simd128 lives in VR-namespace (VSR32-63). FPR f5 (Single/Double
++ // encoding 5) and VR v5 (Simd128 encoding 5) are distinct physical
++ // registers.
++ if (encoding_ != other.encoding_) return false;
++ bool selfSimd = (kind_ == FloatRegisters::Simd128);
++ bool otherSimd = (other.kind_ == FloatRegisters::Simd128);
++ return selfSimd == otherSimd;
++ }
++ bool equiv(FloatRegister other) const {
++ MOZ_ASSERT(!invalid_);
++ return kind_ == other.kind_;
++ }
++
++ uint32_t numAliased() const {
++ return (kind_ == FloatRegisters::Simd128) ? 1 : 2;
++ }
++ uint32_t numAlignedAliased() { return numAliased(); }
++
++ FloatRegister aliased(uint32_t aliasIdx) {
++ MOZ_ASSERT(!invalid_);
++ MOZ_ASSERT(aliasIdx < numAliased());
++ if (kind_ == FloatRegisters::Simd128) {
++ return *this;
++ }
++ Kind otherKind = (kind_ == FloatRegisters::Single)
++ ? FloatRegisters::Double
++ : FloatRegisters::Single;
++ Kind selectedKind = (aliasIdx == 0) ? Kind(kind_) : otherKind;
++ return FloatRegister(Encoding(encoding_), selectedKind);
++ }
++ FloatRegister alignedAliased(uint32_t aliasIdx) {
++ MOZ_ASSERT(aliasIdx < numAliased());
++ return aliased(aliasIdx);
++ }
++ SetType alignedOrDominatedAliasedSet() const {
++ if (kind_ == FloatRegisters::Simd128) {
++ return SetType(1) << ((uint32_t(FloatRegisters::Simd128) *
++ FloatRegisters::TotalPhys) +
++ encoding_);
++ }
++ return (Codes::SpreadSingle | Codes::SpreadDouble) << encoding_;
++ }
++
++ static constexpr RegTypeName DefaultType = RegTypeName::Float64;
++
++ template <RegTypeName Name = DefaultType>
++ static SetType LiveAsIndexableSet(SetType s) {
++ return SetType(0);
++ }
++
++ template <RegTypeName Name = DefaultType>
++ static SetType AllocatableAsIndexableSet(SetType s) {
++ static_assert(Name != RegTypeName::Any, "Allocatable set are not iterable");
++ return LiveAsIndexableSet<Name>(s);
++ }
++
++ static TypedRegisterSet<FloatRegister> ReduceSetForPush(
++ const TypedRegisterSet<FloatRegister>& s);
++ static uint32_t GetPushSizeInBytes(const TypedRegisterSet<FloatRegister>& s);
++ uint32_t getRegisterDumpOffsetInBytes();
++};
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Float32>(SetType set) {
++ return set & FloatRegisters::AllSingleMask;
++}
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Float64>(SetType set) {
++ return set & FloatRegisters::AllDoubleMask;
++}
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Vector128>(SetType set) {
++ return set & FloatRegisters::AllSimd128Mask;
++}
++
++template <>
++inline FloatRegister::SetType
++FloatRegister::LiveAsIndexableSet<RegTypeName::Any>(SetType set) {
++ return set;
++}
++
++inline bool hasUnaliasedDouble() { return false; }
++inline bool hasMultiAlias() { return false; }
++
++// PPC64 feature bits packed into the value GetPPC64Flags() returns,
++// which feeds wasm/WasmCompile.cpp's per-architecture code signature.
++// Defined as enum constants (not enum class) so callers can OR/AND
++// freely. New bits should remain backward-compatible — older signatures
++// must keep meaning the same set of features.
++enum PPC64FeatureFlags : uint32_t {
++ PPC64Flag_POWER9 = 1u << 0,
++ // Future: PPC64Flag_POWER10 = 1u << 1, PPC64Flag_VSX4 = 1u << 2, ...
++};
++
++uint32_t GetPPC64Flags();
++
++class PPC64Flags final {
++ public:
++ PPC64Flags() = delete;
++
++ // PPC64Flags::Init is called from the JitContext constructor to read the
++ // hardware capabilities (via getauxval(AT_HWCAP2)). It must be called
++ // exactly once, before HasPOWER9()/HasPOWER10() are used.
++ static void Init();
++};
++
++bool HasPOWER9();
++bool HasPOWER10();
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_Architecture_ppc64_h */
+diff --git a/js/src/jit/ppc64/Assembler-ppc64.cpp b/js/src/jit/ppc64/Assembler-ppc64.cpp
+new file mode 100644
+index 000000000000..481070c4c6d5
+--- /dev/null
++++ b/js/src/jit/ppc64/Assembler-ppc64.cpp
+@@ -0,0 +1,3028 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Assembler-ppc64.h"
++
++#include "mozilla/DebugOnly.h"
++#include "mozilla/Maybe.h"
++
++#include "gc/Marking.h"
++#include "jit/AutoWritableJitCode.h"
++#include "jit/ExecutableAllocator.h"
++#include "jit/FlushICache.h"
++
++using mozilla::DebugOnly;
++
++using namespace js;
++using namespace js::jit;
++
++// ELFv2 ABI: 8 GPRs (r3-r10), 13 FPRs (f1-f13).
++// FP arguments also consume a GPR slot per ELFv2 convention.
++ABIArg ABIArgGenerator::next(MIRType type) {
++ switch (type) {
++ case MIRType::Int32:
++ case MIRType::Int64:
++ case MIRType::Pointer:
++ case MIRType::WasmAnyRef:
++ case MIRType::WasmArrayData:
++ case MIRType::StackResults: {
++ if (intRegIndex_ >= NumIntArgRegs) {
++ current_ = ABIArg(stackOffset_);
++ stackOffset_ += sizeof(uintptr_t);
++ break;
++ }
++ current_ = ABIArg(Register::FromCode(Registers::r3 + intRegIndex_));
++ intRegIndex_++;
++ break;
++ }
++ case MIRType::Float32:
++ case MIRType::Double: {
++ if (floatRegIndex_ == NumFloatArgRegs) {
++ current_ = ABIArg(stackOffset_);
++ stackOffset_ += sizeof(double);
++ break;
++ }
++ current_ = ABIArg(FloatRegister(
++ FloatRegisters::Encoding(FloatRegisters::f1 + floatRegIndex_),
++ type == MIRType::Double ? FloatRegisters::Double
++ : FloatRegisters::Single));
++ floatRegIndex_++;
++ // ELFv2 ABI: each FP arg also consumes a GPR slot (shadow).
++ // Cap at NumIntArgRegs so subsequent int args go to the stack.
++ if (intRegIndex_ < NumIntArgRegs) {
++ intRegIndex_++;
++ }
++ break;
++ }
++ case MIRType::Simd128: {
++ // Pass v128 in FP registers (Simd128 kind). On PPC64 ELFv2, SIMD
++ // values use the same VSR register file as FP args.
++ if (floatRegIndex_ == NumFloatArgRegs) {
++ current_ = ABIArg(stackOffset_);
++ stackOffset_ += 16;
++ break;
++ }
++ current_ = ABIArg(FloatRegister(
++ FloatRegisters::Encoding(FloatRegisters::f1 + floatRegIndex_),
++ FloatRegisters::Simd128));
++ floatRegIndex_++;
++ if (intRegIndex_ < NumIntArgRegs) {
++ intRegIndex_++;
++ }
++ break;
++ }
++ default:
++ MOZ_CRASH("Unexpected argument type");
++ }
++ return current_;
++}
++
++// Condition inversion tables.
++Assembler::Condition Assembler::InvertCondition(Condition cond) {
++ switch (cond) {
++ case Equal:
++ return NotEqual;
++ case NotEqual:
++ return Equal;
++ case LessThan:
++ return GreaterThanOrEqual;
++ case LessThanOrEqual:
++ return GreaterThan;
++ case GreaterThan:
++ return LessThanOrEqual;
++ case GreaterThanOrEqual:
++ return LessThan;
++ case Above:
++ return BelowOrEqual;
++ case AboveOrEqual:
++ return Below;
++ case Below:
++ return AboveOrEqual;
++ case BelowOrEqual:
++ return Above;
++ case Zero:
++ return NonZero;
++ case NonZero:
++ return Zero;
++ case Signed:
++ return NotSigned;
++ case NotSigned:
++ return Signed;
++ case SOBit:
++ return NSOBit;
++ case NSOBit:
++ return SOBit;
++ case Overflow:
++ return NotOverflow;
++ case NotOverflow:
++ return Overflow;
++ case CarrySet:
++ return CarryClear;
++ case CarryClear:
++ return CarrySet;
++ default:
++ MOZ_CRASH("unexpected condition");
++ }
++}
++
++Assembler::DoubleCondition Assembler::InvertCondition(DoubleCondition cond) {
++ switch (cond) {
++ case DoubleOrdered:
++ return DoubleUnordered;
++ case DoubleEqual:
++ return DoubleNotEqualOrUnordered;
++ case DoubleNotEqual:
++ return DoubleEqualOrUnordered;
++ case DoubleGreaterThan:
++ return DoubleLessThanOrEqualOrUnordered;
++ case DoubleGreaterThanOrEqual:
++ return DoubleLessThanOrUnordered;
++ case DoubleLessThan:
++ return DoubleGreaterThanOrEqualOrUnordered;
++ case DoubleLessThanOrEqual:
++ return DoubleGreaterThanOrUnordered;
++ case DoubleUnordered:
++ return DoubleOrdered;
++ case DoubleEqualOrUnordered:
++ return DoubleNotEqual;
++ case DoubleNotEqualOrUnordered:
++ return DoubleEqual;
++ case DoubleGreaterThanOrUnordered:
++ return DoubleLessThanOrEqual;
++ case DoubleGreaterThanOrEqualOrUnordered:
++ return DoubleLessThan;
++ case DoubleLessThanOrUnordered:
++ return DoubleGreaterThanOrEqual;
++ case DoubleLessThanOrEqualOrUnordered:
++ return DoubleGreaterThan;
++ default:
++ MOZ_CRASH("unexpected condition");
++ }
++}
++
++// InstImm helper.
++uint8_t InstImm::traptag() {
++ uint8_t r = ((data & 0x001f0000) >> 16);
++ MOZ_ASSERT(isOpcode(PPC_tw));
++ MOZ_ASSERT(r == ((data & 0x0000f800) >> 11));
++ return r & 0xfe;
++}
++
++BOffImm16::BOffImm16(InstImm inst) : data(inst.extractImm16Value() & 0xFFFC) {
++ // Sign-extend the 16-bit field.
++ if (data & 0x8000) {
++ data |= ~0xFFFF;
++ }
++}
++
++Instruction* BOffImm16::getDest(Instruction* src) const {
++ return (Instruction*)((uint8_t*)src + data);
++}
++
++Instruction* JOffImm26::getDest(Instruction* src) const {
++ return (Instruction*)((uint8_t*)src + data);
++}
++
++Imm16::Imm16() : value(0) {}
++
++Imm8::Imm8() : value(0) {}
++
++// Buffer management.
++bool Assembler::oom() const {
++ return AssemblerShared::oom() || m_buffer.oom() || jumpRelocations_.oom() ||
++ dataRelocations_.oom();
++}
++
++void Assembler::finish() {
++ MOZ_ASSERT(!isFinished);
++ isFinished = true;
++ m_buffer.flushPool();
++}
++
++bool Assembler::appendRawCode(const uint8_t* code, size_t numBytes) {
++ return m_buffer.appendRawCode(code, numBytes);
++}
++
++bool Assembler::reserve(size_t size) {
++ // Fixed-size chunk buffer; no point in reserving now vs. on-demand.
++ return !oom();
++}
++
++bool Assembler::swapBuffer(wasm::Bytes& bytes) {
++ MOZ_ASSERT(bytes.empty());
++ if (!bytes.resize(bytesNeeded())) {
++ return false;
++ }
++ m_buffer.executableCopy(bytes.begin());
++ return true;
++}
++
++void Assembler::copyJumpRelocationTable(uint8_t* dest) {
++ if (jumpRelocations_.length()) {
++ memcpy(dest, jumpRelocations_.buffer(), jumpRelocations_.length());
++ }
++}
++
++void Assembler::copyDataRelocationTable(uint8_t* dest) {
++ if (dataRelocations_.length()) {
++ memcpy(dest, dataRelocations_.buffer(), dataRelocations_.length());
++ }
++}
++
++void Assembler::executableCopy(void* buffer) {
++ MOZ_ASSERT(isFinished);
++ m_buffer.executableCopy(static_cast<uint8_t*>(buffer));
++}
++
++void Assembler::executableCopy(uint8_t* buffer) {
++ MOZ_ASSERT(isFinished);
++ m_buffer.executableCopy(buffer);
++}
++
++size_t Assembler::size() const {
++ // AssemblerBufferWithConstantPools::size() asserts pool is empty.
++ // Flush pending pool entries first.
++ const_cast<PPCBufferWithExecutableCopy&>(m_buffer).flushPool();
++ return m_buffer.size();
++}
++
++size_t Assembler::jumpRelocationTableBytes() const {
++ return jumpRelocations_.length();
++}
++
++size_t Assembler::dataRelocationTableBytes() const {
++ return dataRelocations_.length();
++}
++
++size_t Assembler::bytesNeeded() const {
++ return size() + jumpRelocationTableBytes() + dataRelocationTableBytes();
++}
++
++// Write an instruction into the buffer or to an external destination.
++BufferOffset Assembler::writeInst(uint32_t x, uint32_t* dest) {
++ MOZ_ASSERT(hasCreator());
++ if (dest == nullptr) {
++ return m_buffer.putInt(x);
++ }
++
++ WriteInstStatic(x, dest);
++ return BufferOffset();
++}
++
++void Assembler::WriteInstStatic(uint32_t x, uint32_t* dest) {
++ MOZ_ASSERT(dest != nullptr);
++ *dest = x;
++}
++
++// Alignment.
++BufferOffset Assembler::haltingAlign(int alignment) {
++ BufferOffset ret;
++ MOZ_ASSERT(m_buffer.isAligned(4));
++ if (alignment == 8) {
++ if (!m_buffer.isAligned(alignment)) {
++ BufferOffset tmp = xs_trap();
++ if (!ret.assigned()) {
++ ret = tmp;
++ }
++ }
++ } else {
++ MOZ_ASSERT((alignment & (alignment - 1)) == 0);
++ while (size() & (alignment - 1)) {
++ BufferOffset tmp = xs_trap();
++ if (!ret.assigned()) {
++ ret = tmp;
++ }
++ }
++ }
++ return ret;
++}
++
++BufferOffset Assembler::nopAlign(int alignment) {
++ BufferOffset ret;
++ MOZ_ASSERT(m_buffer.isAligned(4));
++ if (alignment == 8) {
++ if (!m_buffer.isAligned(alignment)) {
++ BufferOffset tmp = as_nop();
++ if (!ret.assigned()) {
++ ret = tmp;
++ }
++ }
++ } else {
++ MOZ_ASSERT((alignment & (alignment - 1)) == 0);
++ while (size() & (alignment - 1)) {
++ BufferOffset tmp = as_nop();
++ if (!ret.assigned()) {
++ ret = tmp;
++ }
++ }
++ }
++ return ret;
++}
++
++// Primitive instructions.
++BufferOffset Assembler::as_nop() {
++ spew("nop");
++ return writeInst(PPC_nop);
++}
++
++BufferOffset Assembler::as_lwsync() {
++ spew("lwsync");
++ return writeInst(PPC_lwsync);
++}
++
++BufferOffset Assembler::as_sync() {
++ spew("sync");
++ return writeInst(PPC_sync);
++}
++
++BufferOffset Assembler::as_isync() {
++ spew("isync");
++ return writeInst(PPC_isync);
++}
++
++// Branch and jump instructions.
++BufferOffset Assembler::as_b(JOffImm26 off, BranchAddressType bat, LinkBit lb) {
++ return as_b(off.encode(), bat, lb);
++}
++
++BufferOffset Assembler::as_b(int32_t off, BranchAddressType bat, LinkBit lb) {
++ spew("b%s%s\t%x", bat == AbsoluteBranch ? "a" : "", lb ? "l" : "", off);
++ MOZ_ASSERT(!(off & 0x03));
++ return writeInst(PPC_b | ((uint32_t)off & 0x3fffffc) | bat | lb);
++}
++
++BufferOffset Assembler::as_blr(LinkBit lb) {
++ spew("blr%s", lb ? "l" : "");
++ return writeInst(uint32_t(PPC_blr) | uint32_t(lb));
++}
++
++BufferOffset Assembler::as_bctr(LinkBit lb) {
++ spew("bctr%s", lb ? "l" : "");
++ return writeInst(uint32_t(PPC_bctr) | uint32_t(lb));
++}
++
++// Conditional branches.
++BufferOffset Assembler::as_bc(BOffImm16 off, Condition cond, CRegisterID cr,
++ LikelyBit lkb, LinkBit lb) {
++ return as_bc(off.encode(), cond, cr, lkb, lb);
++}
++
++BufferOffset Assembler::as_bc(int16_t off, Condition cond, CRegisterID cr,
++ LikelyBit lkb, LinkBit lb) {
++ return as_bc(off, computeConditionCode(cond, cr), lkb, lb);
++}
++
++BufferOffset Assembler::as_bc(BOffImm16 off, DoubleCondition cond,
++ CRegisterID cr, LikelyBit lkb, LinkBit lb) {
++ return as_bc(off.encode(), cond, cr, lkb, lb);
++}
++
++BufferOffset Assembler::as_bc(int16_t off, DoubleCondition cond, CRegisterID cr,
++ LikelyBit lkb, LinkBit lb) {
++ return as_bc(off, computeConditionCode(cond, cr), lkb, lb);
++}
++
++BufferOffset Assembler::as_bcctr(Condition cond, CRegisterID cr, LikelyBit lkb,
++ LinkBit lb) {
++ return as_bcctr(computeConditionCode(cond, cr), lkb, lb);
++}
++
++BufferOffset Assembler::as_bcctr(DoubleCondition cond, CRegisterID cr,
++ LikelyBit lkb, LinkBit lb) {
++ return as_bcctr(computeConditionCode(cond, cr), lkb, lb);
++}
++
++// Condition code computation: turn DoubleCondition + CR into BO|BI.
++// May emit CR logic instructions for synthetic conditions involving FU bit.
++uint16_t Assembler::computeConditionCode(DoubleCondition op, CRegisterID cr) {
++ const uint8_t condBit = crBit(cr, op);
++ const uint8_t fuBit = crBit(cr, DoubleUnordered);
++ uint32_t newop = (uint32_t)op & 255;
++
++ if (op & DoubleConditionUnordered) {
++ if ((uint32_t(op) & BranchOptionMask) == BranchOnClear) {
++ as_crorc(condBit, fuBit, condBit);
++ newop |= BranchOnSet;
++ } else {
++ if (condBit != fuBit) {
++ as_cror(condBit, fuBit, condBit);
++ }
++ }
++ } else {
++ if ((uint32_t(op) & BranchOptionMask) == BranchOnClear) {
++ if (condBit != fuBit) {
++ as_cror(condBit, fuBit, condBit);
++ }
++ } else {
++ if (condBit != fuBit) {
++ as_crandc(condBit, condBit, fuBit);
++ }
++ }
++ }
++
++ return (newop + ((uint8_t)cr << 6));
++}
++
++// Condition code computation: turn Condition + CR into BO|BI.
++// May emit mcrxrx for XER-mediated conditions.
++uint16_t Assembler::computeConditionCode(Condition op, CRegisterID cr) {
++ uint32_t newop = (uint32_t)op & 255;
++
++ if (op & ConditionOnlyXER) {
++ MOZ_ASSERT(op == Overflow || op == NotOverflow);
++ if (HasPOWER9()) {
++ as_mcrxrx(cr);
++ } else {
++ // POWER8: read XER, place OV into the GT position of the target
++ // CR field. Overflow condition (0x1c = GreaterThan) tests GT bit,
++ // which mcrxrx populates with OV32. For 64-bit ops OV == OV32.
++ // XER layout in GPR low 32 bits (IBM): bit 0=SO, 1=OV, 2=CA.
++ // Target: GT position = IBM bit 4*cr+1.
++ xs_mfxer(r0);
++ int gtBit = 4 * (int)cr + 1; // GT position in CR field
++ int sh = (1 - gtBit) & 31; // rotate OV from bit 1 to gtBit
++ as_rlwinm(r0, r0, sh, gtBit, gtBit); // isolate OV at GT only
++ as_mtcrf(1 << (7 - (int)cr), r0);
++ }
++ newop = (uint32_t)op & 255;
++ }
++
++ return (newop + ((uint8_t)cr << 6));
++}
++
++// Given BO|BI in a 16-bit quantity, split into bit fields for instruction.
++static uint32_t makeOpMask(uint16_t op) {
++ MOZ_ASSERT(!(op & 0xfc00));
++ return ((op & 0x0f) << 21) | ((op & 0xfff0) << 12);
++}
++
++BufferOffset Assembler::as_bc(int16_t off, uint16_t op, LikelyBit lkb,
++ LinkBit lb) {
++ spew("bc%s%s\tBO_BI=0x%04x,%d", lb ? "l" : "", lkb ? "+" : "", op, off);
++ MOZ_ASSERT(!(off & 0x03));
++ return writeInst(Instruction(PPC_bc | makeOpMask(op) | lkb << 21 |
++ ((uint16_t)off & 0xfffc) | lb)
++ .encode());
++}
++
++BufferOffset Assembler::as_bcctr(uint16_t op, LikelyBit lkb, LinkBit lb) {
++ spew("bcctr%s%s", lb ? "l" : "", lkb ? "+" : "");
++ return writeInst(PPC_bcctr | makeOpMask(op) | lkb << 21 | lb);
++}
++
++// SPR operations.
++BufferOffset Assembler::as_mtspr(SPRegisterID spr, Register ra) {
++ spew("mtspr\t%d,%3s", spr, ra.name());
++ return writeInst(PPC_mtspr | ra.code() << 21 | PPC_SPR(spr));
++}
++
++BufferOffset Assembler::as_mfspr(Register rd, SPRegisterID spr) {
++ spew("mfspr\t%3s,%d", rd.name(), spr);
++ return writeInst(PPC_mfspr | rd.code() << 21 | PPC_SPR(spr));
++}
++
++// CR operations.
++#define DEF_CRCR(op) \
++ BufferOffset Assembler::as_##op(uint8_t t, uint8_t a, uint8_t b) { \
++ spew(#op "\t%d,%d,%d", t, a, b); \
++ return writeInst(PPC_##op | t << 21 | a << 16 | b << 11); \
++ }
++DEF_CRCR(crandc)
++DEF_CRCR(cror)
++DEF_CRCR(crorc)
++#undef DEF_CRCR
++
++BufferOffset Assembler::as_mtcrf(uint32_t mask, Register rs) {
++ spew("mtcrf\t%d,%3s", mask, rs.name());
++ return writeInst(PPC_mtcrf | rs.code() << 21 | mask << 12);
++}
++
++BufferOffset Assembler::as_mfocrf(Register rd, CRegisterID crfs) {
++ spew("mfocrf\t%3s,cr%d", rd.name(), crfs);
++ // FXM is a one-hot 8-bit mask at bits 12-19. Bit (7-crfs) selects the CR.
++ return writeInst(PPC_mfocrf | rd.code() << 21 | (1 << (7 - crfs)) << 12);
++}
++
++BufferOffset Assembler::as_mcrxrx(CRegisterID cr) {
++ spew("mcrxrx\tcr%d", cr);
++ return writeInst(PPC_mcrxrx | cr << 23);
++}
++
++// GPR neg.
++BufferOffset Assembler::as_neg(Register rd, Register rs) {
++ spew("neg\t%3s,%3s", rd.name(), rs.name());
++ return writeInst(InstReg(PPC_neg, rd, rs, r0).encode());
++}
++
++// Compare instructions.
++BufferOffset Assembler::as_cmpd(CRegisterID cr, Register ra, Register rb) {
++ spew("cmpd\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++ return writeInst(PPC_cmpd | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpdi(CRegisterID cr, Register ra, int16_t im) {
++ spew("cmpdi\tcr%d,%3s,%d", cr, ra.name(), im);
++ return writeInst(PPC_cmpdi | cr << 23 | ra.code() << 16 |
++ ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpld(CRegisterID cr, Register ra, Register rb) {
++ spew("cmpld\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++ return writeInst(PPC_cmpld | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpldi(CRegisterID cr, Register ra, int16_t im) {
++ spew("cmpldi\tcr%d,%3s,%d", cr, ra.name(), im);
++ return writeInst(PPC_cmpldi | cr << 23 | ra.code() << 16 |
++ ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpw(CRegisterID cr, Register ra, Register rb) {
++ spew("cmpw\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++ return writeInst(PPC_cmpw | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpwi(CRegisterID cr, Register ra, int16_t im) {
++ spew("cmpwi\tcr%d,%3s,%d", cr, ra.name(), im);
++ return writeInst(PPC_cmpwi | cr << 23 | ra.code() << 16 |
++ ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmplw(CRegisterID cr, Register ra, Register rb) {
++ spew("cmplw\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++ return writeInst(PPC_cmplw | cr << 23 | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmplwi(CRegisterID cr, Register ra, int16_t im) {
++ spew("cmplwi\tcr%d,%3s,%d", cr, ra.name(), im);
++ return writeInst(PPC_cmplwi | cr << 23 | ra.code() << 16 |
++ ((uint16_t)im & 0xffff));
++}
++
++// Compare instructions (cr0 implicit).
++BufferOffset Assembler::as_cmpd(Register ra, Register rb) {
++ spew("cmpd\t%3s,%3s", ra.name(), rb.name());
++ return writeInst(PPC_cmpd | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpdi(Register ra, int16_t im) {
++ spew("cmpdi\t%3s,%d", ra.name(), im);
++ return writeInst(PPC_cmpdi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpld(Register ra, Register rb) {
++ spew("cmpld\t%3s,%3s", ra.name(), rb.name());
++ return writeInst(PPC_cmpld | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpldi(Register ra, int16_t im) {
++ spew("cmpldi\t%3s,%d", ra.name(), im);
++ return writeInst(PPC_cmpldi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmpw(Register ra, Register rb) {
++ spew("cmpw\t%3s,%3s", ra.name(), rb.name());
++ return writeInst(PPC_cmpw | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmpwi(Register ra, int16_t im) {
++ spew("cmpwi\t%3s,%d", ra.name(), im);
++ return writeInst(PPC_cmpwi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++BufferOffset Assembler::as_cmplw(Register ra, Register rb) {
++ spew("cmplw\t%3s,%3s", ra.name(), rb.name());
++ return writeInst(PPC_cmplw | ra.code() << 16 | rb.code() << 11);
++}
++
++BufferOffset Assembler::as_cmplwi(Register ra, int16_t im) {
++ spew("cmplwi\t%3s,%d", ra.name(), im);
++ return writeInst(PPC_cmplwi | ra.code() << 16 | ((uint16_t)im & 0xffff));
++}
++
++// FP encoding helpers.
++static uint32_t AForm(uint32_t op, FloatRegister frt, FloatRegister fra,
++ FloatRegister frb, FloatRegister frc, bool rc) {
++ return (op | (frt.encoding() << 21) | (fra.encoding() << 16) |
++ (frb.encoding() << 11) | (frc.encoding() << 6) | rc);
++}
++
++static uint32_t XForm(uint32_t op, FloatRegister frt, FloatRegister fra,
++ FloatRegister frb, bool rc) {
++ return (op | (frt.encoding() << 21) | (fra.encoding() << 16) |
++ (frb.encoding() << 11) | rc);
++}
++
++static uint32_t XForm(uint32_t op, FloatRegister frt, Register ra, Register rb,
++ bool rc) {
++ return (op | (frt.encoding() << 21) | (ra.code() << 16) | (rb.code() << 11) |
++ rc);
++}
++
++static uint32_t DForm(uint32_t op, FloatRegister frt, Register ra,
++ int16_t imm) {
++ return (op | (frt.encoding() << 21) | (ra.code() << 16) |
++ ((uint16_t)imm & 0xffff));
++}
++
++// XX-form encoders. Each form has its own X-bit positions.
++// All take uint32_t encodings (0-63) so they correctly
++// emit the high bit for VSR32-63. FloatRegister.encoding() returns 0-31
++// for Single/Double (= VSR0-31 = FPR namespace) and 32-63 for Simd128
++// (= VSR32-63 = VR namespace) — so a single XX-form encoder addresses
++// the full VSR space.
++
++// XX1-form: T + GPR (RA) + GPR (RB). TX bit at instruction bit 0.
++// Used by lxvx, stxvx, lxvd2x, stxvd2x, mtvsrdd, mtvsrd, mtvsrws, mtvsrwz.
++static uint32_t XX1Form(uint32_t op, uint32_t xt, uint32_t ra, uint32_t rb) {
++ return op | (xt & 31) << 21 | (ra & 31) << 16 | (rb & 31) << 11 |
++ ((xt >> 5) & 1);
++}
++
++// XX1-form for mfvsrX: GPR (RT) + VSR (XS). TX bit ("SX") at instruction
++// bit 0; the X spec calls this SX since the source register is the VSR.
++// Used by mfvsrd, mfvsrld.
++static uint32_t XX1FormMfvsr(uint32_t op, uint32_t rt, uint32_t xs) {
++ return op | (xs & 31) << 21 | (rt & 31) << 16 | ((xs >> 5) & 1);
++}
++
++// XX2-form: T + B (no A field; bits 16-20 unused or hold a UIM). BX bit
++// at instruction bit 1, TX bit at instruction bit 0. The bits16-20 slot
++// is set by callers — for plain XX2 it must be 0, for XX2 with UIM it
++// holds the immediate.
++// Used by xxbrd, xxbrh, xxbrw, xxbrq, xscvdpsp, xscvspdp, xscvdpspn,
++// xscvspdpn, xxspltw (UIM=2 bits), xxinsertw (UIM=4 bits),
++// xxextractuw (UIM=4 bits), xvabs*/xvneg*/xvsqrt*/xvr* etc. via
++// DEF_VSX_UN.
++static uint32_t XX2Form(uint32_t op, uint32_t xt, uint32_t xb,
++ uint32_t bits16to20 = 0) {
++ return op | (xt & 31) << 21 | (bits16to20 & 31) << 16 | (xb & 31) << 11 |
++ ((xb >> 5) & 1) << 1 | ((xt >> 5) & 1);
++}
++
++// XX3-form: T + A + B. AX/BX/TX bits at instruction bits 2/1/0.
++// Used by xxlor, xxland, xxlxor, xxlnor, xxlandc, xxpermdi, xsmaxjdp,
++// xsminjdp, xvadd*, xvcmp*, etc.
++static uint32_t XX3Form(uint32_t op, uint32_t xt, uint32_t xa, uint32_t xb) {
++ return op | (xt & 31) << 21 | (xa & 31) << 16 | (xb & 31) << 11 |
++ ((xa >> 5) & 1) << 2 | ((xb >> 5) & 1) << 1 | ((xt >> 5) & 1);
++}
++
++// XX4-form: T + A + B + C. CX/AX/BX/TX bits at instruction bits 3/2/1/0.
++// Used by xxsel.
++static uint32_t XX4Form(uint32_t op, uint32_t xt, uint32_t xa, uint32_t xb,
++ uint32_t xc) {
++ return op | (xt & 31) << 21 | (xa & 31) << 16 | (xb & 31) << 11 |
++ (xc & 31) << 6 | ((xc >> 5) & 1) << 3 | ((xa >> 5) & 1) << 2 |
++ ((xb >> 5) & 1) << 1 | ((xt >> 5) & 1);
++}
++
++// FloatRegister convenience overload for XX3Form (the most common form).
++static uint32_t XX3Form(uint32_t op, FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ return XX3Form(op, uint32_t(xt.encoding()), uint32_t(xa.encoding()),
++ uint32_t(xb.encoding()));
++}
++
++// --- Macro-defined instruction emitters ---
++
++// X-form: rd in bits 21-25, ra in 16-20, rb in 11-15.
++#define DEF_XFORM(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra, Register rb) { \
++ spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(InstReg(PPC_##op, rd, ra, rb).encode()); \
++ }
++
++#define DEF_XFORM_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register rd, Register ra, \
++ Register rb) { \
++ spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(InstReg(PPC_##op, rd, ra, rb).encode() | 0x1); \
++ }
++
++// X-form with swapped RS/RA encoding: rs in bits 21-25, ra in 16-20.
++#define DEF_XFORMS(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra, Register rb) { \
++ spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(InstReg(PPC_##op, ra, rd, rb).encode()); \
++ }
++
++#define DEF_XFORMS_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register rd, Register ra, \
++ Register rb) { \
++ spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(InstReg(PPC_##op, ra, rd, rb).encode() | 0x1); \
++ }
++
++// X-form shift immediate with swapped encoding.
++#define DEF_XFORMS_I(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra, uint8_t sh) { \
++ spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), sh); \
++ MOZ_ASSERT(sh < 32); \
++ return writeInst(PPC_##op | ra.code() << 21 | rd.code() << 16 | sh << 11); \
++ }
++
++// 2-reg X-form: rd in bits 21-25, ra in 16-20, rb=r0.
++#define DEF_XFORM2(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra) { \
++ spew(#op "\t%3s,%3s", rd.name(), ra.name()); \
++ return writeInst(InstReg(PPC_##op, rd, ra, r0).encode()); \
++ }
++
++#define DEF_XFORM2_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register rd, Register ra) { \
++ spew(#op ".\t%3s,%3s", rd.name(), ra.name()); \
++ return writeInst(InstReg(PPC_##op, rd, ra, r0).encode() | 0x1); \
++ }
++
++// 2-reg X-form swapped: ra in bits 21-25, rd in 16-20.
++#define DEF_XFORM2S(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra) { \
++ spew(#op "\t%3s,%3s", rd.name(), ra.name()); \
++ return writeInst(InstReg(PPC_##op, ra, rd, r0).encode()); \
++ }
++
++#define DEF_XFORM2S_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register rd, Register ra) { \
++ spew(#op ".\t%3s,%3s", rd.name(), ra.name()); \
++ return writeInst(InstReg(PPC_##op, ra, rd, r0).encode() | 0x1); \
++ }
++
++// D-form load/store: rd=RT, rb=RA (base register), off=displacement.
++// r0 cannot be used as base register for D-form loads/stores.
++#define DEF_DFORM(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register rb, int16_t off) { \
++ spew(#op "\t%3s,%d(%3s)", rd.name(), off, rb.name()); \
++ MOZ_ASSERT(rb != r0); \
++ return writeInst(InstImm(PPC_##op, rd, rb, off).encode()); \
++ }
++
++// D-form with swapped RS/RA encoding for logical immediates.
++#define DEF_DFORMS(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra, uint16_t im) { \
++ spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), im); \
++ return writeInst(InstImm(PPC_##op, ra, rd, im).encode()); \
++ }
++
++// M-form: rotate with 3 registers + mb + me.
++#define DEF_MFORM(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register rs, Register rb, \
++ uint8_t mb, uint8_t me) { \
++ spew(#op "\t%3s,%3s,%3s,%d,%d", rd.name(), rs.name(), rb.name(), mb, me); \
++ MOZ_ASSERT(mb < 32); \
++ MOZ_ASSERT(me < 32); \
++ return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | \
++ rb.code() << 11 | mb << 6 | me << 1); \
++ }
++
++// M-form with immediate shift.
++#define DEF_MFORM_I(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register rs, uint8_t sh, \
++ uint8_t mb, uint8_t me) { \
++ spew(#op "\t%3s,%3s,%d,%d,%d", rd.name(), rs.name(), sh, mb, me); \
++ MOZ_ASSERT(sh < 32); \
++ MOZ_ASSERT(mb < 32); \
++ MOZ_ASSERT(me < 32); \
++ return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | sh << 11 | \
++ mb << 6 | me << 1); \
++ }
++
++#define DEF_MFORM_I_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register rd, Register rs, uint8_t sh, \
++ uint8_t mb, uint8_t me) { \
++ spew(#op ".\t%3s,%3s,%d,%d,%d", rd.name(), rs.name(), sh, mb, me); \
++ MOZ_ASSERT(sh < 32); \
++ MOZ_ASSERT(mb < 32); \
++ MOZ_ASSERT(me < 32); \
++ return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | sh << 11 | \
++ mb << 6 | me << 1 | 1); \
++ }
++
++// MDS-form: rotate with register + mb (64-bit).
++#define DEF_MDSFORM(op) \
++ BufferOffset Assembler::as_##op(Register ra, Register rs, Register rb, \
++ uint8_t mb) { \
++ spew(#op "\t%3s,%3s,%3s,%d", ra.name(), rs.name(), rb.name(), mb); \
++ MOZ_ASSERT(mb < 64); \
++ return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 | \
++ rb.code() << 11 | ((mb & 0x1f) << 6) | (mb & 0x20)); \
++ }
++
++#define DEF_MDSFORM_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register ra, Register rs, Register rb, \
++ uint8_t mb) { \
++ spew(#op ".\t%3s,%3s,%3s,%d", ra.name(), rs.name(), rb.name(), mb); \
++ MOZ_ASSERT(mb < 64); \
++ return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 | \
++ rb.code() << 11 | ((mb & 0x1f) << 6) | (mb & 0x20) | 1); \
++ }
++
++// MD-form: rotate/shift with immediate sh + mb (64-bit).
++// sh and mb are 6-bit fields split across the instruction word.
++#define DEF_MDFORM(op) \
++ BufferOffset Assembler::as_##op(Register ra, Register rs, uint8_t sh, \
++ uint8_t mb) { \
++ spew(#op "\t%3s,%3s,%d,%d", ra.name(), rs.name(), sh, mb); \
++ MOZ_ASSERT(sh < 64); \
++ MOZ_ASSERT(mb < 64); \
++ return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 | \
++ ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (mb & 0x20) | \
++ ((sh & 0x20) >> 4)); \
++ }
++
++#define DEF_MDFORM_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(Register ra, Register rs, uint8_t sh, \
++ uint8_t mb) { \
++ spew(#op ".\t%3s,%3s,%d,%d", ra.name(), rs.name(), sh, mb); \
++ MOZ_ASSERT(sh < 64); \
++ MOZ_ASSERT(mb < 64); \
++ return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 | \
++ ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (mb & 0x20) | \
++ ((sh & 0x20) >> 4) | 0x01); \
++ }
++
++// FP 2-reg X-form: frt in bits 21-25, fra=f0, frb in 11-15.
++#define DEF_XFORM2_F(op) \
++ BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra) { \
++ spew(#op "\t%3s,%3s", rd.name(), ra.name()); \
++ return writeInst(XForm(PPC_##op, rd, f0, ra, false)); \
++ }
++
++#define DEF_XFORM2_F_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra) { \
++ spew(#op ".\t%3s,%3s", rd.name(), ra.name()); \
++ return writeInst(XForm(PPC_##op, rd, f0, ra, true)); \
++ }
++
++// FP A-form with frc (fmul-type): frt, fra, frc; frb=f0.
++#define DEF_AFORM_C(op) \
++ BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
++ FloatRegister rc) { \
++ spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rc.name()); \
++ return writeInst(AForm(PPC_##op, rd, ra, f0, rc, false)); \
++ }
++
++#define DEF_AFORM_C_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
++ FloatRegister rc) { \
++ spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rc.name()); \
++ return writeInst(AForm(PPC_##op, rd, ra, f0, rc, true)); \
++ }
++
++// FP A-form with frb (fadd-type): frt, fra, frb; frc=f0.
++#define DEF_AFORM_B(op) \
++ BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
++ FloatRegister rb) { \
++ spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(AForm(PPC_##op, rd, ra, rb, f0, false)); \
++ }
++
++#define DEF_AFORM_B_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
++ FloatRegister rb) { \
++ spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(AForm(PPC_##op, rd, ra, rb, f0, true)); \
++ }
++
++// Full FP A-form: frt, fra, frc, frb (fmadd-type).
++#define DEF_AFORM(op) \
++ BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
++ FloatRegister rc, FloatRegister rb) { \
++ spew(#op "\t%3s,%3s,%3s,%3s", rd.name(), ra.name(), rc.name(), rb.name()); \
++ return writeInst(AForm(PPC_##op, rd, ra, rb, rc, false)); \
++ }
++
++#define DEF_AFORM_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
++ FloatRegister rc, FloatRegister rb) { \
++ spew(#op ".\t%3s,%3s,%3s,%3s", rd.name(), ra.name(), rc.name(), \
++ rb.name()); \
++ return writeInst(AForm(PPC_##op, rd, ra, rb, rc, true)); \
++ }
++
++// FP D-form load/store.
++#define DEF_DFORM_F(op) \
++ BufferOffset Assembler::as_##op(FloatRegister rd, Register rb, \
++ int16_t off) { \
++ spew(#op "\t%3s,%d(%3s)", rd.name(), off, rb.name()); \
++ MOZ_ASSERT(rb != r0); \
++ return writeInst(DForm(PPC_##op, rd, rb, off)); \
++ }
++
++// FP X-form indexed load/store.
++#define DEF_FMEMx(op) \
++ BufferOffset Assembler::as_##op(FloatRegister rd, Register ra, \
++ Register rb) { \
++ spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name()); \
++ return writeInst(XForm(PPC_##op, rd, ra, rb, false)); \
++ }
++
++// --- Rotate/shift instructions ---
++
++DEF_MFORM(rlwnm)
++DEF_MFORM_I(rlwinm)
++DEF_MFORM_I_RC(rlwinm)
++DEF_MFORM_I(rlwimi)
++DEF_XFORMS_I(srawi)
++
++DEF_MDSFORM(rldcl)
++DEF_MDFORM(rldicl)
++DEF_MDFORM_RC(rldicl)
++DEF_MDFORM(rldicr)
++DEF_MDFORM_RC(rldicr)
++DEF_MDFORM(rldimi)
++
++BufferOffset Assembler::as_sradi(Register rd, Register rs, int sh) {
++ spew("sradi\t%3s,%3s,%d", rd.name(), rs.name(), sh);
++ MOZ_ASSERT(sh >= 0 && sh < 64);
++ return writeInst(PPC_sradi | rd.code() << 16 | rs.code() << 21 |
++ (sh & 0x1f) << 11 | (sh & 0x20) >> 4);
++}
++
++// --- ALU three-register ---
++
++#define DEF_ALU2(op) DEF_XFORM(op)
++
++DEF_ALU2(add)
++DEF_ALU2(addc)
++DEF_ALU2(adde)
++DEF_ALU2(subf)
++DEF_ALU2(subfc)
++DEF_ALU2(subfe)
++DEF_ALU2(divd)
++DEF_ALU2(divdu)
++DEF_ALU2(divw)
++DEF_ALU2(divwu)
++// POWER9 modulo (XO-form, same encoding pattern as div).
++DEF_XFORM(modsd)
++DEF_XFORM(modsw)
++DEF_XFORM(modud)
++DEF_XFORM(moduw)
++DEF_ALU2(mulld)
++DEF_ALU2(mulhd)
++DEF_ALU2(mulhdu)
++DEF_ALU2(mulldo)
++DEF_ALU2(mullw)
++DEF_ALU2(mulhwu)
++#undef DEF_ALU2
++
++// --- ALU immediate ---
++
++// D-form ALU-immediate ops have no Rc bit at instruction LSB (that bit
++// is part of the 16-bit immediate). The only valid record-form variant
++// in this group is `addic.`, which is a separate primary opcode (13)
++// hand-written below; subfic and mulli have no record form at all.
++#define DEF_ALUI(op) \
++ BufferOffset Assembler::as_##op(Register rd, Register ra, int16_t im) { \
++ spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), im); \
++ return writeInst(InstImm(PPC_##op, rd, ra, im).encode()); \
++ }
++
++BufferOffset Assembler::as_addi(Register rd, Register ra, int16_t im,
++ bool actually_li) {
++#ifdef DEBUG
++ if (actually_li) {
++ spew("li\t%3s,%d", rd.name(), im);
++ } else {
++ MOZ_ASSERT(ra != r0);
++ spew("addi\t%3s,%3s,%d", rd.name(), ra.name(), im);
++ }
++#endif
++ return writeInst(InstImm(PPC_addi, rd, ra, im).encode());
++}
++
++BufferOffset Assembler::as_addis(Register rd, Register ra, int16_t im,
++ bool actually_lis) {
++#ifdef DEBUG
++ if (actually_lis) {
++ spew("lis\t%3s,%d", rd.name(), im);
++ } else {
++ MOZ_ASSERT(ra != r0);
++ spew("addis\t%3s,%3s,%d", rd.name(), ra.name(), im);
++ }
++#endif
++ return writeInst(InstImm(PPC_addis, rd, ra, im).encode());
++}
++
++DEF_ALUI(mulli)
++DEF_ALUI(subfic)
++#undef DEF_ALUI
++
++// --- ALU unary/extended ---
++
++
++#define DEF_ALUE_S(op) DEF_XFORM2S(op)
++DEF_ALUE_S(cntlzw)
++DEF_ALUE_S(cntlzd)
++DEF_ALUE_S(cnttzd)
++DEF_ALUE_S(cnttzw)
++#undef DEF_ALUE_S
++
++DEF_XFORM2S(popcntd)
++DEF_XFORM2S(popcntw)
++DEF_XFORM2S(brd) // POWER10
++DEF_XFORM2S(brh) // POWER10
++DEF_XFORM2S(brw) // POWER10
++
++// --- Bitwise logical (three-register) ---
++
++#define DEF_BITALU2(op) DEF_XFORMS(op)
++DEF_BITALU2(nor)
++DEF_BITALU2(slw)
++DEF_BITALU2(srw)
++DEF_BITALU2(sraw)
++DEF_BITALU2(sld)
++DEF_BITALU2(srd)
++DEF_BITALU2(srad)
++#undef DEF_BITALU2
++
++// and_, or_, xor_ are manually defined (trailing underscore to avoid C++
++// keyword conflicts). xs_mr delegates to as_or_ so we must not assert
++// rd==rs==rb in as_or_ (which would be a valid mr).
++BufferOffset Assembler::as_or_(Register rd, Register rs, Register rb) {
++ spew("or\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++ return writeInst(InstReg(PPC_or_, rs, rd, rb).encode());
++}
++
++BufferOffset Assembler::as_xor_(Register rd, Register rs, Register rb) {
++ spew("xor\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++ return writeInst(InstReg(PPC_xor_, rs, rd, rb).encode());
++}
++
++BufferOffset Assembler::as_and_(Register rd, Register rs, Register rb) {
++ spew("and\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++ return writeInst(InstReg(PPC_and_, rs, rd, rb).encode());
++}
++
++BufferOffset Assembler::as_and__rc(Register rd, Register rs, Register rb) {
++ spew("and.\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
++ return writeInst(InstReg(PPC_and_, rs, rd, rb).encode() | 0x1);
++}
++
++// --- Bitwise logical (immediate) ---
++
++DEF_DFORMS(ori)
++DEF_DFORMS(oris)
++DEF_DFORMS(xori)
++DEF_DFORMS(xoris)
++
++BufferOffset Assembler::as_andi_rc(Register rd, Register ra, uint16_t im) {
++ spew("andi.\t%3s,%3s,%d", rd.name(), ra.name(), im);
++ return writeInst(InstImm(PPC_andi_dot, ra, rd, im).encode());
++}
++
++// --- Sign extension ---
++
++#define DEF_ALUEXT(op) DEF_XFORM2S(op) DEF_XFORM2S_RC(op)
++DEF_XFORM2S(extsb)
++DEF_XFORM2S(extsh)
++DEF_ALUEXT(extsw)
++#undef DEF_ALUEXT
++
++// --- Integer loads (D-form) ---
++
++DEF_DFORM(lbz)
++DEF_DFORM(lha)
++DEF_DFORM(lhz)
++
++BufferOffset Assembler::as_lwa(Register rd, Register rb, int16_t off) {
++ spew("lwa\t%3s,%d(%3s)", rd.name(), off, rb.name());
++ MOZ_ASSERT(rb != r0);
++ MOZ_ASSERT(!(off & 0x03));
++ return writeInst(InstImm(PPC_lwa, rd, rb, off).encode());
++}
++
++DEF_DFORM(lwz)
++
++BufferOffset Assembler::as_ld(Register rd, Register rb, int16_t off) {
++ spew("ld\t%3s,%d(%3s)", rd.name(), off, rb.name());
++ MOZ_ASSERT(rb != r0);
++ MOZ_ASSERT(!(off & 0x03));
++ return writeInst(InstImm(PPC_ld, rd, rb, off).encode());
++}
++
++// --- Integer stores (D-form) ---
++
++DEF_DFORM(stb)
++DEF_DFORM(sth)
++DEF_DFORM(stw)
++
++BufferOffset Assembler::as_std(Register rd, Register rb, int16_t off) {
++ spew("std\t%3s,%d(%3s)", rd.name(), off, rb.name());
++ MOZ_ASSERT(rb != r0);
++ MOZ_ASSERT(!(off & 0x03));
++ return writeInst(InstImm(PPC_std, rd, rb, off).encode());
++}
++
++DEF_DFORM(stdu)
++
++#undef DEF_DFORM
++#undef DEF_DFORMS
++
++// --- Integer loads/stores (X-form, indexed) ---
++
++#define DEF_MEMx(op) DEF_XFORM(op)
++DEF_MEMx(lbzx) DEF_MEMx(lhax) DEF_MEMx(lhzx) DEF_MEMx(lwax)
++ DEF_MEMx(lwzx) DEF_MEMx(lwarx) DEF_MEMx(lbarx)
++ DEF_MEMx(lharx) DEF_MEMx(ldx) DEF_MEMx(ldarx) DEF_MEMx(stbx)
++ DEF_MEMx(stbcx) DEF_MEMx(stwx) DEF_MEMx(stwbrx) DEF_MEMx(sthx)
++ DEF_MEMx(sthcx) DEF_MEMx(stdx) DEF_MEMx(stdcx)
++ DEF_MEMx(stwcx)
++#undef DEF_MEMx
++
++// --- Integer select ---
++
++BufferOffset Assembler::as_isel(Register rt, Register ra, Register rb,
++ uint16_t bc, CRegisterID cr) {
++ MOZ_ASSERT(ra != r0);
++ return as_isel0(rt, ra, rb, bc, cr);
++}
++
++BufferOffset Assembler::as_isel0(Register rt, Register ra, Register rb,
++ uint16_t bc, CRegisterID cr) {
++ spew("isel\t%3s,%3s,%3s,cr%d:0x%02x", rt.name(), ra.name(), rb.name(), cr,
++ bc);
++ MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
++ uint16_t nbc = (bc >> 4) + (cr << 2);
++ return writeInst(PPC_isel | rt.code() << 21 | ra.code() << 16 |
++ rb.code() << 11 | nbc << 6);
++}
++
++BufferOffset Assembler::as_setbc(Register rt, uint16_t bc, CRegisterID cr) {
++ spew("setbc\t%3s,cr%d:0x%02x", rt.name(), cr, bc);
++ MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
++ uint16_t nbc = (bc >> 4) + (cr << 2);
++ return writeInst(PPC_setbc | (rt.code() << 21) | (nbc << 16));
++}
++
++BufferOffset Assembler::as_setbcr(Register rt, uint16_t bc, CRegisterID cr) {
++ spew("setbcr\t%3s,cr%d:0x%02x", rt.name(), cr, bc);
++ MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
++ uint16_t nbc = (bc >> 4) + (cr << 2);
++ return writeInst(PPC_setbcr | (rt.code() << 21) | (nbc << 16));
++}
++
++// --- FP compare ---
++
++BufferOffset Assembler::as_fcmpu(CRegisterID cr, FloatRegister ra,
++ FloatRegister rb) {
++ spew("fcmpu\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
++ return writeInst(PPC_fcmpu | cr << 23 | ra.encoding() << 16 |
++ rb.encoding() << 11);
++}
++
++BufferOffset Assembler::as_fcmpu(FloatRegister ra, FloatRegister rb) {
++ return as_fcmpu(cr0, ra, rb);
++}
++
++// --- FP arithmetic ---
++
++#define DEF_FPUAC(op) DEF_AFORM_C(op)
++DEF_FPUAC(fmul)
++DEF_FPUAC(fmuls)
++#undef DEF_FPUAC
++
++#define DEF_FPUAB(op) DEF_AFORM_B(op)
++DEF_FPUAB(fadd)
++DEF_FPUAB(fdiv)
++DEF_FPUAB(fsub)
++DEF_FPUAB(fadds)
++DEF_FPUAB(fdivs)
++DEF_FPUAB(fsubs)
++DEF_FPUAB(fcpsgn)
++#undef DEF_FPUAB
++
++// --- FP unary/conversion/rounding ---
++
++#define DEF_FPUDS(op) DEF_XFORM2_F(op)
++DEF_FPUDS(fabs)
++DEF_FPUDS(fneg)
++DEF_FPUDS(fmr)
++DEF_FPUDS(fcfid)
++DEF_FPUDS(fcfids)
++DEF_FPUDS(fcfidu)
++DEF_FPUDS(fcfidus)
++DEF_FPUDS(fctid)
++DEF_FPUDS(fctidz)
++DEF_FPUDS(fctiduz)
++DEF_FPUDS(fctiwz)
++DEF_FPUDS(frim)
++DEF_FPUDS(frip)
++DEF_FPUDS(friz)
++DEF_FPUDS(frsp)
++DEF_FPUDS(fsqrt)
++DEF_FPUDS(fsqrts)
++#undef DEF_FPUDS
++
++// --- FP loads/stores (D-form) ---
++
++DEF_DFORM_F(lfd)
++DEF_DFORM_F(lfs)
++DEF_DFORM_F(stfd)
++DEF_DFORM_F(stfs)
++DEF_DFORM_F(stfdu)
++DEF_DFORM_F(stfsu)
++
++// --- FP loads/stores (X-form, indexed) ---
++
++DEF_FMEMx(lfdx) DEF_FMEMx(lfsx) DEF_FMEMx(lfiwax)
++ DEF_FMEMx(stfdx) DEF_FMEMx(stfsx)
++// Clean up macros.
++#undef DEF_XFORM
++#undef DEF_XFORM_RC
++#undef DEF_XFORMS
++#undef DEF_XFORMS_RC
++#undef DEF_XFORMS_I
++#undef DEF_XFORM2
++#undef DEF_XFORM2_RC
++#undef DEF_XFORM2S
++#undef DEF_XFORM2S_RC
++#undef DEF_XFORM2_F
++#undef DEF_XFORM2_F_RC
++#undef DEF_MFORM
++#undef DEF_MFORM_I
++#undef DEF_MFORM_I_RC
++#undef DEF_MDSFORM
++#undef DEF_MDSFORM_RC
++#undef DEF_MDFORM
++#undef DEF_MDFORM_RC
++#undef DEF_DFORM_F
++#undef DEF_FMEMx
++#undef DEF_AFORM_C
++#undef DEF_AFORM_C_RC
++#undef DEF_AFORM_B
++#undef DEF_AFORM_B_RC
++#undef DEF_AFORM
++#undef DEF_AFORM_RC
++
++ // --- FPSCR operations ---
++
++ BufferOffset Assembler::as_mtfsb0(uint8_t bt) {
++ spew("mtfsb0\t%d", bt);
++ return writeInst(PPC_mtfsb0 | (uint32_t)bt << 21);
++}
++
++BufferOffset Assembler::as_mcrfs(CRegisterID bf, uint8_t bfa) {
++ spew("mcrfs\tcr%d,%d", bf, bfa);
++ return writeInst(PPC_mcrfs | (uint32_t)bf << 23 | (uint32_t)bfa << 18);
++}
++
++// --- VSX (FPR-only subset) ---
++
++BufferOffset Assembler::as_mfvsrd(Register ra, FloatRegister xs) {
++ spew("mfvsrd\t%3s,%3s", ra.name(), xs.name());
++ return writeInst(XX1FormMfvsr(PPC_mfvsrd, ra.code(), xs.encoding()));
++}
++
++BufferOffset Assembler::as_mtvsrd(FloatRegister xt, Register ra) {
++ spew("mtvsrd\t%3s,%3s", xt.name(), ra.name());
++ return writeInst(XX1Form(PPC_mtvsrd, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_mtvsrwa(FloatRegister xt, Register ra) {
++ spew("mtvsrwa\t%3s,%3s", xt.name(), ra.name());
++ return writeInst(XX1Form(PPC_mtvsrwa, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_mtvsrws(FloatRegister xt, Register ra) {
++ spew("mtvsrws\t%3s,%3s", xt.name(), ra.name());
++ return writeInst(XX1Form(PPC_mtvsrws, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_mtvsrwz(FloatRegister xt, Register ra) {
++ spew("mtvsrwz\t%3s,%3s", xt.name(), ra.name());
++ return writeInst(XX1Form(PPC_mtvsrwz, xt.encoding(), ra.code(), 0));
++}
++
++BufferOffset Assembler::as_xxbrd(FloatRegister xt, FloatRegister xb) {
++ spew("xxbrd\t%3s,%3s", xt.name(), xb.name());
++ return writeInst(XX2Form(PPC_xxbrd, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xscvdpspn(FloatRegister xt, FloatRegister xb) {
++ spew("xscvdpspn\t%3s,%3s", xt.name(), xb.name());
++ return writeInst(XX2Form(PPC_xscvdpspn, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xscvspdpn(FloatRegister xt, FloatRegister xb) {
++ spew("xscvspdpn\t%3s,%3s", xt.name(), xb.name());
++ return writeInst(XX2Form(PPC_xscvspdpn, xt.encoding(), xb.encoding()));
++}
++
++// POWER9 (ISA 3.0) scalar FP16 conversions. The UIM disambiguator is
++// already in PPC_xscvdphp / PPC_xscvhpdp; XX2Form's bits16to20 default
++// of 0 leaves it intact.
++BufferOffset Assembler::as_xscvdphp(FloatRegister xt, FloatRegister xb) {
++ spew("xscvdphp\t%3s,%3s", xt.name(), xb.name());
++ return writeInst(XX2Form(PPC_xscvdphp, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xscvhpdp(FloatRegister xt, FloatRegister xb) {
++ spew("xscvhpdp\t%3s,%3s", xt.name(), xb.name());
++ return writeInst(XX2Form(PPC_xscvhpdp, xt.encoding(), xb.encoding()));
++}
++
++BufferOffset Assembler::as_xsxexpdp(FloatRegister xt, FloatRegister xb) {
++ spew("xsxexpdp\t%3s,%3s", xt.name(), xb.name());
++ return writeInst(XX2Form(PPC_xsxexpdp, xt.encoding(), xb.encoding()));
++}
++
++// POWER9 (ISA 3.0) FP16 load/store, X-form indexed. lxsihzx loads
++// 16 bits into VSR dword 0 word 1's low halfword (zeroing the rest);
++// stxsihx stores from there. The XT[5]/XS[5] bit travels via the
++// X-form's TX/SX bit at instruction bit 0.
++BufferOffset Assembler::as_lxsihzx(FloatRegister xt, Register ra, Register rb) {
++ spew("lxsihzx\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++ return writeInst(PPC_lxsihzx | (xt.encoding() & 31) << 21 |
++ ra.code() << 16 | rb.code() << 11 |
++ ((xt.encoding() >> 5) & 1));
++}
++
++BufferOffset Assembler::as_stxsihx(FloatRegister xs, Register ra, Register rb) {
++ spew("stxsihx\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
++ return writeInst(PPC_stxsihx | (xs.encoding() & 31) << 21 |
++ ra.code() << 16 | rb.code() << 11 |
++ ((xs.encoding() >> 5) & 1));
++}
++
++// XX3-form, FPR-space only (encoding 0..31 → VSR0..31, all AX/BX/TX = 0).
++// Java/JavaScript-style scalar max/min — semantics verified to match
++// ECMA-262 Math.max/Math.min including ±0 and NaN propagation. POWER9-only.
++BufferOffset Assembler::as_xsmaxjdp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xsmaxjdp\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xsmaxjdp, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xsminjdp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xsminjdp\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xsminjdp, xt, xa, xb));
++}
++
++// --- VSX SIMD load/store ---
++
++// For VSX0-31 (FPR), the 6th register bit (TX/SX/BX) is 0.
++// X-form: opcode | T << 21 | A << 16 | B << 11 | xo | TX
++// lxvx/stxvx are POWER9 (ISA 3.0). lxvd2x/stxvd2x are POWER8 (ISA 2.07).
++
++BufferOffset Assembler::as_lxvx(FloatRegister xt, Register ra, Register rb) {
++ spew("lxvx\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++ return writeInst(XX1Form(PPC_lxvx, xt.encoding(), ra.code(), rb.code()));
++}
++
++BufferOffset Assembler::as_stxvx(FloatRegister xs, Register ra, Register rb) {
++ spew("stxvx\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
++ return writeInst(XX1Form(PPC_stxvx, xs.encoding(), ra.code(), rb.code()));
++}
++
++BufferOffset Assembler::as_lxvd2x(FloatRegister xt, Register ra, Register rb) {
++ spew("lxvd2x\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++ return writeInst(XX1Form(PPC_lxvd2x, xt.encoding(), ra.code(), rb.code()));
++}
++
++BufferOffset Assembler::as_stxvd2x(FloatRegister xs, Register ra, Register rb) {
++ spew("stxvd2x\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
++ return writeInst(XX1Form(PPC_stxvd2x, xs.encoding(), ra.code(), rb.code()));
++}
++
++// VMX register load/store. See PPC_lvx/PPC_stvx in Assembler-ppc64.h for
++// the encoding rationale.
++BufferOffset Assembler::as_lvx(uint8_t vrt, Register ra, Register rb) {
++ MOZ_ASSERT(vrt < 32);
++ spew("lvx\tvr%d,%3s,%3s", vrt, ra.name(), rb.name());
++ return writeInst(PPC_lvx | uint32_t(vrt) << 21 | ra.code() << 16 |
++ rb.code() << 11);
++}
++
++BufferOffset Assembler::as_stvx(uint8_t vrs, Register ra, Register rb) {
++ MOZ_ASSERT(vrs < 32);
++ spew("stvx\tvr%d,%3s,%3s", vrs, ra.name(), rb.name());
++ return writeInst(PPC_stvx | uint32_t(vrs) << 21 | ra.code() << 16 |
++ rb.code() << 11);
++}
++
++// --- VSX SIMD register operations ---
++
++// XX3-form: opcode | T[0:4]<<21 | A[0:4]<<16 | B[0:4]<<11 | xo | AX | BX | TX
++// where AX/BX/TX (bits 2/1/0) carry bit 5 of each 6-bit VSR index.
++// Encoded by the XX3Form helper above for both VSR0-31 (Single/Double) and
++// VSR32-63 (Simd128) operands.
++BufferOffset Assembler::as_xxlor(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xxlor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xxlor, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxland(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xxland\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xxland, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxlxor(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xxlxor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xxlxor, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxlnor(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xxlnor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xxlnor, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxlandc(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb) {
++ spew("xxlandc\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
++ return writeInst(XX3Form(PPC_xxlandc, xt, xa, xb));
++}
++
++BufferOffset Assembler::as_xxsel(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb, FloatRegister xc) {
++ spew("xxsel\t%3s,%3s,%3s,%3s", xt.name(), xa.name(), xb.name(), xc.name());
++ return writeInst(XX4Form(PPC_xxsel, xt.encoding(), xa.encoding(),
++ xb.encoding(), xc.encoding()));
++}
++
++BufferOffset Assembler::as_xxpermdi(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb, uint8_t dm) {
++ MOZ_ASSERT(dm < 4);
++ spew("xxpermdi\t%3s,%3s,%3s,%d", xt.name(), xa.name(), xb.name(), dm);
++ return writeInst(XX3Form(PPC_xxpermdi | (uint32_t(dm) << 8), xt, xa, xb));
++}
++
++// POWER9 (ISA 3.0). XX1-form with two GPR sources.
++BufferOffset Assembler::as_mtvsrdd(FloatRegister xt, Register ra, Register rb) {
++ spew("mtvsrdd\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
++ return writeInst(XX1Form(PPC_mtvsrdd, xt.encoding(), ra.code(), rb.code()));
++}
++
++// POWER9 (ISA 3.0). XX1-form: move lower doubleword of VSR to GPR.
++BufferOffset Assembler::as_mfvsrld(Register rt, FloatRegister xs) {
++ spew("mfvsrld\t%3s,%3s", rt.name(), xs.name());
++ return writeInst(XX1FormMfvsr(PPC_mfvsrld, rt.code(), xs.encoding()));
++}
++
++// --- XX2-form VSX instructions ---
++
++// XX2-form: opcode | T<<21 | UIM<<16_area | B<<11_area | XO<<2 | BX | TX
++// For VSR0-31, BX=TX=0.
++
++BufferOffset Assembler::as_xxspltw(FloatRegister xt, FloatRegister xb,
++ uint8_t uim) {
++ MOZ_ASSERT(uim < 4);
++ spew("xxspltw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
++ return writeInst(XX2Form(PPC_xxspltw, xt.encoding(), xb.encoding(), uim));
++}
++
++BufferOffset Assembler::as_xxinsertw(FloatRegister xt, FloatRegister xb,
++ uint8_t uim) {
++ MOZ_ASSERT(uim <= 12 && (uim & 3) == 0);
++ spew("xxinsertw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
++ return writeInst(XX2Form(PPC_xxinsertw, xt.encoding(), xb.encoding(), uim));
++}
++
++BufferOffset Assembler::as_xxextractuw(FloatRegister xt, FloatRegister xb,
++ uint8_t uim) {
++ MOZ_ASSERT(uim <= 12 && (uim & 3) == 0);
++ spew("xxextractuw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
++ return writeInst(XX2Form(PPC_xxextractuw, xt.encoding(), xb.encoding(), uim));
++}
++
++// POWER9 (ISA 3.0). XX1-form-ish: T(5) + UIM8(8) + XO + TX. UIM8 occupies
++// bits 18..11 (a non-standard slot that XX1Form doesn't fit), so encode
++// inline. TX bit at instruction bit 0 selects the upper half of VSR
++// space when xt.encoding() is in 32-63 (Simd128).
++BufferOffset Assembler::as_xxspltib(FloatRegister xt, uint8_t imm8) {
++ spew("xxspltib\t%3s,%u", xt.name(), imm8);
++ uint32_t enc = uint32_t(xt.encoding());
++ return writeInst(PPC_xxspltib | (enc & 31) << 21 | (uint32_t)imm8 << 11 |
++ ((enc >> 5) & 1));
++}
++
++// --- VMX instructions ---
++
++// VX-form: (4<<26) | VRT<<21 | UIMM<<16 | VRB<<11 | XO
++// VRT/VRB are 5-bit raw VR numbers (0-31). Simd128 FloatRegister.encoding()
++// returns 32-63; masking with & 31 maps it back to the VR offset 0-31.
++BufferOffset Assembler::as_vspltb(FloatRegister vrt, FloatRegister vrb,
++ uint8_t uim) {
++ MOZ_ASSERT(uim < 16);
++ spew("vspltb\t%3s,%3s,%d", vrt.name(), vrb.name(), uim);
++ return writeInst(PPC_vspltb | (vrt.encoding() & 31) << 21 |
++ (uint32_t)uim << 16 | (vrb.encoding() & 31) << 11);
++}
++
++BufferOffset Assembler::as_vsplth(FloatRegister vrt, FloatRegister vrb,
++ uint8_t uim) {
++ MOZ_ASSERT(uim < 8);
++ spew("vsplth\t%3s,%3s,%d", vrt.name(), vrb.name(), uim);
++ return writeInst(PPC_vsplth | (vrt.encoding() & 31) << 21 |
++ (uint32_t)uim << 16 | (vrb.encoding() & 31) << 11);
++}
++
++// VA-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | SHB<<6 | XO(6-bit)
++BufferOffset Assembler::as_vsldoi(FloatRegister vrt, FloatRegister vra,
++ FloatRegister vrb, uint8_t shb) {
++ MOZ_ASSERT(shb < 16);
++ spew("vsldoi\t%3s,%3s,%3s,%d", vrt.name(), vra.name(), vrb.name(), shb);
++ return writeInst(PPC_vsldoi | (vrt.encoding() & 31) << 21 |
++ (vra.encoding() & 31) << 16 | (vrb.encoding() & 31) << 11 |
++ (uint32_t)shb << 6);
++}
++
++// --- VMX integer arithmetic (VR registers only) ---
++
++// VX-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | XO
++// The macro takes raw VR numbers (0-31).
++#define DEF_VMX_VVV(op) \
++ BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vra, uint8_t vrb) { \
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32); \
++ spew(#op "\tvr%d,vr%d,vr%d", vrt, vra, vrb); \
++ return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11); \
++ }
++
++DEF_VMX_VVV(vaddubm)
++DEF_VMX_VVV(vadduhm)
++DEF_VMX_VVV(vadduwm)
++DEF_VMX_VVV(vaddudm)
++DEF_VMX_VVV(vsububm)
++DEF_VMX_VVV(vsubuhm)
++DEF_VMX_VVV(vsubuwm)
++DEF_VMX_VVV(vsubudm)
++DEF_VMX_VVV(vaddsbs)
++DEF_VMX_VVV(vaddshs)
++DEF_VMX_VVV(vaddubs)
++DEF_VMX_VVV(vadduhs)
++DEF_VMX_VVV(vsubsbs)
++DEF_VMX_VVV(vsubshs)
++DEF_VMX_VVV(vsububs)
++DEF_VMX_VVV(vsubuhs)
++DEF_VMX_VVV(vminsb)
++DEF_VMX_VVV(vminsh)
++DEF_VMX_VVV(vminsw)
++DEF_VMX_VVV(vmaxsb)
++DEF_VMX_VVV(vmaxsh)
++DEF_VMX_VVV(vmaxsw)
++DEF_VMX_VVV(vmaxsd)
++DEF_VMX_VVV(vminub)
++DEF_VMX_VVV(vminuh)
++DEF_VMX_VVV(vminuw)
++DEF_VMX_VVV(vmaxub)
++DEF_VMX_VVV(vmaxuh)
++DEF_VMX_VVV(vmaxuw)
++DEF_VMX_VVV(vavgub)
++DEF_VMX_VVV(vavguh)
++DEF_VMX_VVV(vmuluwm)
++DEF_VMX_VVV(vmulld)
++
++DEF_VMX_VVV(vslb)
++DEF_VMX_VVV(vslh)
++DEF_VMX_VVV(vslw)
++DEF_VMX_VVV(vsld)
++DEF_VMX_VVV(vsrb)
++DEF_VMX_VVV(vsrh)
++DEF_VMX_VVV(vsrw)
++DEF_VMX_VVV(vsrd)
++DEF_VMX_VVV(vsrab)
++DEF_VMX_VVV(vsrah)
++DEF_VMX_VVV(vsraw)
++DEF_VMX_VVV(vsrad)
++DEF_VMX_VVV(vslo)
++DEF_VMX_VVV(vsro)
++DEF_VMX_VVV(vcmpequb)
++DEF_VMX_VVV(vcmpequh)
++DEF_VMX_VVV(vcmpequw)
++DEF_VMX_VVV(vcmpequd)
++DEF_VMX_VVV(vcmpgtsb)
++DEF_VMX_VVV(vcmpgtsh)
++DEF_VMX_VVV(vcmpgtsw)
++DEF_VMX_VVV(vcmpgtsd)
++DEF_VMX_VVV(vcmpgtub)
++DEF_VMX_VVV(vcmpgtuh)
++DEF_VMX_VVV(vcmpgtuw)
++DEF_VMX_VVV(vcmpgtud)
++// POWER9 (ISA 3.0). NotEqual compare; saves the xxlnor that vcmpequX needs.
++DEF_VMX_VVV(vcmpneb)
++DEF_VMX_VVV(vcmpneh)
++DEF_VMX_VVV(vcmpnew)
++
++// POWER8+ (ISA 2.07). vbpermq RT,RA,RB: bit-permute quadword.
++DEF_VMX_VVV(vbpermq)
++
++#undef DEF_VMX_VVV
++
++// VC-form record forms: same as VX-form above with Rc bit (bit 10 LSB) set.
++// vcmpXXX. sets CR6: LT = all-true, EQ = none-true.
++#define DEF_VMX_VVV_RC(op) \
++ BufferOffset Assembler::as_##op##_rc(uint8_t vrt, uint8_t vra, \
++ uint8_t vrb) { \
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32); \
++ spew(#op ".\tvr%d,vr%d,vr%d", vrt, vra, vrb); \
++ return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11 | 0x400); \
++ }
++
++DEF_VMX_VVV_RC(vcmpequb)
++DEF_VMX_VVV_RC(vcmpequh)
++DEF_VMX_VVV_RC(vcmpequw)
++DEF_VMX_VVV_RC(vcmpequd)
++
++#undef DEF_VMX_VVV_RC
++
++// VSX float compare (XX3-form).
++#define DEF_VSX_CMP(op) \
++ BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xa, \
++ FloatRegister xb) { \
++ spew(#op "\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name()); \
++ return writeInst(XX3Form(PPC_##op, xt, xa, xb)); \
++ }
++
++DEF_VSX_CMP(xvcmpeqsp)
++DEF_VSX_CMP(xvcmpgtsp)
++DEF_VSX_CMP(xvcmpgesp)
++DEF_VSX_CMP(xvcmpeqdp)
++DEF_VSX_CMP(xvcmpgtdp)
++DEF_VSX_CMP(xvcmpgedp)
++
++#undef DEF_VSX_CMP
++
++// VSX float arithmetic (XX3-form binary).
++#define DEF_VSX_BIN(op) \
++ BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xa, \
++ FloatRegister xb) { \
++ spew(#op "\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name()); \
++ return writeInst(XX3Form(PPC_##op, xt, xa, xb)); \
++ }
++DEF_VSX_BIN(xvaddsp)
++DEF_VSX_BIN(xvadddp) DEF_VSX_BIN(xvsubsp) DEF_VSX_BIN(xvsubdp) DEF_VSX_BIN(
++ xvmulsp) DEF_VSX_BIN(xvmuldp) DEF_VSX_BIN(xvdivsp) DEF_VSX_BIN(xvdivdp)
++ DEF_VSX_BIN(xvminsp) DEF_VSX_BIN(xvmindp) DEF_VSX_BIN(xvmaxsp) DEF_VSX_BIN(
++ xvmaxdp) DEF_VSX_BIN(xvmaddasp) DEF_VSX_BIN(xvmaddadp)
++ DEF_VSX_BIN(xvnmsubasp) DEF_VSX_BIN(xvnmsubadp)
++#undef DEF_VSX_BIN
++
++// VSX unary (XX2-form): op | xt<<21 | xb<<11 | XO<<2
++// XX2-form unary VSX op: T + B, no UIM. Uses XX2Form helper for TX/BX bits.
++#define DEF_VSX_UN(op) \
++ BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xb) { \
++ spew(#op "\t%3s,%3s", xt.name(), xb.name()); \
++ return writeInst(XX2Form(PPC_##op, xt.encoding(), xb.encoding())); \
++ }
++ DEF_VSX_UN(xvabssp) DEF_VSX_UN(xvabsdp) DEF_VSX_UN(
++ xvnegsp) DEF_VSX_UN(xvnegdp) DEF_VSX_UN(xvsqrtsp)
++ DEF_VSX_UN(xvsqrtdp) DEF_VSX_UN(xvrspip) DEF_VSX_UN(
++ xvrdpip) DEF_VSX_UN(xvrspim) DEF_VSX_UN(xvrdpim)
++ DEF_VSX_UN(xvrspiz) DEF_VSX_UN(xvrdpiz) DEF_VSX_UN(
++ xvrspic) DEF_VSX_UN(xvrdpic) DEF_VSX_UN(xvcvsxwsp)
++ DEF_VSX_UN(xvcvuxwsp) DEF_VSX_UN(xvcvsxwdp) DEF_VSX_UN(
++ xvcvuxwdp) DEF_VSX_UN(xvcvspsxws)
++ DEF_VSX_UN(xvcvspuxws) DEF_VSX_UN(xvcvdpsxws)
++ DEF_VSX_UN(xvcvdpuxws) DEF_VSX_UN(xvcvdpsp)
++ DEF_VSX_UN(xvcvspdp)
++#undef DEF_VSX_UN
++
++// VMX unary VX-form: (4<<26) | VRT<<21 | 0<<16 | VRB<<11 | XO
++#define DEF_VMX_UNARY(op) \
++ BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vrb) { \
++ MOZ_ASSERT(vrt < 32 && vrb < 32); \
++ spew(#op "\tvr%d,vr%d", vrt, vrb); \
++ return writeInst(PPC_##op | vrt << 21 | vrb << 11); \
++ }
++ DEF_VMX_UNARY(vupkhsb) DEF_VMX_UNARY(
++ vupklsb) DEF_VMX_UNARY(vupkhsh)
++ DEF_VMX_UNARY(vupklsh)
++ DEF_VMX_UNARY(vupkhsw)
++ DEF_VMX_UNARY(vupklsw)
++ // POWER9 per-lane integer negate. The VRA field holds the subop code
++ // (6 for vnegw, 7 for vnegd) which is already baked into PPC_vneg{w,d}.
++ DEF_VMX_UNARY(vnegw) DEF_VMX_UNARY(vnegd) DEF_VMX_UNARY(vpopcntb)
++#undef DEF_VMX_UNARY
++
++ // POWER9 addpcis (DX-form). Computes rT = (CIA + 4) + (D << 16).
++ // D is a 16-bit signed immediate, split across three instruction fields:
++ // d0 = bits 16..25 (10 bits, D[15:6])
++ // d1 = bits 11..15 (5 bits, D[5:1])
++ // d2 = bit 31 (1 bit, D[0])
++ // Primary opcode 19, DX subop 2.
++ BufferOffset Assembler::as_addpcis(Register rt, int16_t d) {
++ spew("addpcis\t%s,%d", rt.name(), (int)d);
++ uint32_t D = uint16_t(d);
++ uint32_t inst = (19u << 26) | (uint32_t(rt.code()) << 21) |
++ ((D >> 1) & 0x1F) << 16 | ((D >> 6) & 0x3FF) << 6 |
++ (2u << 1) | (D & 1u);
++ return writeInst(inst);
++}
++
++// -----------------------------------------------------------------------------
++// Power ISA v3.1 (POWER10) prefixed instructions.
++//
++// Layout:
++//
++// Prefix word (BE bit numbering from the manual; LE bits in parentheses):
++// [0..5] primary opcode = 1 (LE 31..26)
++// [6..7] Type: 00 = 8LS, 10 = MLS (LE 25..24)
++// [8..10] reserved = 0 (LE 23..21)
++// [11] R: 1 = PC-relative (RA must be r0) (LE 20)
++// [12..13] reserved = 0 (LE 19..18)
++// [14..31] d0: high 18 bits of 34-bit signed immediate (LE 17..0)
++//
++// Suffix (paddi/pld, GPR target):
++// [0..5] suffix opcode (paddi=14, pld=57) (LE 31..26)
++// [6..10] RT (LE 25..21)
++// [11..15] RA (LE 20..16)
++// [16..31] d1: low 16 bits of immediate (LE 15..0)
++//
++// Suffix (plxv, VSR target — has the TX bit at suffix bit 5/LE bit 26):
++// [0..4] plxv 5-bit opcode = 11001 (=25) (LE 31..27)
++// [5] TX (high bit of 6-bit XT) (LE 26)
++// [6..10] T (low 5 bits of XT) (LE 25..21)
++// [11..15] RA (LE 20..16)
++// [16..31] d1 (LE 15..0)
++//
++// The prefix and suffix of a prefixed instruction must lie in the same
++// 64-byte aligned block at **runtime**. The JitCode allocator only
++// guarantees 16-byte alignment, so the buffer-relative offset and the
++// runtime address can differ by 0/16/32/48 mod 64. A buffer-only check
++// `(currentOffset() & 63) == 60` is correct when the allocator base is
++// 64-aligned but misses three of the four 16-aligned base classes — pad
++// whenever `(currentOffset() & 15) == 12`, which catches all four. The
++// enterNoPool guard prevents the constant-pool flusher from inserting
++// bodies between the (optional) nop, prefix, and suffix.
++
++static uint32_t EncodePower10Prefix(uint32_t type, bool R, uint32_t d0) {
++ MOZ_ASSERT(type == 0 || type == 2); // 8LS=0, MLS=2
++ MOZ_ASSERT(d0 < (1u << 18));
++ return (1u << 26) | (type << 24) | (uint32_t(R ? 1 : 0) << 20) |
++ (d0 & 0x3FFFFu);
++}
++
++static void SplitImm34(int64_t imm34, uint32_t* d0, uint32_t* d1) {
++ MOZ_ASSERT(imm34 >= -(int64_t(1) << 33));
++ MOZ_ASSERT(imm34 < (int64_t(1) << 33));
++ uint64_t u = uint64_t(imm34) & 0x3FFFFFFFFull; // low 34 bits
++ *d0 = uint32_t(u >> 16) & 0x3FFFFu; // 18 bits
++ *d1 = uint32_t(u) & 0xFFFFu; // 16 bits
++}
++
++void Assembler::ensurePrefixedAlignment() {
++ if ((currentOffset() & 15) == 12) {
++ as_nop();
++ }
++}
++
++// paddi RT, RA, SI, R (MLS, suffix opcode 14 = addi)
++// R=0: RT = (RA==0 ? 0 : RA) + sign_extend(SI, 34)
++// R=1: RT = CIA(prefix) + sign_extend(SI, 34) (RA must be r0)
++BufferOffset Assembler::as_paddi(Register rt, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("paddi\t%s,%s,%lld,%d", rt.name(), ra.name(), (long long)imm34,
++ R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++ uint32_t suffix = (14u << 26) | (uint32_t(rt.code()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ // Reservation = nop (worst case) + prefix + suffix.
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// pld RT, D(RA), R (8LS, suffix opcode 57)
++BufferOffset Assembler::as_pld(Register rt, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("pld\t%s,%lld(%s),%d", rt.name(), (long long)imm34, ra.name(),
++ R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++ uint32_t suffix = (57u << 26) | (uint32_t(rt.code()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// plxv XT, D(RA), R (8LS, 5-bit suffix opcode 25, TX in suffix bit 26)
++// XT is 6-bit: TX (high) || T (low 5) — matches lxvx convention.
++BufferOffset Assembler::as_plxv(uint8_t xt, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT(xt < 64);
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("plxv\tvs%u,%lld(%s),%d", xt, (long long)imm34, ra.name(),
++ R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++ uint32_t T = xt & 0x1Fu;
++ uint32_t TX = (xt >> 5) & 1u;
++ uint32_t suffix = (25u << 27) | (TX << 26) | (T << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// plfd FRT, D(RA), R (MLS, suffix opcode 50; D-form-like FPR load)
++BufferOffset Assembler::as_plfd(FloatRegister frt, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("plfd\tf%u,%lld(%s),%d", uint32_t(frt.encoding()),
++ (long long)imm34, ra.name(), R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++ uint32_t suffix = (50u << 26) | (uint32_t(frt.encoding()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// plfs FRT, D(RA), R (MLS, suffix opcode 48; widens single → double in FPR)
++BufferOffset Assembler::as_plfs(FloatRegister frt, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("plfs\tf%u,%lld(%s),%d", uint32_t(frt.encoding()),
++ (long long)imm34, ra.name(), R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++ uint32_t suffix = (48u << 26) | (uint32_t(frt.encoding()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// pstd RS, D(RA), R (8LS, suffix opcode 61 = std D-form)
++BufferOffset Assembler::as_pstd(Register rs, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("pstd\t%s,%lld(%s),%d", rs.name(), (long long)imm34, ra.name(),
++ R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++ uint32_t suffix = (61u << 26) | (uint32_t(rs.code()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// pstxv XS, D(RA), R (8LS, 5-bit suffix opcode 27, SX in suffix bit 26)
++// XS is 6-bit: SX (high) || S (low 5) — matches stxvx convention.
++BufferOffset Assembler::as_pstxv(uint8_t xs, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT(xs < 64);
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("pstxv\tvs%u,%lld(%s),%d", xs, (long long)imm34, ra.name(),
++ R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
++ uint32_t sx = (xs >> 5) & 1;
++ uint32_t s = xs & 0x1F;
++ uint32_t suffix = (27u << 27) | (sx << 26) | (s << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// pstfd FRS, D(RA), R (MLS, suffix opcode 54 = stfd)
++BufferOffset Assembler::as_pstfd(FloatRegister frs, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("pstfd\tf%u,%lld(%s),%d", uint32_t(frs.encoding()),
++ (long long)imm34, ra.name(), R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++ uint32_t suffix = (54u << 26) | (uint32_t(frs.encoding()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// pstfs FRS, D(RA), R (MLS, suffix opcode 52 = stfs)
++BufferOffset Assembler::as_pstfs(FloatRegister frs, Register ra, int64_t imm34,
++ bool R) {
++ MOZ_ASSERT_IF(R, ra == r0);
++ spew("pstfs\tf%u,%lld(%s),%d", uint32_t(frs.encoding()),
++ (long long)imm34, ra.name(), R ? 1 : 0);
++ uint32_t d0, d1;
++ SplitImm34(imm34, &d0, &d1);
++ uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
++ uint32_t suffix = (52u << 26) | (uint32_t(frs.encoding()) << 21) |
++ (uint32_t(ra.code()) << 16) | d1;
++ m_buffer.enterNoPool(3);
++ ensurePrefixedAlignment();
++ BufferOffset bo = writeInst(prefix);
++ writeInst(suffix);
++ m_buffer.leaveNoPool();
++ return bo;
++}
++
++// POWER10 (ISA 3.1) Vector Extract Mask. RT (GPR) gets the wasm-spec
++// bitmask (one bit per lane MSB) directly in low 16/8/4/2 bits. UIM
++// is baked into PPC_vextract{b,h,w,d}m (8/9/10/11). Caller must have
++// verified HasPOWER10().
++#define DEF_VEXTRACT_M(op) \
++ BufferOffset Assembler::as_##op(Register rt, FloatRegister vrb) { \
++ spew(#op "\t%s,vr%u", rt.name(), uint32_t(vrb.encoding() & 31)); \
++ return writeInst(PPC_##op | (uint32_t(rt.code()) << 21) | \
++ ((uint32_t(vrb.encoding()) & 31) << 11)); \
++ }
++DEF_VEXTRACT_M(vextractbm)
++DEF_VEXTRACT_M(vextracthm)
++DEF_VEXTRACT_M(vextractwm)
++DEF_VEXTRACT_M(vextractdm)
++#undef DEF_VEXTRACT_M
++
++// POWER10 (ISA 3.1) Vector Insert Word/Doubleword from GPR. VX-form:
++// VRT at bits 21..25, UIM at bits 16..20, RB at bits 11..15.
++#define DEF_VINS(op, max_uim) \
++ BufferOffset Assembler::as_##op(FloatRegister vrt, Register rb, \
++ uint8_t uim) { \
++ MOZ_ASSERT(uim <= (max_uim)); \
++ spew(#op "\tvr%u,%s,%u", uint32_t(vrt.encoding() & 31), rb.name(), \
++ uint32_t(uim)); \
++ return writeInst(PPC_##op | \
++ ((uint32_t(vrt.encoding()) & 31) << 21) | \
++ (uint32_t(uim) << 16) | \
++ (uint32_t(rb.code()) << 11)); \
++ }
++DEF_VINS(vinsw, 12)
++DEF_VINS(vinsd, 8)
++#undef DEF_VINS
++
++// POWER10 (ISA 3.1) Vector Insert byte/halfword from GPR with
++// register-supplied byte position. VX-form: VRT at bits 21..25,
++// RA at bits 16..20, RB at bits 11..15. "rx" is right-indexed
++// (LE-natural — index 0 = LSB byte).
++#define DEF_VINS_RX(op) \
++ BufferOffset Assembler::as_##op(FloatRegister vrt, Register ra, \
++ Register rb) { \
++ spew(#op "\tvr%u,%s,%s", uint32_t(vrt.encoding() & 31), ra.name(), \
++ rb.name()); \
++ return writeInst(PPC_##op | \
++ ((uint32_t(vrt.encoding()) & 31) << 21) | \
++ (uint32_t(ra.code()) << 16) | \
++ (uint32_t(rb.code()) << 11)); \
++ }
++DEF_VINS_RX(vinsbrx)
++DEF_VINS_RX(vinshrx)
++#undef DEF_VINS_RX
++
++// POWER9 (ISA 3.0) V-form 3-operand instructions with VRT, UIM, VRB at
++// bits 21..25, 16..20, 11..15 respectively (vinsert{b,h}, vextract{ub,uh}).
++// Simd128 lives in VSR32-63 (= VR0-31), so we mask VRT and VRB to the
++// 5-bit VR field via `encoding() & 31`.
++#define DEF_VRT_UIM_VRB(op, max_uim, uim_step) \
++ BufferOffset Assembler::as_##op(FloatRegister vrt, FloatRegister vrb, \
++ uint8_t uim) { \
++ MOZ_ASSERT(uim <= (max_uim)); \
++ MOZ_ASSERT((uim) % (uim_step) == 0); \
++ spew(#op "\tvr%u,vr%u,%u", uint32_t(vrt.encoding() & 31), \
++ uint32_t(vrb.encoding() & 31), uint32_t(uim)); \
++ return writeInst(PPC_##op | \
++ ((uint32_t(vrt.encoding()) & 31) << 21) | \
++ (uint32_t(uim) << 16) | \
++ ((uint32_t(vrb.encoding()) & 31) << 11)); \
++ }
++DEF_VRT_UIM_VRB(vinsertb, 15, 1)
++DEF_VRT_UIM_VRB(vinserth, 14, 2)
++DEF_VRT_UIM_VRB(vextractub, 15, 1)
++DEF_VRT_UIM_VRB(vextractuh, 14, 2)
++#undef DEF_VRT_UIM_VRB
++
++// VMX binary VX-form pack/merge (re-use DEF_VMX_VVV pattern).
++#define DEF_VMX_VVV(op) \
++ BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vra, uint8_t vrb) { \
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32); \
++ spew(#op "\tvr%d,vr%d,vr%d", vrt, vra, vrb); \
++ return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11); \
++ }
++DEF_VMX_VVV(vpkshss)
++DEF_VMX_VVV(vpkswss) DEF_VMX_VVV(vpkshus) DEF_VMX_VVV(vpkswus)
++ DEF_VMX_VVV(vmrghb)
++ DEF_VMX_VVV(vmrghh) DEF_VMX_VVV(vmrghw) DEF_VMX_VVV(vmrglb)
++ DEF_VMX_VVV(vmrglh) DEF_VMX_VVV(vmrglw) DEF_VMX_VVV(vmulesb)
++ DEF_VMX_VVV(vmulosb) DEF_VMX_VVV(vmuleub) DEF_VMX_VVV(vmuloub)
++ DEF_VMX_VVV(vmulesh) DEF_VMX_VVV(vmulosh)
++ DEF_VMX_VVV(vmuleuh) DEF_VMX_VVV(vmulouh)
++ DEF_VMX_VVV(vmulesw) DEF_VMX_VVV(vmulosw)
++ DEF_VMX_VVV(vmuleuw) DEF_VMX_VVV(vmulouw)
++#undef DEF_VMX_VVV
++
++ // vperm VA-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | VRC<<6 | XO
++ BufferOffset Assembler::as_vperm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++ uint8_t vrc) {
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++ spew("vperm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++ return writeInst(PPC_vperm | vrt << 21 | vra << 16 | vrb << 11 | vrc << 6);
++}
++
++// VA-form ternary VMX: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | VRC<<6 |
++// XO(6-bit)
++BufferOffset Assembler::as_vmladduhm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++ uint8_t vrc) {
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++ spew("vmladduhm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++ return writeInst(PPC_vmladduhm | vrt << 21 | vra << 16 | vrb << 11 |
++ vrc << 6);
++}
++
++BufferOffset Assembler::as_vmhraddshs(uint8_t vrt, uint8_t vra, uint8_t vrb,
++ uint8_t vrc) {
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++ spew("vmhraddshs\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++ return writeInst(PPC_vmhraddshs | vrt << 21 | vra << 16 | vrb << 11 |
++ vrc << 6);
++}
++
++BufferOffset Assembler::as_vmsumshm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++ uint8_t vrc) {
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++ spew("vmsumshm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++ return writeInst(PPC_vmsumshm | vrt << 21 | vra << 16 | vrb << 11 |
++ vrc << 6);
++}
++
++BufferOffset Assembler::as_vmsumuhm(uint8_t vrt, uint8_t vra, uint8_t vrb,
++ uint8_t vrc) {
++ MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
++ spew("vmsumuhm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
++ return writeInst(PPC_vmsumuhm | vrt << 21 | vra << 16 | vrb << 11 |
++ vrc << 6);
++}
++
++BufferOffset Assembler::as_vspltisb(uint8_t vrt, int8_t simm5) {
++ MOZ_ASSERT(vrt < 32);
++ MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
++ spew("vspltisb\tvr%d,%d", vrt, simm5);
++ return writeInst(PPC_vspltisb | uint32_t(vrt) << 21 |
++ (uint32_t(simm5) & 0x1F) << 16);
++}
++
++BufferOffset Assembler::as_vspltish(uint8_t vrt, int8_t simm5) {
++ MOZ_ASSERT(vrt < 32);
++ MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
++ spew("vspltish\tvr%d,%d", vrt, simm5);
++ return writeInst(PPC_vspltish | uint32_t(vrt) << 21 |
++ (uint32_t(simm5) & 0x1F) << 16);
++}
++
++BufferOffset Assembler::as_vspltisw(uint8_t vrt, int8_t simm5) {
++ MOZ_ASSERT(vrt < 32);
++ MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
++ spew("vspltisw\tvr%d,%d", vrt, simm5);
++ return writeInst(PPC_vspltisw | uint32_t(vrt) << 21 |
++ (uint32_t(simm5) & 0x1F) << 16);
++}
++
++// --- Convenience pseudo-instructions ---
++
++BufferOffset Assembler::xs_trap() {
++ spew("trap @ %08x", currentOffset());
++ return writeInst(PPC_trap);
++}
++
++BufferOffset Assembler::xs_trap_tagged(TrapTag tag) {
++ uint32_t tv = PPC_trap | ((uint8_t)tag << 16) | ((uint8_t)tag << 11);
++ spew("trap @ %08x ; MARK %d %08x", currentOffset(), (uint8_t)tag, tv);
++ return writeInst(tv);
++}
++
++BufferOffset Assembler::xs_mr(Register rd, Register ra) {
++ return as_or_(rd, ra, ra);
++}
++
++BufferOffset Assembler::xs_mtctr(Register ra) {
++ return as_mtspr((SPRegisterID)spr_ctr, ra);
++}
++
++BufferOffset Assembler::xs_mtlr(Register ra) {
++ return as_mtspr((SPRegisterID)spr_lr, ra);
++}
++
++BufferOffset Assembler::xs_mflr(Register rd) {
++ return as_mfspr(rd, (SPRegisterID)spr_lr);
++}
++
++BufferOffset Assembler::xs_mtcr(Register rs) { return as_mtcrf(0xff, rs); }
++
++BufferOffset Assembler::xs_mfxer(Register ra) {
++ return as_mfspr(ra, (SPRegisterID)spr_xer);
++}
++
++BufferOffset Assembler::xs_mtxer(Register ra) {
++ return as_mtspr((SPRegisterID)spr_xer, ra);
++}
++
++BufferOffset Assembler::xs_li(Register rd, int16_t im) {
++ return as_addi(rd, r0, im, true);
++}
++
++BufferOffset Assembler::xs_lis(Register rd, int16_t im) {
++ return as_addis(rd, r0, im, true);
++}
++
++BufferOffset Assembler::x_subi(Register rd, Register ra, int16_t im) {
++ return as_addi(rd, ra, -im);
++}
++
++BufferOffset Assembler::x_not(Register rd, Register ra) {
++ return as_nor(rd, ra, ra);
++}
++
++BufferOffset Assembler::x_slwi(Register rd, Register rs, int n) {
++ MOZ_ASSERT(n >= 0 && n < 32);
++ return as_rlwinm(rd, rs, n, 0, 31 - n);
++}
++
++BufferOffset Assembler::x_sldi(Register rd, Register rs, int n) {
++ return as_rldicr(rd, rs, n, 63 - n);
++}
++
++BufferOffset Assembler::x_srwi(Register rd, Register rs, int n) {
++ MOZ_ASSERT(n >= 0 && n < 32);
++ if (n == 0) {
++ return as_rlwinm(rd, rs, 0, 0, 31);
++ }
++ return as_rlwinm(rd, rs, 32 - n, n, 31);
++}
++
++BufferOffset Assembler::x_srdi(Register rd, Register rs, int n) {
++ MOZ_ASSERT(n >= 0 && n < 64);
++ if (n == 0) {
++ return as_or_(rd, rs, rs);
++ }
++ return as_rldicl(rd, rs, 64 - n, n);
++}
++
++BufferOffset Assembler::x_bit_value(Register rd, Register rs, unsigned bit) {
++ return as_rlwinm(rd, rs, bit + 1, 31, 31);
++}
++
++BufferOffset Assembler::x_insertbits0_15(Register rd, Register rs) {
++ return as_rlwimi(rd, rs, 0, 16, 31);
++}
++
++BufferOffset Assembler::x_sr_mulli(Register rd, Register ra, int16_t im) {
++ as_sradi(rd, ra, 63);
++ return as_mulli(rd, rd, im);
++}
++
++void Assembler::as_break(uint32_t code) {
++ spew("break\t%d", code);
++ writeInst(PPC_trap);
++}
++
++// ========================================================================
++// Label binding, retarget, and code label processing.
++// ========================================================================
++
++// Forward-declared shape helpers; full definitions and the layout
++// commentary live with the WriteLoad64Instructions section below.
++static bool IsAddpcisLoad64Stanza(uint32_t enc0);
++static uint8_t Load64StanzaDestReg(Instruction* inst0);
++
++InstImm Assembler::invertBranch(InstImm branch, BOffImm16 skipOffset) {
++ // Flip the BO condition-true/condition-false bit (bit 24).
++ uint32_t data = branch.encode();
++ data = (data ^ 0x01000000) & 0xFFFF0003;
++ data |= skipOffset.encode();
++ branch.setData(data);
++ return branch;
++}
++
++void Assembler::bind(InstImm* inst, uintptr_t branch, uintptr_t target) {
++ intptr_t offset = target - branch;
++ Instruction* i0 = (Instruction*)inst;
++
++ if (i0->next()->encode() == PPC_bcl_always_plus4 ||
++ IsAddpcisLoad64Stanza(i0->encode())) {
++ // Pre-existing long stanza, either P8 (mflr + bcl marker at [1]) or
++ // P9+ (addpcis at [0]; major opcode 19). Either way, just register
++ // the long jump — the stanza's .quad at [6..7] gets patched later
++ // via UpdateLoad64Value.
++ addLongJump(BufferOffset(branch), BufferOffset(target));
++ return;
++ }
++
++ if (i0->isOpcode((uint32_t)PPC_tw)) {
++ // Tagged trap stanza. The tag tells us which branch type was reserved.
++ TrapTag tag = (TrapTag)inst->traptag();
++ Instruction* i1 = i0->next();
++ Instruction* i2 = i1->next();
++ Instruction* i3 = i2->next();
++ Instruction* i4 = i3->next();
++ Instruction* i5 = i4->next();
++ Instruction* i6 = i5->next();
++ Instruction* i7 = i6->next();
++ Instruction* i8 = i7->next();
++ Instruction* i9 = i8->next();
++
++ switch (tag) {
++ case BCTag: {
++ // inst[-1] is the original bc instruction.
++ Instruction* bc = i0 - 1;
++ // Try short bc (offset + 4 because bc is one instruction before tw).
++ if (BOffImm16::IsInRange(offset + (intptr_t)sizeof(uint32_t))) {
++ bc->setData(((bc->encode() ^ 0x01000000) & 0xFFFF0003) |
++ BOffImm16(offset + sizeof(uint32_t)).encode());
++ i0->makeNop();
++ i1->makeNop();
++ i2->makeNop();
++ i3->makeNop();
++ i4->makeNop();
++ i5->makeNop();
++ i6->makeNop();
++ i7->makeNop();
++ i8->makeNop();
++ i9->makeNop();
++ return;
++ }
++ // Try short b (unconditional).
++ if (JOffImm26::IsInRange(offset)) {
++ i0->setData(PPC_b | JOffImm26(offset).encode());
++ i1->makeNop();
++ i2->makeNop();
++ i3->makeNop();
++ i4->makeNop();
++ i5->makeNop();
++ i6->makeNop();
++ i7->makeNop();
++ i8->makeNop();
++ i9->makeNop();
++ return;
++ }
++ // Long: WriteLoad64 to SecondScratchReg + mtctr + bctr.
++ addLongJump(BufferOffset(branch), BufferOffset(target));
++ WriteLoad64Instructions(i0, SecondScratchReg,
++ LabelBase::INVALID_OFFSET);
++ i8->makeOp_mtctr(SecondScratchReg);
++ i9->makeOp_bctr();
++ break;
++ }
++ case CallTag: {
++ // For calls, the actual call instruction goes at inst[9] and
++ // the return address must be after the stanza.
++ intptr_t callOffset = offset - 9 * (intptr_t)sizeof(uint32_t);
++ if (JOffImm26::IsInRange(callOffset)) {
++ i0->makeNop();
++ i1->makeNop();
++ i2->makeNop();
++ i3->makeNop();
++ i4->makeNop();
++ i5->makeNop();
++ i6->makeNop();
++ i7->makeNop();
++ i8->makeNop();
++ i9->setData(PPC_b | JOffImm26(callOffset).encode() | LinkB);
++ return;
++ }
++ // Long: WriteLoad64 to SecondScratchReg + mtctr + bctrl.
++ addLongJump(BufferOffset(branch), BufferOffset(target));
++ WriteLoad64Instructions(i0, SecondScratchReg,
++ LabelBase::INVALID_OFFSET);
++ i8->makeOp_mtctr(SecondScratchReg);
++ i9->makeOp_bctr(LinkB);
++ break;
++ }
++ case BTag: {
++ if (JOffImm26::IsInRange(offset)) {
++ i0->setData(PPC_b | JOffImm26(offset).encode());
++ i1->makeNop();
++ i2->makeNop();
++ i3->makeNop();
++ i4->makeNop();
++ i5->makeNop();
++ i6->makeNop();
++ i7->makeNop();
++ i8->makeNop();
++ i9->makeNop();
++ return;
++ }
++ // Long: WriteLoad64 to SecondScratchReg + mtctr + bctr.
++ addLongJump(BufferOffset(branch), BufferOffset(target));
++ WriteLoad64Instructions(i0, SecondScratchReg,
++ LabelBase::INVALID_OFFSET);
++ i8->makeOp_mtctr(SecondScratchReg);
++ i9->makeOp_bctr();
++ break;
++ }
++ default:
++ MOZ_CRASH("Unexpected TrapTag");
++ }
++ return;
++ }
++
++ if (i0->isOpcode(PPC_b)) {
++ // Short unconditional branch — set offset, nop next-in-chain slot.
++ MOZ_ASSERT(JOffImm26::IsInRange(offset));
++ i0->setData((i0->encode() & ~0x03FFFFFC) | JOffImm26(offset).encode());
++ i0->next()->makeNop();
++ return;
++ }
++
++ if (i0->isOpcode(PPC_bc)) {
++ // Short conditional branch — preserve upper 16 bits, set offset.
++ MOZ_ASSERT(BOffImm16::IsInRange(offset));
++ i0->setData((i0->encode() & 0xFFFF0003) | BOffImm16(offset).encode());
++ i0->next()->makeNop();
++ return;
++ }
++
++ MOZ_CRASH("Unexpected instruction in bind");
++}
++
++void Assembler::bind(Label* label, BufferOffset boff) {
++ if (label->used()) {
++ bool more;
++ BufferOffset b(label);
++ do {
++ BufferOffset next;
++ InstImm* inst = (InstImm*)editSrc(b);
++ Instruction* i1 = ((Instruction*)inst)->next();
++ more = (i1->encode() != LabelBase::INVALID_OFFSET);
++ if (more) {
++ next = BufferOffset(i1->encode());
++ }
++ bind(inst, b.getOffset(), boff.getOffset());
++ b = next;
++ } while (more);
++ }
++ label->bind(boff.getOffset());
++}
++
++void Assembler::retarget(Label* label, Label* target) {
++ spew("retarget");
++ if (label->used() && !oom()) {
++ if (target->bound()) {
++ bind(label, BufferOffset(target));
++ } else if (target->used()) {
++ // Prepend label's use chain to target's use chain.
++ BufferOffset b(label);
++ BufferOffset next;
++ do {
++ Instruction* inst = (Instruction*)editSrc(b);
++ Instruction* i1 = inst->next();
++ if (i1->encode() != LabelBase::INVALID_OFFSET) {
++ next = BufferOffset(i1->encode());
++ } else {
++ // End of label's chain — link to target's head.
++ i1->setData(target->offset());
++ break;
++ }
++ b = next;
++ } while (true);
++ }
++ // Transfer label's use list to target.
++ if (!target->bound()) {
++ target->use(label->offset());
++ }
++ }
++ label->reset();
++}
++
++void Assembler::processCodeLabels(uint8_t* rawCode) {
++ for (const CodeLabel& label : codeLabels_) {
++ Bind(rawCode, label);
++ }
++}
++
++// ========================================================================
++// Load64 instruction sequence (8 slots, literal pool format):
++// [0] mflr r0 — save LR
++// [1] bcl 20,0,.+4 — LR = address of [2]
++// [2] mflr rD — rD = address of [2]
++// [3] mtlr r0 — restore LR
++// [4] ld rD, 16(rD) — load from [6..7] (offset = 24 - 8 = 16)
++// [5] b .+12 — skip data
++// [6..7] .quad VALUE — 8-byte data
++// ========================================================================
++
++// ========================================================================
++// Constant pool callbacks (required by AssemblerBufferWithConstantPools).
++// ========================================================================
++
++/* static */
++void Assembler::InsertIndexIntoTag(uint8_t* load, uint32_t index) {
++ // Stash the pool entry index in the hint word's low 16 bits; the high
++ // bits carry the dest reg and load type, consumed by
++ // PatchConstantPoolLoad when the pool is resolved.
++ uint32_t* inst = (uint32_t*)load;
++ *inst = (*inst & 0xFFFF0000) | (index & 0xFFFF);
++}
++
++/* static */
++bool Assembler::PatchConstantPoolLoad(void* loadAddr, void* constPoolAddr) {
++ // Rewrite placeholder instructions with a pool load sequence.
++ // Hint word layout (set by loadFromPoolFloat64 / loadFromPoolFloat32 /
++ // loadFromPoolSimd128):
++ // bits 0-15: pool entry index
++ // bits 16-20: destination register (FPR encoding)
++ // bits 21-22: load type (PoolLoadFPR64, PoolLoadSimd128, PoolLoadFPR32)
++ // bits 28-31: sentinel 0xF
++
++ uint32_t* inst = (uint32_t*)loadAddr;
++
++ uint32_t hint = inst[0];
++ uint32_t index = hint & 0xFFFF;
++ uint32_t destReg = (hint >> 16) & 0x1F;
++ uint32_t loadType = (hint >> 21) & 0x3;
++
++ // Displacement: pool entry address relative to inst[1] (mflr target) for the
++ // bcl path, or relative to inst[0]+4 (addpcis target = CIA+4, which is the
++ // address of inst[1]) for the addpcis path. Both conventions resolve to the
++ // same value: (pool entry) − (loadAddr + 4).
++ int32_t displacement =
++ (int32_t)((uint8_t*)constPoolAddr + index * 4 - ((uint8_t*)loadAddr + 4));
++
++ if (loadType == PoolLoadFPR64 || loadType == PoolLoadFPR32) {
++ // Three emission paths:
++ //
++ // POWER10 (preferred): plfd/plfs FRT, SI(0), R=1 — single PC-relative
++ // prefixed FP load. 8 bytes = 2 slots; slot 2 becomes a nop. If
++ // loadAddr % 64 == 60, plfd would straddle a 64-byte block, so emit
++ // a leading nop at slot 0 and place plfd at slots 1-2 instead.
++ // Reach: ±8 GB (34-bit signed). No LR clobber, no r16 base.
++ //
++ // POWER9: addpcis + lfd/lfs + nop. 2 real insns, no LR clobber, no
++ // Return Address Stack corruption. Base register is r16.
++ // Displacement splits into (hi << 16) + lo where lo is the 16-bit
++ // signed D-field of lfd/lfs. Reach: ±2 GB.
++ //
++ // POWER8: bcl + mflr r16 + lfd/lfs. Same clobber + RAS caveat as before.
++ // Kept as a correctness fallback; not exercised today because the
++ // loadConstantDouble/Float32 wrappers skip the pool on POWER8.
++ //
++ // lfs/plfs (32-bit) auto-expand their result to double-precision in the
++ // FPR, replacing the non-pool path's separate xscvspdpn step.
++ uint32_t baseReg = SavedScratchRegister.code();
++ uint32_t loadOp = (loadType == PoolLoadFPR64) ? PPC_lfd : PPC_lfs;
++
++ if (HasPOWER10()) {
++ // MLS prefixed FP load. plfd suffix opcode = 50, plfs = 48. Same
++ // alignment-driven slot placement as PoolLoadSimd128 above.
++ uint64_t loadAddrBits = reinterpret_cast<uint64_t>(loadAddr);
++ // loadAddr is the buffer-time pointer; the final executable base is
++ // only 16-byte aligned, so the unsafe straddle is when
++ // (loadAddrBits & 15) == 12 (matches ensurePrefixedAlignment above).
++ bool needLeadingNop = (loadAddrBits & 15) == 12;
++ int prefixSlot = needLeadingNop ? 1 : 0;
++ int prefixByteOffset = prefixSlot * 4;
++ int64_t SI = int64_t(displacement) + 4 - prefixByteOffset;
++ MOZ_ASSERT(SI >= -(int64_t(1) << 33) && SI < (int64_t(1) << 33));
++ uint32_t d0 = uint32_t((uint64_t(SI) >> 16) & 0x3FFFFu);
++ uint32_t d1 = uint32_t(uint64_t(SI) & 0xFFFFu);
++ // Type 2 (MLS), R=1, RA=0.
++ uint32_t prefix =
++ (1u << 26) | (2u << 24) | (1u << 20) | (d0 & 0x3FFFFu);
++ uint32_t suffixOp = (loadType == PoolLoadFPR64) ? 50u : 48u;
++ uint32_t suffix = (suffixOp << 26) | (destReg << 21) | d1;
++
++ if (needLeadingNop) {
++ inst[0] = NopInst;
++ inst[1] = prefix;
++ inst[2] = suffix;
++ } else {
++ inst[0] = prefix;
++ inst[1] = suffix;
++ inst[2] = NopInst;
++ }
++ } else if (HasPOWER9()) {
++ // Split displacement into addpcis hi field and lfd/lfs lo field so that
++ // target = (CIA + 4) + (hi << 16) + SEXT16(lo).
++ // Only 2 slots are reserved on P9 (loadFromPoolFloat{32,64} above);
++ // do NOT touch inst[2], it belongs to the next entry.
++ int16_t lo = (int16_t)(displacement & 0xFFFF);
++ int32_t hiAdj = displacement - lo;
++ MOZ_ASSERT((hiAdj & 0xFFFF) == 0);
++ int32_t hi = hiAdj >> 16;
++ MOZ_ASSERT(hi >= -32768 && hi <= 32767);
++ // [0] addpcis r16, hi
++ uint32_t Dhi = uint16_t(hi);
++ inst[0] = (19u << 26) | (baseReg << 21) | ((Dhi >> 1) & 0x1F) << 16 |
++ ((Dhi >> 6) & 0x3FF) << 6 | (2u << 1) | (Dhi & 1u);
++ // [1] lfd/lfs fD, lo(r16)
++ inst[1] = loadOp | (destReg << 21) | (baseReg << 16) | (uint16_t(lo));
++ } else {
++ MOZ_ASSERT(displacement >= -32768 && displacement < 32768);
++ // [0] bcl 20,0,$+4
++ inst[0] = PPC_bcl_always_plus4;
++ // [1] mflr r16
++ inst[1] = PPC_mfspr | (baseReg << 21) | PPC_SPR(spr_lr);
++ // [2] lfd/lfs fD, displacement(r16)
++ inst[2] =
++ loadOp | (destReg << 21) | (baseReg << 16) | (displacement & 0xFFFF);
++ }
++ } else if (loadType == PoolLoadSimd128) {
++ // Three emission paths (5 slots reserved by loadFromPoolSimd128):
++ //
++ // POWER10 (preferred): plxv vsD, SI(0), R=1 — single PC-relative
++ // prefixed load, natural-LE byte order (no xxpermdi needed). 8 bytes
++ // = 2 slots; slots 2-4 become nops. If the prefix would straddle a
++ // 64-byte block (loadAddr % 64 == 60), emit a leading nop at slot 0
++ // and place plxv at slots 1-2 instead. Reach: ±8 GB (34-bit signed).
++ //
++ // POWER9: addpcis-equivalent via bcl + mflr + addi + lxvx + nop. 5
++ // real insns, natural LE.
++ //
++ // POWER8: same prelude + lxvd2x + xxpermdi (BE-DW byte-swap fixup).
++ //
++ // See PoolLoadFPR64 above for why r16 instead of r12.
++ MOZ_ASSERT(displacement >= -32768 && displacement < 32768);
++ // Simd128 dest is in VR-namespace (encoding 32-63). Hint stores only
++ // the low 5 bits (loadFromPoolSimd128 masks); we set TX unconditionally
++ // since PoolLoadSimd128 always targets a Simd128.
++ constexpr uint32_t kTX = 1u;
++ constexpr uint32_t kAxBxTx_xxpermdi = (1u << 2) | (1u << 1) | 1u;
++
++ if (HasPOWER10()) {
++ // Place plxv prefix at the highest 4-byte-aligned offset within
++ // the 5 reserved slots that doesn't straddle a 64-byte block.
++ uint64_t loadAddrBits = reinterpret_cast<uint64_t>(loadAddr);
++ // loadAddr is the buffer-time pointer; the final executable base is
++ // only 16-byte aligned, so the unsafe straddle is when
++ // (loadAddrBits & 15) == 12 (matches ensurePrefixedAlignment above).
++ bool needLeadingNop = (loadAddrBits & 15) == 12;
++ int prefixSlot = needLeadingNop ? 1 : 0;
++ int prefixByteOffset = prefixSlot * 4;
++ // SI = (pool entry addr) - (prefix addr)
++ // = (loadAddr + 4 + displacement) - (loadAddr + prefixByteOffset)
++ // = displacement + 4 - prefixByteOffset
++ int64_t SI = int64_t(displacement) + 4 - prefixByteOffset;
++ MOZ_ASSERT(SI >= -(int64_t(1) << 33) && SI < (int64_t(1) << 33));
++ uint32_t d0 = uint32_t((uint64_t(SI) >> 16) & 0x3FFFFu);
++ uint32_t d1 = uint32_t(uint64_t(SI) & 0xFFFFu);
++ // Prefix: primary opcode 1, Type 0 (8LS), R=1, d0 at LE bits 17..0.
++ uint32_t prefix =
++ (1u << 26) | (0u << 24) | (1u << 20) | (d0 & 0x3FFFFu);
++ // Suffix: 5-bit opcode 25 at LE 31..27, TX at LE 26, T at LE 25..21,
++ // RA=0 at LE 20..16, d1 at LE 15..0.
++ uint32_t suffix = (25u << 27) | (kTX << 26) | (destReg << 21) | d1;
++
++ // P10 reserves 3 slots; only inst[0..2] are written. Slots 3..4
++ // belong to the next pool entry on P10.
++ if (needLeadingNop) {
++ inst[0] = NopInst;
++ inst[1] = prefix;
++ inst[2] = suffix;
++ } else {
++ inst[0] = prefix;
++ inst[1] = suffix;
++ inst[2] = NopInst;
++ }
++ } else if (HasPOWER9()) {
++ // addpcis + addi + lxvx (3 slots) — no LR clobber, no RAS hazard.
++ // Same displacement split as the FP scalar P9 path: target =
++ // (CIA+4) + (hi << 16) + SEXT16(lo). lxvx is X-form indexed (no
++ // immediate offset), so combine the low 16 bits into r16 via addi
++ // before the load.
++ int16_t lo = (int16_t)(displacement & 0xFFFF);
++ int32_t hiAdj = displacement - lo;
++ MOZ_ASSERT((hiAdj & 0xFFFF) == 0);
++ int32_t hi = hiAdj >> 16;
++ MOZ_ASSERT(hi >= -32768 && hi <= 32767);
++ uint32_t Dhi = uint16_t(hi);
++ uint32_t baseReg = SavedScratchRegister.code();
++ // [0] addpcis r16, hi
++ inst[0] = (19u << 26) | (baseReg << 21) | ((Dhi >> 1) & 0x1F) << 16 |
++ ((Dhi >> 6) & 0x3FF) << 6 | (2u << 1) | (Dhi & 1u);
++ // [1] addi r16, r16, lo
++ inst[1] = PPC_addi | (baseReg << 21) | (baseReg << 16) | uint16_t(lo);
++ // [2] lxvx vsD, 0, r16 (XT[0:4] in bits 21-25, TX at bit 0)
++ inst[2] = PPC_lxvx | (destReg << 21) | (baseReg << 11) | kTX;
++ } else {
++ // P8 fallback: bcl + mflr + addi + lxvd2x + xxpermdi (5 slots).
++ // Clobbers LR; correctness-only path.
++ uint32_t baseReg = SavedScratchRegister.code();
++ inst[0] = PPC_bcl_always_plus4;
++ inst[1] = PPC_mfspr | (baseReg << 21) | PPC_SPR(spr_lr);
++ inst[2] = PPC_addi | (baseReg << 21) | (baseReg << 16) |
++ (displacement & 0xFFFF);
++ // lxvd2x XT, RA=0, RB=r16 — loads in BE order on LE.
++ inst[3] = PPC_lxvd2x | (destReg << 21) | (baseReg << 11) | kTX;
++ // xxpermdi XT, XT, XT, 2 — swap doublewords for LE byte order.
++ inst[4] = PPC_xxpermdi | (destReg << 21) | (destReg << 16) |
++ (destReg << 11) | (2u << 8) | kAxBxTx_xxpermdi;
++ }
++ } else {
++ MOZ_CRASH("PatchConstantPoolLoad: unsupported load type");
++ }
++
++ return false;
++}
++
++/* static */
++void Assembler::WritePoolGuard(BufferOffset branch, Instruction* inst,
++ BufferOffset dest) {
++ // Emit an unconditional branch over the pool data.
++ int32_t offset = dest.getOffset() - branch.getOffset();
++ MOZ_ASSERT(JOffImm26::IsInRange(offset));
++ inst->setData(PPC_b | (offset & 0x03FFFFFC));
++}
++
++/* static */
++void Assembler::WritePoolHeader(uint8_t* start, Pool* p, bool isNatural) {
++ // Write pool identification header.
++ // Encode pool size and isNatural flag in a single 32-bit word.
++ uint32_t poolSize = p->getPoolSize();
++ uint32_t sizeInWords = (poolSize + 4 + 3) >> 2; // header + data, in words
++ MOZ_ASSERT(sizeInWords < (1 << 15));
++ uint32_t header = (sizeInWords & 0x7FFF) | (isNatural ? (1 << 15) : 0) |
++ 0xFFFF0000; // sentinel
++ *(uint32_t*)start = header;
++}
++
++/* static */
++void Assembler::PatchShortRangeBranchToVeneer(PPCBuffer*, unsigned rangeIdx,
++ BufferOffset deadline,
++ BufferOffset veneer) {
++ // PPC64 does not use short-range branch tracking (NumShortBranchRanges = 0).
++ MOZ_CRASH("PatchShortRangeBranchToVeneer: should not be called");
++}
++
++// Two stanza shapes share the same 8-slot footprint and the same .quad
++// location at slots [6..7] (so ExtractLoad64Value / UpdateLoad64Value are
++// shape-agnostic):
++//
++// POWER8 (no addpcis):
++// [0] mflr r0
++// [1] bcl 20,0,.+4 (LR := pc of [2])
++// [2] mflr rD
++// [3] mtlr r0
++// [4] ld rD, 16(rD)
++// [5] b .+12
++// [6..7] .quad VALUE
++//
++// POWER9+ (addpcis):
++// [0] addpcis rD, 0 (rD := NIA = pc of [1])
++// [1] ld rD, 20(rD) (rD := mem[pc_of_[1] + 20] = mem[slot[6]])
++// [2] b .+24
++// [3..5] NOP, NOP, NOP
++// [6..7] .quad VALUE
++//
++// The P9+ form drops the bcl/mflr/mtlr LR-bounce (no RAS thrash) and runs
++// 2 dynamic insns instead of 6. Distinguished at patch time by inst[0]'s
++// major opcode: 31 = mfspr (P8) vs 19 = addpcis (P9+).
++static bool IsAddpcisLoad64Stanza(uint32_t enc0) {
++ return ((enc0 >> 26) & 0x3f) == 19;
++}
++
++// Extract the destination register from a load64 stanza in either shape.
++// P8 stores rD in `mflr rD` at slot [2]; P9+ stores rD in `addpcis rD, 0`
++// at slot [0]. Both encode RT at LE bits [21..25].
++static uint8_t Load64StanzaDestReg(Instruction* inst0) {
++ if (IsAddpcisLoad64Stanza(inst0->encode())) {
++ return (inst0[0].encode() >> 21) & 0x1f;
++ }
++ return (inst0[2].encode() >> 21) & 0x1f;
++}
++
++/* static */
++void Assembler::WriteLoad64Instructions(Instruction* inst0, Register reg,
++ uint64_t value) {
++ Instruction* i1 = inst0->next();
++ Instruction* i2 = i1->next();
++ Instruction* i3 = i2->next();
++ Instruction* i4 = i3->next();
++ Instruction* i5 = i4->next();
++ Instruction* i6 = i5->next();
++ Instruction* i7 = i6->next();
++
++ if (HasPOWER9()) {
++ // [0] addpcis rD, 0 (DX-form: opcode=19, XO=2, all D fields = 0)
++ inst0->setData(0x4C000004u | (uint32_t(reg.code()) << 21));
++ // [1] ld rD, 20(rD) (rD := *(slot[1] + 20) = *(slot[6]) = .quad)
++ i1->setData(PPC_ld | (uint32_t(reg.code()) << 21) |
++ (uint32_t(reg.code()) << 16) | 20);
++ // [2] b .+24 (skip slots [3..7] to land at slot [8])
++ i2->setData(PPC_b | (24 & 0x03FFFFFC));
++ // [3..5] NOP filler — unreachable but kept aligned for the patcher.
++ i3->setData(NopInst);
++ i4->setData(NopInst);
++ i5->setData(NopInst);
++ } else {
++ // [0] mflr r0
++ inst0->setData(PPC_mfspr | (r0.code() << 21) | PPC_SPR(spr_lr));
++ // [1] bcl 20,0,.+4
++ i1->setData(PPC_bcl_always_plus4);
++ // [2] mflr rD
++ i2->setData(PPC_mfspr | (reg.code() << 21) | PPC_SPR(spr_lr));
++ // [3] mtlr r0
++ i3->setData(PPC_mtspr | (r0.code() << 21) | PPC_SPR(spr_lr));
++ // [4] ld rD, 16(rD)
++ i4->setData(PPC_ld | (reg.code() << 21) | (reg.code() << 16) | 16);
++ // [5] b .+12
++ i5->setData(PPC_b | (12 & 0x03FFFFFC));
++ }
++
++ // [6..7] .quad VALUE (low 32 at lower addr, high 32 at higher addr).
++ i6->setData((uint32_t)(value & 0xFFFFFFFF));
++ i7->setData((uint32_t)(value >> 32));
++}
++
++/* static */
++uint64_t Assembler::ExtractLoad64Value(Instruction* inst0) {
++ // The 8-byte value is at inst0[6..7] in both shapes.
++ Instruction* i6 = inst0 + 6;
++ Instruction* i7 = inst0 + 7;
++
++ uint64_t lo = (uint64_t)i6->encode(); // low 32 at lower addr
++ uint64_t hi = (uint64_t)i7->encode(); // high 32 at higher addr
++ return (hi << 32) | lo;
++}
++
++/* static */
++void Assembler::UpdateLoad64Value(Instruction* inst0, uint64_t value) {
++ // Sanity-check that inst0 is the start of a load64 stanza in either shape.
++ // P8: inst0[1] == bcl 20,0,.+4. P9+: inst0[0] is addpcis (major opcode 19).
++ MOZ_ASSERT(inst0[1].encode() == PPC_bcl_always_plus4 ||
++ IsAddpcisLoad64Stanza(inst0->encode()),
++ "UpdateLoad64Value: inst0 is not a load64 stanza");
++
++ // .quad lives at inst0[6..7] in both shapes.
++ Instruction* i6 = inst0 + 6;
++ Instruction* i7 = inst0 + 7;
++
++ i6->setData((uint32_t)(value & 0xFFFFFFFF)); // low 32 at lower addr
++ i7->setData((uint32_t)(value >> 32)); // high 32 at higher addr
++}
++
++// ========================================================================
++// Patching and toggle operations.
++// ========================================================================
++
++/* static */
++uint32_t Assembler::PatchWrite_NearCallSize() {
++ // 8 instructions for Load64 + mtctr + bctrl = 10 instructions.
++ return 10 * sizeof(uint32_t);
++}
++
++/* static */
++void Assembler::PatchWrite_NearCall(CodeLocationLabel start,
++ CodeLocationLabel toCall) {
++ Instruction* inst = (Instruction*)start.raw();
++ uint8_t* dest = toCall.raw();
++
++ Assembler::WriteLoad64Instructions(inst, SavedScratchRegister,
++ (uint64_t)dest);
++ inst[8].makeOp_mtctr(SavedScratchRegister);
++ inst[9].makeOp_bctr(LinkB);
++ FlushICache(inst, 10 * sizeof(Instruction));
++}
++
++/* static */
++void Assembler::PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm) {
++ uint32_t* l = (uint32_t*)label.raw();
++ *(l - 1) = imm.value;
++ FlushICache(l - 1, sizeof(uint32_t));
++}
++
++void Assembler::PatchDataWithValueCheck(CodeLocationLabel label,
++ ImmPtr newValue, ImmPtr expectedValue) {
++ PatchDataWithValueCheck(label, PatchedImmPtr(newValue.value),
++ PatchedImmPtr(expectedValue.value));
++}
++
++void Assembler::PatchDataWithValueCheck(CodeLocationLabel label,
++ PatchedImmPtr newValue,
++ PatchedImmPtr expectedValue) {
++ Instruction* inst = (Instruction*)label.raw();
++
++ DebugOnly<uint64_t> value = Assembler::ExtractLoad64Value(inst);
++ MOZ_ASSERT(value == uint64_t(expectedValue.value));
++
++ Assembler::UpdateLoad64Value(inst, uint64_t(newValue.value));
++ FlushICache(inst, 8 * sizeof(Instruction));
++}
++
++// ToggleCall toggles the call portion of a toggledCall stanza.
++// Layout: 8 load64 instructions + mtctr + bctrl (10 total).
++// We toggle the last two instructions (mtctr/bctrl vs nop/nop).
++// The destination register is extracted via Load64StanzaDestReg, which
++// handles both the P8 (mflr-rD at slot [2]) and P9+ (addpcis-rD at slot
++// [0]) shapes.
++
++/* static */
++void Assembler::ToggleCall(CodeLocationLabel inst_, bool enabled) {
++ Instruction* i0 = (Instruction*)inst_.raw();
++ Instruction* i8 = (Instruction*)(inst_.raw() + 8 * sizeof(uint32_t));
++ Instruction* i9 = (Instruction*)(inst_.raw() + 9 * sizeof(uint32_t));
++
++ // Accept either P8 stanza (mflr r0 at slot [0]) or P9+ stanza (addpcis at
++ // slot [0]; major opcode 19).
++ MOZ_ASSERT(i0->encode() == (PPC_mfspr | (r0.code() << 21) | PPC_SPR(spr_lr)) ||
++ IsAddpcisLoad64Stanza(i0->encode()));
++
++ // ToggleCall is idempotent across the same `enabled` value: re-enabling
++ // an already-enabled site (or re-disabling a disabled one) is a no-op.
++ // Mozilla's debugger machinery may legitimately toggle the same call site
++ // multiple times in the same direction (e.g. setting both a breakpoint
++ // and a frame.onStep on the same script).
++ Register scratch = Register::FromCode(Load64StanzaDestReg(i0));
++ uint32_t mtctr = PPC_mtspr | (scratch.code() << 21) | PPC_SPR(spr_ctr);
++ uint32_t bctrl = (uint32_t)PPC_bctr | (uint32_t)LinkB;
++ if (enabled) {
++ MOZ_ASSERT(i8->encode() == NopInst || i8->encode() == mtctr);
++ MOZ_ASSERT(i9->encode() == NopInst || i9->encode() == bctrl);
++ i8->setData(mtctr);
++ i9->setData(bctrl);
++ } else {
++ MOZ_ASSERT(i8->encode() == NopInst || i8->encode() == mtctr);
++ MOZ_ASSERT(i9->encode() == NopInst || i9->encode() == bctrl);
++ i8->setData(NopInst);
++ i9->setData(NopInst);
++ }
++ FlushICache(i8, 2 * sizeof(Instruction));
++}
++
++// toggledJump emits a trap stanza via jump(label). After binding, the first
++// instruction becomes "b offset" (short branch). We toggle between b and ori:
++// b offset: [010010][LI:24][0][0]
++// ori r0,r0,imm: [011000][00000][00000][UI:16]
++// For short forward jumps (offset < 64KB), bits 25:16 of LI are 0, so
++// swapping the opcode preserves the offset in the lower 16 bits.
++// ori r0,r0,X is effectively a nop (writes to r0).
++
++/* static */
++void Assembler::ToggleToJmp(CodeLocationLabel inst_) {
++ Instruction* inst = (Instruction*)inst_.raw();
++ MOZ_ASSERT(inst->isOpcode(PPC_ori));
++ // Verify RS=0 and RA=0 (r0).
++ MOZ_ASSERT((inst->encode() & 0x03E00000) == 0);
++ MOZ_ASSERT((inst->encode() & 0x001F0000) == 0);
++ // Swap opcode from ori (011000) to b (010010).
++ uint32_t encoding = inst->encode();
++ encoding = (encoding & 0x03FFFFFF) | (uint32_t)PPC_b;
++ inst->setData(encoding);
++ FlushICache(inst, sizeof(Instruction));
++}
++
++/* static */
++void Assembler::ToggleToCmp(CodeLocationLabel inst_) {
++ Instruction* inst = (Instruction*)inst_.raw();
++ MOZ_ASSERT(inst->isOpcode(PPC_b));
++ // Verify short forward branch: upper LI bits (25:16) are 0, AA=0, LK=0.
++ MOZ_ASSERT((inst->encode() & 0x03FF0003) == 0);
++ // Swap opcode from b (010010) to ori (011000).
++ uint32_t encoding = inst->encode();
++ encoding = (encoding & 0x03FFFFFF) | (uint32_t)PPC_ori;
++ inst->setData(encoding);
++ FlushICache(inst, sizeof(Instruction));
++}
++
++// ========================================================================
++// Bind, tracing, and pointer extraction.
++// ========================================================================
++
++void Assembler::Bind(uint8_t* rawCode, const CodeLabel& label) {
++ if (label.patchAt().bound()) {
++ auto mode = label.linkMode();
++ intptr_t offset = label.patchAt().offset();
++ intptr_t target = label.target().offset();
++
++ if (mode == CodeLabel::RawPointer) {
++ *reinterpret_cast<const void**>(rawCode + offset) = rawCode + target;
++ } else {
++ MOZ_ASSERT(mode == CodeLabel::MoveImmediate ||
++ mode == CodeLabel::JumpImmediate);
++ Instruction* inst = (Instruction*)(rawCode + offset);
++ Assembler::UpdateLoad64Value(inst, (uint64_t)(rawCode + target));
++ }
++ }
++}
++
++uintptr_t Assembler::GetPointer(uint8_t* instPtr) {
++ Instruction* inst = (Instruction*)instPtr;
++ return Assembler::ExtractLoad64Value(inst);
++}
++
++static JitCode* CodeFromJump(Instruction* jump) {
++ uint8_t* target = (uint8_t*)Assembler::ExtractLoad64Value(jump);
++ return JitCode::FromExecutable(target);
++}
++
++void Assembler::TraceJumpRelocations(JSTracer* trc, JitCode* code,
++ CompactBufferReader& reader) {
++ while (reader.more()) {
++ JitCode* child =
++ CodeFromJump((Instruction*)(code->raw() + reader.readUnsigned()));
++ TraceManuallyBarrieredEdge(trc, &child, "rel32");
++ }
++}
++
++static void TraceOneDataRelocation(JSTracer* trc,
++ mozilla::Maybe<AutoWritableJitCode>& awjc,
++ JitCode* code, Instruction* inst) {
++ void* ptr = (void*)Assembler::ExtractLoad64Value(inst);
++ void* prior = ptr;
++
++ uintptr_t word = reinterpret_cast<uintptr_t>(ptr);
++ if (word >> JSVAL_TAG_SHIFT) {
++ Value v = Value::fromRawBits(word);
++ TraceManuallyBarrieredEdge(trc, &v, "jit-masm-value");
++ ptr = (void*)v.bitsAsPunboxPointer();
++ } else {
++ TraceManuallyBarrieredGenericPointerEdge(
++ trc, reinterpret_cast<gc::Cell**>(&ptr), "jit-masm-ptr");
++ }
++
++ if (ptr != prior) {
++ if (awjc.isNothing()) {
++ awjc.emplace(code);
++ }
++ Assembler::UpdateLoad64Value(inst, uint64_t(ptr));
++ }
++}
++
++/* static */
++void Assembler::TraceDataRelocations(JSTracer* trc, JitCode* code,
++ CompactBufferReader& reader) {
++ mozilla::Maybe<AutoWritableJitCode> awjc;
++ while (reader.more()) {
++ size_t offset = reader.readUnsigned();
++ Instruction* inst = (Instruction*)(code->raw() + offset);
++ TraceOneDataRelocation(trc, awjc, code, inst);
++ }
++}
++
++/* static */
++uint8_t* Assembler::NextInstruction(uint8_t* instruction, uint32_t* count) {
++ if (count != nullptr) {
++ *count += sizeof(Instruction);
++ }
++ return instruction + sizeof(Instruction);
++}
++
++// ========================================================================
++// UseScratchRegisterScope implementation.
++// ========================================================================
++
++UseScratchRegisterScope::UseScratchRegisterScope(Assembler& assembler)
++ : available_(assembler.GetScratchRegisterList()),
++ old_available_(*available_) {}
++
++UseScratchRegisterScope::UseScratchRegisterScope(Assembler* assembler)
++ : available_(assembler->GetScratchRegisterList()),
++ old_available_(*available_) {}
++
++UseScratchRegisterScope::~UseScratchRegisterScope() {
++ *available_ = old_available_;
++}
++
++Register UseScratchRegisterScope::Acquire() {
++ MOZ_ASSERT(available_ != nullptr);
++ MOZ_ASSERT(!available_->empty());
++ Register index = GeneralRegisterSet::FirstRegister(available_->bits());
++ available_->takeRegisterIndex(index);
++ return index;
++}
++
++void UseScratchRegisterScope::Release(const Register& reg) {
++ MOZ_ASSERT(available_ != nullptr);
++ MOZ_ASSERT(old_available_.hasRegisterIndex(reg));
++ MOZ_ASSERT(!available_->hasRegisterIndex(reg));
++ Include(GeneralRegisterSet(1 << reg.code()));
++}
++
++bool UseScratchRegisterScope::hasAvailable() const {
++ return (available_->size()) != 0;
++}
+diff --git a/js/src/jit/ppc64/Assembler-ppc64.h b/js/src/jit/ppc64/Assembler-ppc64.h
+new file mode 100644
+index 000000000000..60e84bf71cf7
+--- /dev/null
++++ b/js/src/jit/ppc64/Assembler-ppc64.h
+@@ -0,0 +1,2114 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Assembler_ppc64_h
++#define jit_ppc64_Assembler_ppc64_h
++
++#include "jit/CompactBuffer.h"
++#include "jit/JitCode.h"
++#include "jit/JitSpewer.h"
++#include "jit/ppc64/Architecture-ppc64.h"
++#include "jit/shared/Assembler-shared.h"
++#include "jit/shared/Disassembler-shared.h"
++#include "jit/shared/IonAssemblerBuffer.h"
++#include "jit/shared/IonAssemblerBufferWithConstantPools.h"
++#include "wasm/WasmTypeDecls.h"
++
++namespace js {
++namespace jit {
++
++// GPR register constants.
++static constexpr Register r0{Registers::r0};
++static constexpr Register r1{Registers::r1};
++static constexpr Register r2{Registers::r2};
++static constexpr Register r3{Registers::r3};
++static constexpr Register r4{Registers::r4};
++static constexpr Register r5{Registers::r5};
++static constexpr Register r6{Registers::r6};
++static constexpr Register r7{Registers::r7};
++static constexpr Register r8{Registers::r8};
++static constexpr Register r9{Registers::r9};
++static constexpr Register r10{Registers::r10};
++static constexpr Register r11{Registers::r11};
++static constexpr Register r12{Registers::r12};
++static constexpr Register r13{Registers::r13};
++static constexpr Register r14{Registers::r14};
++static constexpr Register r15{Registers::r15};
++static constexpr Register r16{Registers::r16};
++static constexpr Register r17{Registers::r17};
++static constexpr Register r18{Registers::r18};
++static constexpr Register r19{Registers::r19};
++static constexpr Register r20{Registers::r20};
++static constexpr Register r21{Registers::r21};
++static constexpr Register r22{Registers::r22};
++static constexpr Register r23{Registers::r23};
++static constexpr Register r24{Registers::r24};
++static constexpr Register r25{Registers::r25};
++static constexpr Register r26{Registers::r26};
++static constexpr Register r27{Registers::r27};
++static constexpr Register r28{Registers::r28};
++static constexpr Register r29{Registers::r29};
++static constexpr Register r30{Registers::r30};
++static constexpr Register r31{Registers::r31};
++
++// FPR register constants.
++static constexpr FloatRegister f0{FloatRegisters::f0, FloatRegisters::Double};
++static constexpr FloatRegister f1{FloatRegisters::f1, FloatRegisters::Double};
++static constexpr FloatRegister f2{FloatRegisters::f2, FloatRegisters::Double};
++static constexpr FloatRegister f3{FloatRegisters::f3, FloatRegisters::Double};
++static constexpr FloatRegister f4{FloatRegisters::f4, FloatRegisters::Double};
++static constexpr FloatRegister f5{FloatRegisters::f5, FloatRegisters::Double};
++static constexpr FloatRegister f6{FloatRegisters::f6, FloatRegisters::Double};
++static constexpr FloatRegister f7{FloatRegisters::f7, FloatRegisters::Double};
++static constexpr FloatRegister f8{FloatRegisters::f8, FloatRegisters::Double};
++static constexpr FloatRegister f9{FloatRegisters::f9, FloatRegisters::Double};
++static constexpr FloatRegister f10{FloatRegisters::f10, FloatRegisters::Double};
++static constexpr FloatRegister f11{FloatRegisters::f11, FloatRegisters::Double};
++static constexpr FloatRegister f12{FloatRegisters::f12, FloatRegisters::Double};
++static constexpr FloatRegister f13{FloatRegisters::f13, FloatRegisters::Double};
++static constexpr FloatRegister f14{FloatRegisters::f14, FloatRegisters::Double};
++static constexpr FloatRegister f15{FloatRegisters::f15, FloatRegisters::Double};
++static constexpr FloatRegister f16{FloatRegisters::f16, FloatRegisters::Double};
++static constexpr FloatRegister f17{FloatRegisters::f17, FloatRegisters::Double};
++static constexpr FloatRegister f18{FloatRegisters::f18, FloatRegisters::Double};
++static constexpr FloatRegister f19{FloatRegisters::f19, FloatRegisters::Double};
++static constexpr FloatRegister f20{FloatRegisters::f20, FloatRegisters::Double};
++static constexpr FloatRegister f21{FloatRegisters::f21, FloatRegisters::Double};
++static constexpr FloatRegister f22{FloatRegisters::f22, FloatRegisters::Double};
++static constexpr FloatRegister f23{FloatRegisters::f23, FloatRegisters::Double};
++static constexpr FloatRegister f24{FloatRegisters::f24, FloatRegisters::Double};
++static constexpr FloatRegister f25{FloatRegisters::f25, FloatRegisters::Double};
++static constexpr FloatRegister f26{FloatRegisters::f26, FloatRegisters::Double};
++static constexpr FloatRegister f27{FloatRegisters::f27, FloatRegisters::Double};
++static constexpr FloatRegister f28{FloatRegisters::f28, FloatRegisters::Double};
++static constexpr FloatRegister f29{FloatRegisters::f29, FloatRegisters::Double};
++static constexpr FloatRegister f30{FloatRegisters::f30, FloatRegisters::Double};
++static constexpr FloatRegister f31{FloatRegisters::f31, FloatRegisters::Double};
++
++static constexpr Register InvalidReg{Registers::Invalid};
++static constexpr FloatRegister InvalidFloatReg;
++
++static constexpr Register StackPointer = r1;
++static constexpr Register FramePointer = r31;
++static constexpr Register ReturnReg = r3;
++static constexpr Register64 ReturnReg64(ReturnReg);
++static constexpr FloatRegister ReturnFloat32Reg{FloatRegisters::f1,
++ FloatRegisters::Single};
++static constexpr FloatRegister ReturnDoubleReg = f1;
++static constexpr FloatRegister ReturnSimd128Reg{FloatRegisters::f1,
++ FloatRegisters::Simd128};
++
++// r16 is non-volatile and non-allocatable, used as a saved scratch.
++static constexpr Register SavedScratchRegister = r16;
++
++static constexpr Register SecondScratchReg = r12;
++
++static constexpr FloatRegister ScratchFloat32Reg{FloatRegisters::f0,
++ FloatRegisters::Single};
++static constexpr FloatRegister ScratchDoubleReg = f0;
++static constexpr FloatRegister ScratchSimd128Reg{FloatRegisters::f0,
++ FloatRegisters::Simd128};
++
++struct ScratchFloat32Scope : public AutoFloatRegisterScope {
++ explicit ScratchFloat32Scope(MacroAssembler& masm)
++ : AutoFloatRegisterScope(masm, ScratchFloat32Reg) {}
++};
++
++struct ScratchDoubleScope : public AutoFloatRegisterScope {
++ explicit ScratchDoubleScope(MacroAssembler& masm)
++ : AutoFloatRegisterScope(masm, ScratchDoubleReg) {}
++};
++
++// PPC64: ScratchSimd128Scope is a simple register wrapper, NOT a scoped
++// acquire/release. On PPC64, ScratchSimd128Reg is v0 (VSR32; encoded as
++// {FloatRegisters::f0, Simd128} so encoding() = 0 + 32 = 32) — distinct
++// from ScratchDoubleReg = f0 (VSR0). It is non-allocatable and always
++// available. Many SIMD functions call other SIMD functions that also need
++// v0, creating nested "scopes". Using AutoFloatRegisterScope would assert
++// on double-acquire in debug builds. Since v0 is never allocated by the
++// register allocator, nesting is safe.
++struct ScratchSimd128Scope : public FloatRegister {
++ explicit ScratchSimd128Scope(MacroAssembler&)
++ : FloatRegister(ScratchSimd128Reg) {}
++};
++
++class Assembler;
++
++class UseScratchRegisterScope {
++ public:
++ explicit UseScratchRegisterScope(Assembler& assembler);
++ explicit UseScratchRegisterScope(Assembler* assembler);
++ ~UseScratchRegisterScope();
++
++ Register Acquire();
++ void Release(const Register& reg);
++ bool hasAvailable() const;
++ void Include(const GeneralRegisterSet& list) {
++ *available_ = GeneralRegisterSet::Union(*available_, list);
++ }
++ void Exclude(const GeneralRegisterSet& list) {
++ *available_ = GeneralRegisterSet::Subtract(*available_, list);
++ }
++
++ private:
++ GeneralRegisterSet* available_;
++ GeneralRegisterSet old_available_;
++};
++
++static constexpr Register OsrFrameReg = r6;
++static constexpr Register PreBarrierReg = r4;
++static constexpr Register InterpreterPCReg = r17;
++
++static constexpr Register CallTempReg0 = r4;
++static constexpr Register CallTempReg1 = r9;
++static constexpr Register CallTempReg2 = r10;
++static constexpr Register CallTempReg3 = r7;
++// CallTempReg4 must NOT be JSReturnReg (r5): LMegamorphicLoadSlotPermissive
++// uses tempFixed(CallTempReg4) for a saved obj pointer AND defineReturn
++// (JSReturnOperand=r5) for output. If they alias, the megamorphic cache
++// lookup clobbers the saved obj, corrupting the 'this' pointer.
++static constexpr Register CallTempReg4 = r8;
++static constexpr Register CallTempReg5 = r6;
++
++// PPC64 ELFv2 has no volatile non-arg GPRs (r3-r10 are all arg regs).
++// Use allocatable non-volatile registers as overflow temps.
++static constexpr Register CallTempNonArgRegs[] = {r14, r15};
++static const uint32_t NumCallTempNonArgRegs = std::size(CallTempNonArgRegs);
++
++static constexpr Register IntArgReg0 = r3;
++static constexpr Register IntArgReg1 = r4;
++static constexpr Register IntArgReg2 = r5;
++static constexpr Register IntArgReg3 = r6;
++static constexpr Register IntArgReg4 = r7;
++static constexpr Register IntArgReg5 = r8;
++static constexpr Register IntArgReg6 = r9;
++static constexpr Register IntArgReg7 = r10;
++
++// Registers used by RegExpMatcher and RegExpExecMatch stubs.
++static constexpr Register RegExpMatcherRegExpReg = CallTempReg0;
++static constexpr Register RegExpMatcherStringReg = CallTempReg1;
++static constexpr Register RegExpMatcherLastIndexReg = CallTempReg2;
++
++// Registers used by RegExpExecTest stub (do not use ReturnReg).
++static constexpr Register RegExpExecTestRegExpReg = CallTempReg0;
++static constexpr Register RegExpExecTestStringReg = CallTempReg1;
++
++// Registers used by RegExpSearcher stub (do not use ReturnReg).
++static constexpr Register RegExpSearcherRegExpReg = CallTempReg0;
++static constexpr Register RegExpSearcherStringReg = CallTempReg1;
++static constexpr Register RegExpSearcherLastIndexReg = CallTempReg2;
++
++static constexpr Register JSReturnReg_Type = r6;
++static constexpr Register JSReturnReg_Data = r5;
++static constexpr Register JSReturnReg = r5;
++static constexpr ValueOperand JSReturnOperand = ValueOperand(JSReturnReg);
++
++static constexpr Register ABINonArgReg0 = r19;
++static constexpr Register ABINonArgReg1 = r20;
++static constexpr Register ABINonArgReg2 = r21;
++static constexpr Register ABINonArgReg3 = r22;
++static constexpr Register ABINonArgReturnReg0 = r29;
++static constexpr Register ABINonArgReturnReg1 = r30;
++static constexpr Register ABINonVolatileReg = r14;
++static constexpr Register ABINonArgReturnVolatileReg = r11;
++
++static constexpr FloatRegister ABINonArgDoubleReg{FloatRegisters::f14,
++ FloatRegisters::Double};
++
++// Wasm instance pointer register. Preserved across wasm function calls.
++static constexpr Register InstanceReg = r18;
++static constexpr Register HeapReg = r24;
++static constexpr Register GlobalReg = r23;
++
++// Wasm table call registers.
++static constexpr Register WasmTableCallScratchReg0 = ABINonArgReg0;
++static constexpr Register WasmTableCallScratchReg1 = ABINonArgReg1;
++static constexpr Register WasmTableCallSigReg = ABINonArgReg2;
++static constexpr Register WasmTableCallIndexReg = ABINonArgReg3;
++
++// Wasm ref call registers.
++static constexpr Register WasmCallRefCallScratchReg0 = ABINonArgReg0;
++static constexpr Register WasmCallRefCallScratchReg1 = ABINonArgReg1;
++static constexpr Register WasmCallRefCallScratchReg2 = ABINonArgReg2;
++static constexpr Register WasmCallRefReg = ABINonArgReg3;
++
++// Wasm tail call scratch registers.
++// WasmTailCallRAScratchReg must NOT be ABINonArgReg0: the shared tail-call
++// code (wasmReturnCallImport, wasmReturnCallIndirect, wasmReturnCallRef)
++// stores the callee address in ABINonArgReg0, and CollapseWasmFrame*
++// overwrites tempForRA. On architectures with a GPR link register (ARM,
++// MIPS, LA64, RISC-V) this is ra/lr. PPC64's LR is an SPR, so we use r14
++// (ABINonVolatileReg) which is callee-saved and not used in call setup.
++static constexpr Register WasmTailCallInstanceScratchReg = ABINonArgReg1;
++static constexpr Register WasmTailCallRAScratchReg = ABINonVolatileReg;
++static constexpr Register WasmTailCallFPScratchReg = ABINonArgReg3;
++
++// Register used as a scratch along the return path in the fast js -> wasm stub
++// code. Must not overlap ReturnReg, JSReturnOperand, or InstanceReg.
++// Must be volatile.
++static constexpr Register WasmJitEntryReturnScratch = r10;
++
++static constexpr uint32_t ABIStackAlignment = 16;
++static constexpr uint32_t CodeAlignment = 16;
++static constexpr uint32_t JitStackAlignment = 16;
++
++static constexpr uint32_t JitStackValueAlignment =
++ JitStackAlignment / sizeof(Value);
++static_assert(JitStackAlignment % sizeof(Value) == 0 &&
++ JitStackValueAlignment >= 1,
++ "Stack alignment should be a non-zero multiple of sizeof(Value)");
++
++static constexpr uint32_t SimdMemoryAlignment = 16;
++static_assert(
++ CodeAlignment % SimdMemoryAlignment == 0,
++ "Code alignment should be larger than any of the alignments "
++ "which are used for the constant sections of the code buffer. "
++ "Thus it should be larger than the alignment for SIMD constants.");
++
++static constexpr uint32_t WasmStackAlignment = SimdMemoryAlignment;
++static const uint32_t WasmTrapInstructionLength = 4;
++
++static constexpr uint32_t WasmCheckedCallEntryOffset = 0u;
++static constexpr uint32_t WasmCheckedTailEntryOffset = 32u;
++
++static constexpr Scale ScalePointer = TimesEight;
++
++class ABIArgGenerator : public ABIArgGeneratorShared {
++ public:
++ explicit ABIArgGenerator(ABIKind kind)
++ : ABIArgGeneratorShared(kind),
++ intRegIndex_(0),
++ floatRegIndex_(0),
++ current_() {
++ // PPC64 ELFv2 ABI: the callee saves LR, CR, TOC into the caller's
++ // frame (offsets 8, 16, 24 from caller SP). Reserve 32 bytes so that
++ // callWithABIPre always allocates enough space for this link area.
++ stackOffset_ += ShadowStackSpace;
++ }
++
++ ABIArg next(MIRType argType);
++ ABIArg& current() { return current_; }
++
++ protected:
++ unsigned intRegIndex_;
++ unsigned floatRegIndex_;
++ ABIArg current_;
++};
++
++static constexpr uint32_t NumIntArgRegs = 8;
++static constexpr uint32_t NumFloatArgRegs = 13;
++
++static inline bool GetIntArgReg(uint32_t usedIntArgs, Register* out) {
++ if (usedIntArgs < NumIntArgRegs) {
++ *out = Register::FromCode(r3.code() + usedIntArgs);
++ return true;
++ }
++ return false;
++}
++
++static inline bool GetFloatArgReg(uint32_t usedFloatArgs, FloatRegister* out) {
++ if (usedFloatArgs < NumFloatArgRegs) {
++ *out = FloatRegister::FromCode(f1.code() + usedFloatArgs);
++ return true;
++ }
++ return false;
++}
++
++static inline bool GetTempRegForIntArg(uint32_t usedIntArgs,
++ uint32_t usedFloatArgs, Register* out) {
++ MOZ_ASSERT(usedFloatArgs == 0);
++
++ if (GetIntArgReg(usedIntArgs, out)) {
++ return true;
++ }
++
++ usedIntArgs -= NumIntArgRegs;
++ if (usedIntArgs >= NumCallTempNonArgRegs) {
++ return false;
++ }
++ *out = CallTempNonArgRegs[usedIntArgs];
++ return true;
++}
++
++// PPC64 instruction field positions.
++// PPC uses big-endian bit numbering (bit 0 = MSB), but we store instructions
++// in a uint32_t where bit 0 = LSB. The shifts below are in LSB-0 terms.
++//
++// [0:5] primary opcode (OpcodeShift=26)
++// [6:10] RT/RS/BF/TO (RTShift=21, 5 bits)
++// [11:15] RA/BI (RAShift=16, 5 bits)
++// [16:20] RB/SH (RBShift=11, 5 bits)
++// [16:31] SI/UI/D (Imm16Shift=0, 16 bits)
++// [21:25] subop bits (varies)
++// [21:30] XO (X-form; A/M/MD/MDS narrower)
++// [31] Rc bit (RcShift=0)
++
++static const uint32_t OpcodeShift = 26;
++static const uint32_t OpcodeBits = 6;
++
++static const uint32_t RTShift = 21;
++static const uint32_t RTBits = 5;
++static const uint32_t RSShift = 21;
++static const uint32_t RSBits = 5;
++static const uint32_t RAShift = 16;
++static const uint32_t RABits = 5;
++static const uint32_t RBShift = 11;
++static const uint32_t RBBits = 5;
++static const uint32_t RCShift = 6;
++static const uint32_t RCBits = 5;
++
++static const uint32_t BOShift = 21;
++static const uint32_t BOBits = 5;
++static const uint32_t BIShift = 16;
++static const uint32_t BIBits = 5;
++
++static const uint32_t Imm16Shift = 0;
++static const uint32_t Imm16Bits = 16;
++
++static const uint32_t RcShift = 0;
++static const uint32_t RcBit = 1;
++
++static const uint32_t RTMask = ((1 << RTBits) - 1) << RTShift;
++static const uint32_t RSMask = ((1 << RSBits) - 1) << RSShift;
++static const uint32_t RAMask = ((1 << RABits) - 1) << RAShift;
++static const uint32_t RBMask = ((1 << RBBits) - 1) << RBShift;
++static const uint32_t Imm16Mask = (1 << Imm16Bits) - 1;
++static const uint32_t RegMask = (1 << RTBits) - 1;
++
++static inline uint32_t RT(Register r) { return (uint32_t)r.code() << RTShift; }
++static inline uint32_t RT(FloatRegister r) {
++ return (uint32_t)r.code() << RTShift;
++}
++static inline uint32_t RS(Register r) { return (uint32_t)r.code() << RSShift; }
++static inline uint32_t RS(FloatRegister r) {
++ return (uint32_t)r.code() << RSShift;
++}
++static inline uint32_t RA(Register r) { return (uint32_t)r.code() << RAShift; }
++static inline uint32_t RA(FloatRegister r) {
++ return (uint32_t)r.code() << RAShift;
++}
++static inline uint32_t RB(Register r) { return (uint32_t)r.code() << RBShift; }
++static inline uint32_t RB(FloatRegister r) {
++ return (uint32_t)r.code() << RBShift;
++}
++
++// SPR encoding: the SPR number is split across bits 11-15 and 16-20 in a
++// swapped arrangement. PPC_SPR(x) produces the value to OR into an
++// mtspr/mfspr instruction at the RB+RA position (bits 11-20).
++#define PPC_SPR(x) ((((int)(x) >> 5) & 0x1f) << 11 | ((int)(x) & 0x1f) << 16)
++
++enum PPCOpcodes {
++ PPC_add = 0x7C000214,
++ PPC_addc = 0x7C000014,
++ PPC_adde = 0x7C000114,
++ PPC_addi = 0x38000000,
++ PPC_addis = 0x3C000000,
++ PPC_and_ = 0x7C000038,
++ // andi. is always record form (no non-record andi exists).
++ PPC_andi_dot = 0x70000000,
++ PPC_b = 0x48000000,
++ PPC_bc = 0x40000000,
++ // Encoded "bcl 20, lt, $+4": PC-relative branch-and-link by 4 bytes
++ // (land at the next instruction) with BO=20 (branch always); BI=0
++ // (=lt) is don't-care because BO=20 forces the branch. Used by
++ // PoolLoadFPR{32,64}'s POWER8 stanza and PoolLoadSimd128's stanza to
++ // seed LR with the current PC for the subsequent mflr+ld base
++ // computation. Used by patch sites that write raw instruction memory
++ // (PatchConstantPoolLoad, WriteLoad64Instructions, etc.). Named for
++ // grep-ability and to avoid magic-number copies.
++ PPC_bcl_always_plus4 = 0x42800005,
++ PPC_bctr = 0x4E800420,
++ PPC_bcctr = 0x4C000420,
++ PPC_blr = 0x4E800020,
++ PPC_cmpd = 0x7C200000,
++ PPC_cmpdi = 0x2C200000,
++ PPC_cmpld = 0x7C200040,
++ PPC_cmpldi = 0x28200000,
++ PPC_cmpw = 0x7C000000,
++ PPC_cmpwi = 0x2C000000,
++ PPC_cmplw = 0x7C000040,
++ PPC_cmplwi = 0x28000000,
++ PPC_cntlzd = 0x7C000074,
++ PPC_cntlzw = 0x7C000034,
++ PPC_cnttzd = 0x7C000474,
++ PPC_cnttzw = 0x7C000434,
++ PPC_crandc = 0x4C000102,
++ PPC_cror = 0x4C000382,
++ PPC_crorc = 0x4C000342,
++ PPC_divd = 0x7C0003D2,
++ PPC_divdu = 0x7C000392,
++ PPC_divw = 0x7C0003D6,
++ PPC_divwu = 0x7C000396,
++ // POWER9 (ISA 3.0) modulo instructions.
++ PPC_modsd = 0x7C000612,
++ PPC_modsw = 0x7C000616,
++ PPC_modud = 0x7C000212,
++ PPC_moduw = 0x7C000216,
++ PPC_extsb = 0x7C000774,
++ PPC_extsh = 0x7C000734,
++ PPC_extsw = 0x7C0007B4,
++ PPC_fabs = 0xFC000210,
++ PPC_fadd = 0xFC00002A,
++ PPC_fadds = 0xEC00002A,
++ PPC_fcpsgn = 0xFC000010,
++ PPC_fcfid = 0xFC00069C,
++ PPC_fcfids = 0xEC00069C,
++ PPC_fcfidu = 0xFC00079C,
++ PPC_fcfidus = 0xEC00079C,
++ PPC_fcmpu = 0xFC000000,
++ PPC_fctid = 0xFC00065C,
++ PPC_fctidz = 0xFC00065E,
++ PPC_fctiduz = 0xFC00075E,
++ PPC_fctiwz = 0xFC00001E,
++ PPC_fdiv = 0xFC000024,
++ PPC_fdivs = 0xEC000024,
++ PPC_fmr = 0xFC000090,
++ PPC_fmul = 0xFC000032,
++ PPC_fmuls = 0xEC000032,
++ PPC_fneg = 0xFC000050,
++ PPC_frim = 0xFC0003D0,
++ PPC_frip = 0xFC000390,
++ PPC_friz = 0xFC000350,
++ PPC_frsp = 0xFC000018,
++ PPC_fsub = 0xFC000028,
++ PPC_fsubs = 0xEC000028,
++ PPC_fsqrt = 0xFC00002C,
++ PPC_fsqrts = 0xEC00002C,
++ PPC_isel = 0x7C00001E,
++ // POWER10 (ISA 3.1). RT = (CR[BI]==1) ? 1 : 0. XO=384 at bits 21-30.
++ PPC_setbc = 0x7C000300,
++ // POWER10 (ISA 3.1). RT = (CR[BI]==0) ? 1 : 0. XO=416.
++ PPC_setbcr = 0x7C000340,
++ PPC_lbarx = 0x7C000068,
++ PPC_lbz = 0x88000000,
++ PPC_lbzx = 0x7C0000AE,
++ PPC_ld = 0xE8000000,
++ PPC_ldarx = 0x7C0000A8,
++ PPC_ldx = 0x7C00002A,
++ PPC_lfd = 0xC8000000,
++ PPC_lfdx = 0x7C0004AE,
++ PPC_lfiwax = 0x7C0006AE,
++ PPC_lfiwzx = 0x7C0006EE,
++ PPC_lfs = 0xC0000000,
++ PPC_lfsx = 0x7C00042E,
++ PPC_lha = 0xA8000000,
++ PPC_lharx = 0x7C0000E8,
++ PPC_lhax = 0x7C0002AE,
++ PPC_lhz = 0xA0000000,
++ PPC_lhzx = 0x7C00022E,
++ PPC_lwa = 0xE8000002,
++ PPC_lwarx = 0x7C000028,
++ PPC_lwz = 0x80000000,
++ // X-form sign-extending word load (opcode 31, XO=341). Single-insn
++ // equivalent of lwzx + extsw.
++ PPC_lwax = 0x7C0002AA,
++ PPC_lwzx = 0x7C00002E,
++ PPC_mcrxrx = 0x7C000480,
++ PPC_mcrfs = 0xFC000080,
++ PPC_mfocrf = 0x7C100026,
++ PPC_mffs = 0xFC00048E,
++ PPC_mfspr = 0x7C0002A6,
++ PPC_mfvsrd = 0x7C000066,
++ PPC_mtcrf = 0x7C000120,
++ PPC_mtfsb0 = 0xFC00008C,
++ PPC_mtvsrd = 0x7C000166,
++ // POWER8+ (ISA 2.07). VSR[XT].dw[0] = sign_ext_64(RA[32:63]).
++ // XO=211 at bits 21-30. Combines extsw + mtvsrd into one insn.
++ PPC_mtvsrwa = 0x7C0001A6,
++ PPC_mtvsrws = 0x7C000326,
++ PPC_mtvsrwz = 0x7C0001E6,
++ PPC_mtspr = 0x7C0003A6,
++ PPC_mulhd = 0x7C000092,
++ PPC_mulhdu = 0x7C000012,
++ PPC_mulhwu = 0x7C000016,
++ PPC_mulli = 0x1C000000,
++ PPC_mulld = 0x7C0001D2,
++ PPC_mulldo = 0x7C0005D2,
++ PPC_mullw = 0x7C0001D6,
++ PPC_neg = 0x7C0000D0,
++ PPC_nor = 0x7C0000F8,
++ PPC_or_ = 0x7C000378,
++ PPC_ori = 0x60000000,
++ PPC_oris = 0x64000000,
++ PPC_popcntb = 0x7C0000F4,
++ PPC_popcntd = 0x7C0003F4,
++ PPC_popcntw = 0x7C0002F4,
++ PPC_brd = 0x7C000176, // POWER10: byte-reverse doubleword (X-form, XO=187)
++ PPC_brh = 0x7C0001B6, // POWER10: byte-reverse each halfword (X-form, XO=219)
++ PPC_brw = 0x7C000136, // POWER10: byte-reverse each word (X-form, XO=155)
++ PPC_rldcl = 0x78000010,
++ PPC_rldicl = 0x78000000,
++ PPC_rldcr = 0x78000012,
++ PPC_rldicr = 0x78000004,
++ PPC_rldimi = 0x7800000C,
++ PPC_rlwimi = 0x50000000,
++ PPC_rlwinm = 0x54000000,
++ PPC_rlwnm = 0x5C000000,
++ PPC_sld = 0x7C000036,
++ PPC_slw = 0x7C000030,
++ PPC_srad = 0x7C000634,
++ PPC_sradi = 0x7C000674,
++ PPC_sraw = 0x7C000630,
++ PPC_srawi = 0x7C000670,
++ PPC_srd = 0x7C000436,
++ PPC_srw = 0x7C000430,
++ PPC_stb = 0x98000000,
++ PPC_stbcx = 0x7C00056D,
++ PPC_stbx = 0x7C0001AE,
++ PPC_std = 0xF8000000,
++ PPC_stdcx = 0x7C0001AD,
++ PPC_stdu = 0xF8000001,
++ PPC_stdx = 0x7C00012A,
++ PPC_stfd = 0xD8000000,
++ PPC_stfdu = 0xDC000000,
++ PPC_stfdx = 0x7C0005AE,
++ PPC_stfs = 0xD0000000,
++ PPC_stfsu = 0xD4000000,
++ PPC_stfsx = 0x7C00052E,
++ PPC_sth = 0xB0000000,
++ PPC_sthcx = 0x7C0005AD,
++ PPC_sthx = 0x7C00032E,
++ PPC_stw = 0x90000000,
++ PPC_stwx = 0x7C00012E,
++ PPC_stwbrx = 0x7C00052C,
++ PPC_stwcx = 0x7C00012D,
++ PPC_subf = 0x7C000050,
++ PPC_subfc = 0x7C000010,
++ PPC_subfe = 0x7C000110,
++ PPC_subfic = 0x20000000,
++ PPC_sync = 0x7C0004AC,
++ // isync — execution synchronization. Discards prefetched instructions and
++ // forces a refetch+reexecute of everything past the barrier; prevents
++ // speculative bypass. Used for Spectre v1 mitigation in speculationBarrier.
++ // Encoding: bytes `2c 01 00 4c` (LE) = 0x4C00012C.
++ PPC_isync = 0x4C00012C,
++ PPC_trap = 0x7FE00008,
++ PPC_tw = 0x7C000008,
++ PPC_xor_ = 0x7C000278,
++ PPC_xori = 0x68000000,
++ PPC_xoris = 0x6C000000,
++ // VMX register load/store (X-form, opcode 31, XO=103/231).
++ // Operate on raw VR0-31 (the lvx/stvx mnemonics predate VSX, so the
++ // assembler exposes them with a uint8_t VR index rather than via the
++ // VSR-namespace FloatRegister overloads used for lxvx/stxvx.)
++ PPC_lvx = 0x7C0000CE,
++ PPC_lxvd2x = 0x7C000698,
++ PPC_lxvx = 0x7C000218,
++ PPC_mfvsrld = 0x7C000266,
++ PPC_mtvsrdd = 0x7C000366,
++ PPC_stvx = 0x7C0001CE,
++ PPC_stxvd2x = 0x7C000798,
++ PPC_stxvx = 0x7C000318,
++ PPC_vaddubm = 0x10000000,
++ PPC_vavgub = 0x10000402,
++ PPC_vavguh = 0x10000442,
++ PPC_vcmpequb = 0x10000006,
++ PPC_vcmpequh = 0x10000046,
++ PPC_vcmpequw = 0x10000086,
++ PPC_vcmpequd = 0x100000C7,
++ PPC_vcmpgtsb = 0x10000306,
++ PPC_vcmpgtsh = 0x10000346,
++ PPC_vcmpgtsw = 0x10000386,
++ PPC_vcmpgtsd = 0x100003C7,
++ PPC_vcmpgtub = 0x10000206,
++ PPC_vcmpgtuh = 0x10000246,
++ PPC_vcmpgtuw = 0x10000286,
++ PPC_vcmpgtud = 0x100002C7,
++ PPC_vcmpneb = 0x10000007, // POWER9 (ISA 3.0)
++ PPC_vcmpneh = 0x10000047, // POWER9
++ PPC_vcmpnew = 0x10000087, // POWER9
++ PPC_vadduhm = 0x10000040,
++ PPC_vadduwm = 0x10000080,
++ PPC_vaddudm = 0x100000C0,
++ PPC_vaddubs = 0x10000200,
++ PPC_vadduhs = 0x10000240,
++ PPC_vaddsbs = 0x10000300,
++ PPC_vaddshs = 0x10000340,
++ PPC_vmaxsb = 0x10000102,
++ PPC_vmaxsh = 0x10000142,
++ PPC_vmaxsw = 0x10000182,
++ PPC_vmaxsd = 0x100001C2,
++ PPC_vmaxub = 0x10000002,
++ PPC_vmaxuh = 0x10000042,
++ PPC_vmaxuw = 0x10000082,
++ PPC_vmhraddshs = 0x10000021,
++ PPC_vmrghb = 0x1000000C,
++ PPC_vmrghh = 0x1000004C,
++ PPC_vmrghw = 0x1000008C,
++ PPC_vmrglb = 0x1000010C,
++ PPC_vmrglh = 0x1000014C,
++ PPC_vmrglw = 0x1000018C,
++ PPC_vminsb = 0x10000302,
++ PPC_vminsh = 0x10000342,
++ PPC_vminsw = 0x10000382,
++ PPC_vminub = 0x10000202,
++ PPC_vminuh = 0x10000242,
++ PPC_vminuw = 0x10000282,
++ // POWER9 (ISA 3.0) per-lane integer negate. VRA field carries the subop
++ // code: 6 for vnegw, 7 for vnegd. Base XO is 0x602.
++ PPC_vnegw = 0x10060602,
++ PPC_vnegd = 0x10070602,
++ PPC_vmladduhm = 0x10000022,
++ PPC_vmuluwm = 0x10000089,
++ PPC_vmulld = 0x100001C9, // POWER10 (XO=457, vector i64x2 multiply low)
++ PPC_vmulesb = 0x10000308,
++ PPC_vmuleub = 0x10000208,
++ PPC_vmulesh = 0x10000348,
++ PPC_vmuleuh = 0x10000248,
++ PPC_vmulesw = 0x10000388,
++ PPC_vmuleuw = 0x10000288,
++ PPC_vmulosb = 0x10000108,
++ PPC_vmuloub = 0x10000008,
++ PPC_vmulosh = 0x10000148,
++ PPC_vmulouh = 0x10000048,
++ PPC_vmulosw = 0x10000188,
++ PPC_vmulouw = 0x10000088,
++ PPC_vmsumshm = 0x10000028,
++ PPC_vmsumuhm = 0x10000026,
++ PPC_vperm = 0x1000002B,
++ // VX-form, opcode 4, XO=0x54C. Per-byte bit-permute of a 128-bit value;
++ // result 16-bit bitmap lands in dw0 low 16 bits, recoverable via mfvsrd.
++ // Available on POWER8+ (ISA 2.07).
++ PPC_vbpermq = 0x1000054C,
++ // POWER10 (ISA 3.1) Vector Extract Mask. VX-form, opcode 4, XO=0x642,
++ // with UIM at bits 11..15 selecting lane width: 8=byte, 9=halfword,
++ // 10=word, 11=doubleword. RT is a GPR (low N bits = wasm bitmask).
++ PPC_vextractbm = 0x10080642,
++ PPC_vextracthm = 0x10090642,
++ PPC_vextractwm = 0x100A0642,
++ PPC_vextractdm = 0x100B0642,
++ // POWER10 vector insert from GPR at immediate byte offset:
++ // vinsw VRT, RB, UIM VRT[UIM*8:UIM*8+31] ← RB[32:63]
++ // vinsd VRT, RB, UIM VRT[UIM*8:UIM*8+63] ← RB[0:63]
++ // VX-form, opcode 4. RB at bits 16..20, UIM at bits 11..15.
++ PPC_vinsw = 0x100000CF, // POWER10 (XO=207)
++ PPC_vinsd = 0x100001CF, // POWER10 (XO=463)
++ // POWER10 vector insert byte/halfword from GPR with register-supplied
++ // (right-indexed = LE-natural) byte position:
++ // vinsbrx VRT, RA, RB VRT.byte[RA & 0xF] ← RB & 0xFF
++ // vinshrx VRT, RA, RB VRT.hword[(RA & 0xE)/2] ← RB & 0xFFFF
++ // VX-form, opcode 4. RA at bits 16..20, RB at bits 11..15.
++ PPC_vinsbrx = 0x1000030F, // POWER10 (XO=783)
++ PPC_vinshrx = 0x1000034F, // POWER10 (XO=847)
++ // POWER9 (ISA 3.0) vector insert byte/halfword from VR at immediate
++ // byte position:
++ // vinsertb VRT, VRB, UIM VRT.byte[UIM] ← VRB.byte[7] (BE)
++ // vinserth VRT, VRB, UIM VRT.hword[UIM..+1] ← VRB.byte[6..7] (BE)
++ // V-form, opcode 4. VRB at bits 11..15, UIM at bits 16..20. Simd128
++ // lives in VSR32-63 (= VR0-31), so the V-form VRT field addresses our
++ // Simd128 storage via `encoding() & 31`.
++ PPC_vinsertb = 0x1000030D, // POWER9 (XO=781)
++ PPC_vinserth = 0x1000034D, // POWER9 (XO=845)
++ PPC_vextractub = 0x1000020D, // POWER9 (XO=525)
++ PPC_vextractuh = 0x1000024D, // POWER9 (XO=589)
++ PPC_vspltisb = 0x1000030C, // POWER7+ (XO=780, splat 5-bit SIMM to all 16 byte lanes)
++ PPC_vspltish = 0x1000034C, // POWER7+ (XO=844, splat 5-bit SIMM to all 8 i16 lanes)
++ PPC_vspltisw = 0x1000038C, // POWER7+ (XO=908, splat 5-bit SIMM to all 4 i32 lanes)
++ PPC_vpopcntb = 0x10000703,
++ PPC_vslb = 0x10000104,
++ PPC_vsld = 0x100005C4,
++ PPC_vsldoi = 0x1000002C,
++ PPC_vslh = 0x10000144,
++ PPC_vslo = 0x1000040C,
++ PPC_vslw = 0x10000184,
++ PPC_vspltb = 0x1000020C,
++ PPC_vsplth = 0x1000024C,
++ PPC_vsrab = 0x10000304,
++ PPC_vsrad = 0x100003C4,
++ PPC_vsrah = 0x10000344,
++ PPC_vsraw = 0x10000384,
++ PPC_vsrb = 0x10000204,
++ PPC_vsrd = 0x100006C4,
++ PPC_vsrh = 0x10000244,
++ PPC_vsro = 0x1000044C,
++ PPC_vsrw = 0x10000284,
++ PPC_vpkshss = 0x1000018E,
++ PPC_vpkshus = 0x1000010E,
++ PPC_vpkswss = 0x100001CE,
++ PPC_vpkswus = 0x1000014E,
++ PPC_vupkhsb = 0x1000020E,
++ PPC_vupkhsh = 0x1000024E,
++ PPC_vupkhsw = 0x1000064E,
++ PPC_vupklsb = 0x1000028E,
++ PPC_vupklsh = 0x100002CE,
++ PPC_vupklsw = 0x100006CE,
++ PPC_vsububm = 0x10000400,
++ PPC_vsubuhm = 0x10000440,
++ PPC_vsubuwm = 0x10000480,
++ PPC_vsubudm = 0x100004C0,
++ PPC_vsububs = 0x10000600,
++ PPC_vsubuhs = 0x10000640,
++ PPC_vsubsbs = 0x10000700,
++ PPC_vsubshs = 0x10000740,
++ PPC_xscvdpspn = 0xF000042C,
++ PPC_xscvspdpn = 0xF000052C,
++ // POWER9 (ISA 3.0) scalar FP16 conversions, XX2-form. The UIM
++ // disambiguator is baked into the constant (xscvdphp=17, xscvhpdp=16).
++ // Encodings cross-checked against binutils with `.machine power9`.
++ PPC_xscvdphp = 0xF011056C,
++ PPC_xscvhpdp = 0xF010056C,
++ // POWER9 (ISA 3.0) scalar VSX extract biased exponent, XX2-form.
++ // XT.dword[0] = (zero || biased_exp_11bit), XT.dword[1] = 0. XO=347
++ // (shares XO with xscv{dp,hp}{hp,dp} — disambiguated by bits 16-20=0).
++ // Encoding cross-checked against binutils with `.machine power9`.
++ PPC_xsxexpdp = 0xF000056C,
++ // POWER9 (ISA 3.0) scalar FP16 load/store, X-form (opcode 31).
++ // lxsihzx zero-extends; stxsihx writes 16 bits from VSR dword 0
++ // word 1's low halfword.
++ PPC_lxsihzx = 0x7C00065A,
++ PPC_stxsihx = 0x7C00075A,
++ // POWER9 scalar VSX max/min with Java/JavaScript semantics — handles
++ // ±0 and NaN identically to Math.max/Math.min in ECMA-262 (covers
++ // 19 corner cases against the JS shell).
++ // XX3-form, primary opcode 60, XO=144 (max) / XO=152 (min).
++ PPC_xsmaxjdp = 0xF0000480,
++ PPC_xsminjdp = 0xF00004C0,
++ PPC_xxbrd = 0xF017076C,
++ PPC_xvabsdp = 0xF0000764,
++ PPC_xvabssp = 0xF0000664,
++ PPC_xvadddp = 0xF0000300,
++ PPC_xvaddsp = 0xF0000200,
++ PPC_xvcmpeqdp = 0xF0000318,
++ PPC_xvcmpeqsp = 0xF0000218,
++ PPC_xvcmpgedp = 0xF0000398,
++ PPC_xvcmpgesp = 0xF0000298,
++ PPC_xvcmpgtdp = 0xF0000358,
++ PPC_xvcmpgtsp = 0xF0000258,
++ PPC_xvcvdpsp = 0xF0000624,
++ PPC_xvcvdpsxws = 0xF0000360,
++ PPC_xvcvdpuxws = 0xF0000320,
++ PPC_xvcvspdp = 0xF0000724,
++ PPC_xvcvspsxws = 0xF0000260,
++ PPC_xvcvspuxws = 0xF0000220,
++ PPC_xvcvsxwdp = 0xF00003E0,
++ PPC_xvcvsxwsp = 0xF00002E0,
++ PPC_xvcvuxwdp = 0xF00003A0,
++ PPC_xvcvuxwsp = 0xF00002A0,
++ PPC_xvdivdp = 0xF00003C0,
++ PPC_xvdivsp = 0xF00002C0,
++ PPC_xvmaddadp = 0xF0000308,
++ PPC_xvmaddasp = 0xF0000208,
++ PPC_xvmaxdp = 0xF0000700,
++ PPC_xvmaxsp = 0xF0000600,
++ PPC_xvmindp = 0xF0000740,
++ PPC_xvminsp = 0xF0000640,
++ PPC_xvmuldp = 0xF0000380,
++ PPC_xvmulsp = 0xF0000280,
++ PPC_xvnegdp = 0xF00007E4,
++ PPC_xvnmsubadp = 0xF0000788,
++ PPC_xvnmsubasp = 0xF0000688,
++ PPC_xvnegsp = 0xF00006E4,
++ PPC_xvrdpic = 0xF00003AC,
++ PPC_xvrdpim = 0xF00003E4,
++ PPC_xvrdpip = 0xF00003A4,
++ PPC_xvrdpiz = 0xF0000364,
++ PPC_xvrspic = 0xF00002AC,
++ PPC_xvrspim = 0xF00002E4,
++ PPC_xvrspip = 0xF00002A4,
++ PPC_xvrspiz = 0xF0000264,
++ PPC_xvsqrtdp = 0xF000032C,
++ PPC_xvsqrtsp = 0xF000022C,
++ PPC_xvsubdp = 0xF0000340,
++ PPC_xvsubsp = 0xF0000240,
++ PPC_xxextractuw = 0xF0000294,
++ PPC_xxinsertw = 0xF00002D4,
++ PPC_xxland = 0xF0000410,
++ PPC_xxlandc = 0xF0000450,
++ PPC_xxlnor = 0xF0000510,
++ PPC_xxlor = 0xF0000490,
++ PPC_xxlxor = 0xF00004D0,
++ PPC_xxpermdi = 0xF0000050,
++ PPC_xxsel = 0xF0000030,
++ PPC_xxspltib = 0xF00002D0, // POWER9 (ISA 3.0): XX1-form, no Rc
++ PPC_xxspltw = 0xF0000290,
++
++ // Simplified mnemonics.
++ PPC_mr = PPC_or_,
++ PPC_not = PPC_nor,
++ PPC_nop = PPC_ori,
++ PPC_lwsync = PPC_sync | (1 << 21),
++
++ PPC_MAJOR_OPCODE_MASK = 0xFC000000
++};
++
++static const uint32_t NopInst = (uint32_t)PPC_nop;
++static const uint32_t PPC_STANZA_LENGTH = 16;
++
++class Instruction;
++class InstReg;
++class InstImm;
++class BOffImm16;
++class JOffImm26;
++
++// PPC64 base instruction type: a single 32-bit word.
++class Instruction {
++ protected:
++ uint32_t data;
++
++ public:
++ explicit Instruction(uint32_t data_) : data(data_) {}
++ explicit Instruction(PPCOpcodes op) : data((uint32_t)op) {}
++
++ uint32_t encode() const { return data; }
++
++ void makeNop() { data = NopInst; }
++ void makeOp_mtctr(Register r) {
++ data = PPC_mtspr | ((uint32_t)r.code()) << 21 | PPC_SPR(9);
++ }
++ void makeOp_bctr(uint32_t linkBit = 0) { data = PPC_bctr | linkBit; }
++
++ void setData(uint32_t data) { this->data = data; }
++
++ const Instruction& operator=(const Instruction& src) {
++ data = src.data;
++ return *this;
++ }
++
++ uint32_t extractBit(uint32_t bit) const { return (encode() >> bit) & 1; }
++ uint32_t extractBitField(uint32_t hi, uint32_t lo) const {
++ return (encode() >> lo) & ((2 << (hi - lo)) - 1);
++ }
++
++ uint32_t extractOpcode() const { return data & PPC_MAJOR_OPCODE_MASK; }
++ bool isOpcode(uint32_t op) const {
++ return extractOpcode() == (op & PPC_MAJOR_OPCODE_MASK);
++ }
++
++ uint32_t extractRT() const {
++ return extractBitField(RTShift + RTBits - 1, RTShift);
++ }
++ uint32_t extractRA() const {
++ return extractBitField(RAShift + RABits - 1, RAShift);
++ }
++ uint32_t extractRB() const {
++ return extractBitField(RBShift + RBBits - 1, RBShift);
++ }
++ uint32_t extractImm16() const { return data & Imm16Mask; }
++
++ Instruction* next() { return this + 1; }
++
++ const uint32_t* raw() const { return &data; }
++ uint32_t size() const { return 4; }
++};
++
++static_assert(sizeof(Instruction) == 4);
++
++class InstNOP : public Instruction {
++ public:
++ InstNOP() : Instruction(NopInst) {}
++};
++
++// Register-register-register instruction (X-form and XO-form).
++class InstReg : public Instruction {
++ public:
++ explicit InstReg(PPCOpcodes op) : Instruction(op) {}
++ InstReg(PPCOpcodes op, Register rt, Register ra, Register rb)
++ : Instruction((uint32_t)op | RT(rt) | RA(ra) | RB(rb)) {}
++ InstReg(PPCOpcodes op, FloatRegister frt, FloatRegister fra,
++ FloatRegister frb)
++ : Instruction((uint32_t)op | RT(frt) | RA(fra) | RB(frb)) {}
++
++ void setRT(Register r) { data = (data & ~RTMask) | RT(r); }
++ void setRA(Register r) { data = (data & ~RAMask) | RA(r); }
++ void setRB(Register r) { data = (data & ~RBMask) | RB(r); }
++
++ void setImm16(uint32_t imm) {
++ data = (data & 0xFFFF0000) | (imm & Imm16Mask);
++ }
++ uint32_t extractImm16Value() const { return data & Imm16Mask; }
++};
++
++// Register-immediate instruction (D-form).
++// Bits 21-25 hold RT (loads, addi) or RS (stores, ori). Both encode identically
++// since RT and RS occupy the same field; the caller simply passes the right
++// register.
++class InstImm : public Instruction {
++ public:
++ explicit InstImm(PPCOpcodes op) : Instruction(op) {}
++ InstImm(PPCOpcodes op, Register rt, Register ra, uint32_t imm16)
++ : Instruction((uint32_t)op | RT(rt) | RA(ra) | (imm16 & Imm16Mask)) {}
++
++ void setRT(Register r) { data = (data & ~RTMask) | RT(r); }
++ void setRA(Register r) { data = (data & ~RAMask) | RA(r); }
++
++ void setImm16(uint32_t imm) {
++ data = (data & 0xFFFF0000) | (imm & Imm16Mask);
++ }
++ void setLowerReg(Register rl) {
++ data = (data & 0xFFE0FFFF) | ((uint32_t)rl.code() << 16);
++ }
++ uint32_t extractImm16Value() const { return data & Imm16Mask; }
++
++ // Extract the TrapTag from a tagged trap instruction (tw).
++ // Defined in Assembler-ppc64.cpp. Returns a TrapTag value as uint8_t
++ // because Assembler::TrapTag is not yet defined at this point in the header.
++ uint8_t traptag();
++};
++
++// A BOffImm16 is a 16-bit signed branch offset for conditional branches
++// (bc-form instructions). The offset is stored in bits 2..15 and is
++// 4-byte aligned, giving a range of +/-32 KB.
++class BOffImm16 {
++ int32_t data;
++
++ public:
++ uint32_t encode() const {
++ MOZ_ASSERT(!isInvalid());
++ return static_cast<uint32_t>(data) & 0xFFFC;
++ }
++ int32_t decode() const {
++ MOZ_ASSERT(!isInvalid());
++ return data;
++ }
++
++ explicit BOffImm16(int offset) : data(offset) {
++ MOZ_ASSERT((offset & 0x3) == 0);
++ MOZ_ASSERT(IsInRange(offset));
++ }
++ static bool IsInRange(int offset) {
++ return offset >= -32768 && offset <= 32764;
++ }
++
++ static const int32_t INVALID = 0x00020000;
++ BOffImm16() : data(INVALID) {}
++
++ bool isInvalid() const { return data == INVALID; }
++
++ Instruction* getDest(Instruction* src) const;
++
++ explicit BOffImm16(InstImm inst);
++};
++
++// A JOffImm26 is a 26-bit signed branch offset for unconditional branches
++// (b/bl instructions). Bits 2..25 encode the offset, 4-byte aligned,
++// giving a range of +/-32 MB.
++class JOffImm26 {
++ int32_t data;
++
++ public:
++ uint32_t encode() const {
++ MOZ_ASSERT(!isInvalid());
++ return static_cast<uint32_t>(data) & 0x03FFFFFC;
++ }
++ int32_t decode() const {
++ MOZ_ASSERT(!isInvalid());
++ return data;
++ }
++
++ explicit JOffImm26(int offset) : data(offset) {
++ MOZ_ASSERT((offset & 0x3) == 0);
++ MOZ_ASSERT(IsInRange(offset));
++ }
++ static bool IsInRange(int offset) {
++ return offset >= -33554432 && offset <= 33554428;
++ }
++
++ static const int32_t INVALID = 0x20000000;
++ JOffImm26() : data(INVALID) {}
++
++ bool isInvalid() const { return data == INVALID; }
++
++ Instruction* getDest(Instruction* src) const;
++};
++
++// A 16-bit immediate value used in D-form instructions.
++class Imm16 {
++ int32_t value;
++
++ public:
++ Imm16();
++ explicit Imm16(uint32_t imm) : value(imm) {}
++ uint32_t encode() const { return static_cast<uint32_t>(value) & 0xffff; }
++ int32_t decodeSigned() const { return value; }
++ uint32_t decodeUnsigned() const { return value; }
++ static bool IsInSignedRange(int32_t imm) {
++ return imm >= INT16_MIN && imm <= INT16_MAX;
++ }
++ static bool IsInUnsignedRange(uint32_t imm) { return imm <= UINT16_MAX; }
++ static Imm16 Lower(Imm32 imm) { return Imm16(imm.value & 0xffff); }
++ static Imm16 Upper(Imm32 imm) { return Imm16((imm.value >> 16) & 0xffff); }
++};
++
++class Imm8 {
++ uint8_t value;
++
++ public:
++ Imm8();
++ explicit Imm8(uint32_t imm) : value(imm) {}
++ uint32_t encode(uint32_t shift) const { return value << shift; }
++ int32_t decodeSigned() const { return value; }
++ uint32_t decodeUnsigned() const { return value; }
++ static bool IsInSignedRange(int32_t imm) {
++ return imm >= INT8_MIN && imm <= INT8_MAX;
++ }
++ static bool IsInUnsignedRange(uint32_t imm) { return imm <= UINT8_MAX; }
++ static Imm8 Lower(Imm16 imm) { return Imm8(imm.decodeSigned() & 0xff); }
++ static Imm8 Upper(Imm16 imm) {
++ return Imm8((imm.decodeSigned() >> 8) & 0xff);
++ }
++};
++
++class Operand {
++ public:
++ enum Tag { REG, FREG, MEM };
++
++ private:
++ Tag tag : 3;
++ uint32_t reg : 5;
++ int32_t offset;
++
++ public:
++ MOZ_IMPLICIT Operand(Register reg_) : tag(REG), reg(reg_.code()) {}
++
++ explicit Operand(FloatRegister freg) : tag(FREG), reg(freg.code()) {}
++
++ Operand(Register base, Imm32 off)
++ : tag(MEM), reg(base.code()), offset(off.value) {}
++
++ Operand(Register base, int32_t off)
++ : tag(MEM), reg(base.code()), offset(off) {}
++
++ explicit Operand(const Address& addr)
++ : tag(MEM), reg(addr.base.code()), offset(addr.offset) {}
++
++ Tag getTag() const { return tag; }
++
++ Register toReg() const {
++ MOZ_ASSERT(tag == REG);
++ return Register::FromCode(reg);
++ }
++
++ FloatRegister toFReg() const {
++ MOZ_ASSERT(tag == FREG);
++ return FloatRegister::FromCode(reg);
++ }
++
++ void toAddr(Register* r, Imm32* dest) const {
++ MOZ_ASSERT(tag == MEM);
++ *r = Register::FromCode(reg);
++ *dest = Imm32(offset);
++ }
++ Address toAddress() const {
++ MOZ_ASSERT(tag == MEM);
++ return Address(Register::FromCode(reg), offset);
++ }
++ int32_t disp() const {
++ MOZ_ASSERT(tag == MEM);
++ return offset;
++ }
++
++ int32_t base() const {
++ MOZ_ASSERT(tag == MEM);
++ return reg;
++ }
++ Register baseReg() const {
++ MOZ_ASSERT(tag == MEM);
++ return Register::FromCode(reg);
++ }
++};
++
++// Bug 2034064 collapsed the per-buffer compile-time configuration of
++// AssemblerBufferWithConstantPools into AssemblerBufferSettings, and reduced
++// the runtime ctor to (poolMaxOffset, nopFill). instBufferAlign and the
++// NumShortBranchRanges template arg were dropped: PPC64 previously passed
++// instBufferAlign=8 (unused on this backend; pool entries are 4-byte aligned)
++// and NumShortBranchRanges=0.
++using PPCBuffer = js::jit::AssemblerBufferWithConstantPools<
++ Instruction, Assembler,
++ js::jit::AssemblerBufferSettings{
++ .instSize = 4,
++ .guardSize = 1,
++ .headerSize = 1,
++ .pcBias = 0,
++ .alignFillInst = NopInst,
++ .nopFillInst = NopInst,
++ }>;
++
++// Inherits executableCopy() and appendRawCode() from
++// AssemblerBufferWithConstantPools, which assert pool is flushed.
++class PPCBufferWithExecutableCopy : public PPCBuffer {
++ public:
++ PPCBufferWithExecutableCopy(size_t poolMaxOffset, unsigned nopFill)
++ : PPCBuffer(poolMaxOffset, nopFill) {}
++};
++
++class Assembler : public AssemblerShared {
++ public:
++ // Trap tags encoded in the low bits of a trap word.
++ // FreeBSD and others may use r1 in their trap word, so bit 0 is avoided.
++ enum TrapTag {
++ BTag = 2,
++ BCTag = 4,
++ CallTag = 6,
++ DebugTag0 = 10,
++ DebugTag1 = 12,
++ DebugTag2 = 14
++ };
++
++ // Pool load types encoded in bits 21-22 of pool hint words.
++ // Used by InsertIndexIntoTag / PatchConstantPoolLoad.
++ enum PoolLoadType {
++ PoolLoadFPR64 = 1, // lfd fD, offset(rBase)
++ PoolLoadSimd128 = 2, // addi rBase, rBase, offset; lxvx vsD, 0, rBase
++ PoolLoadFPR32 = 3 // lfs fD, offset(rBase) — auto-expands to double
++ };
++
++ enum BranchBits {
++ BranchOnClear = 0x04,
++ BranchOnSet = 0x0c,
++ BranchOptionMask = 0x0f,
++ BranchOptionInvert = 0x08
++ };
++
++ // PPC condition encoding. The top nybble is the offset to the CR field
++ // (the x in BIF*4+x), and the bottom is the BO field.
++ // Synthetic flags sit in the MSB and are masked off before use.
++ enum Condition {
++ ConditionUnsigned = 0x100,
++ ConditionUnsignedHandled = 0x2ff,
++ ConditionZero = 0x400,
++ ConditionOnlyXER = 0x200,
++ ConditionXERCA = 0x23c,
++ ConditionXERNCA = 0x234,
++ ConditionXEROV = 0x21c,
++
++ Equal = 0x2c,
++ NotEqual = 0x24,
++ GreaterThan = 0x1c,
++ GreaterThanOrEqual = 0x04,
++ LessThan = 0x0c,
++ LessThanOrEqual = 0x14,
++
++ Above = GreaterThan | ConditionUnsigned,
++ AboveOrEqual = GreaterThanOrEqual | ConditionUnsigned,
++ Below = LessThan | ConditionUnsigned,
++ BelowOrEqual = LessThanOrEqual | ConditionUnsigned,
++
++ Signed = LessThan | ConditionZero,
++ NotSigned = GreaterThanOrEqual | ConditionZero,
++ Zero = Equal | ConditionZero,
++ NonZero = NotEqual | ConditionZero,
++
++ Overflow = ConditionXEROV,
++ NotOverflow = ConditionOnlyXER | LessThanOrEqual,
++ CarrySet = ConditionXERCA,
++ CarryClear = ConditionXERNCA,
++
++ Always = 0x1f,
++ SOBit = 0x3c,
++ NSOBit = 0x34
++ };
++
++ enum DoubleCondition {
++ DoubleConditionUnordered = 0x100,
++ DoubleOrdered = 0x34,
++ DoubleEqual = 0x2c,
++ DoubleNotEqual = 0x24,
++ DoubleGreaterThan = 0x1c,
++ DoubleGreaterThanOrEqual = 0x04,
++ DoubleLessThan = 0x0c,
++ DoubleLessThanOrEqual = 0x14,
++ DoubleUnordered = 0x3c,
++ DoubleEqualOrUnordered = DoubleEqual | DoubleConditionUnordered,
++ DoubleNotEqualOrUnordered = DoubleNotEqual | DoubleConditionUnordered,
++ DoubleGreaterThanOrUnordered = DoubleGreaterThan | DoubleConditionUnordered,
++ DoubleGreaterThanOrEqualOrUnordered =
++ DoubleGreaterThanOrEqual | DoubleConditionUnordered,
++ DoubleLessThanOrUnordered = DoubleLessThan | DoubleConditionUnordered,
++ DoubleLessThanOrEqualOrUnordered =
++ DoubleLessThanOrEqual | DoubleConditionUnordered,
++ };
++
++ enum JumpOrCall { BranchIsJump, BranchIsCall };
++
++ enum LinkBit {
++ DontLinkB = 0,
++ LinkB = 1,
++ };
++
++ enum LikelyBit {
++ NotLikelyB = 0,
++ LikelyB = 1,
++ };
++
++ enum BranchAddressType {
++ RelativeBranch = 0,
++ AbsoluteBranch = 2,
++ };
++
++ enum FloatFormat { SingleFloat, DoubleFloat };
++ enum FloatTestKind { TestForTrue, TestForFalse };
++
++ BufferOffset nextOffset() { return m_buffer.nextOffset(); }
++
++ protected:
++ Instruction* editSrc(BufferOffset bo) {
++ if (!bo.assigned()) {
++ // Under OOM, writeInst may return an unassigned BufferOffset.
++ // Return a dummy writable area so callers (WriteLoad64Instructions)
++ // can proceed harmlessly; the compilation will be discarded.
++ static uint32_t oomDummy_[8];
++ return (Instruction*)oomDummy_;
++ }
++ return m_buffer.getInst(bo);
++ }
++
++ struct RelativePatch {
++ BufferOffset offset;
++ void* target;
++ RelocationKind kind;
++
++ RelativePatch(BufferOffset offset, void* target, RelocationKind kind)
++ : offset(offset), target(target), kind(kind) {}
++ };
++
++ js::Vector<RelativePatch, 8, SystemAllocPolicy> jumps_;
++
++ CompactBufferWriter jumpRelocations_;
++ CompactBufferWriter dataRelocations_;
++
++ PPCBufferWithExecutableCopy m_buffer;
++
++#ifdef JS_JITSPEW
++ Sprinter* printer;
++#endif
++
++ public:
++ // Which absolute bit number does a CR + Condition pair refer to?
++ static uint8_t crBit(CRegisterID cr, Condition cond) {
++ return (cr << 2) + ((cond & 0xf0) >> 4);
++ }
++ static uint8_t crBit(CRegisterID cr, DoubleCondition cond) {
++ return (cr << 2) + ((cond & 0xf0) >> 4);
++ }
++
++ Assembler()
++ : m_buffer(/* poolMaxOffset */ 8192, /* nopFill */ 0),
++#ifdef JS_JITSPEW
++ printer(nullptr),
++#endif
++ isFinished(false),
++ scratch_register_list_((1 << Registers::r11) | (1 << Registers::r12)) {
++ }
++
++ void setUnlimitedBuffer() { m_buffer.setUnlimited(); }
++
++ // Constant pool callbacks required by AssemblerBufferWithConstantPools.
++ static void InsertIndexIntoTag(uint8_t* load, uint32_t index);
++ static bool PatchConstantPoolLoad(void* loadAddr, void* constPoolAddr);
++ static void WritePoolGuard(BufferOffset branch, Instruction* inst,
++ BufferOffset dest);
++ static void WritePoolHeader(uint8_t* start, js::jit::Pool* p, bool isNatural);
++ static void PatchShortRangeBranchToVeneer(PPCBuffer*, unsigned rangeIdx,
++ BufferOffset deadline,
++ BufferOffset veneer);
++
++ static Condition InvertCondition(Condition cond);
++ static DoubleCondition InvertCondition(DoubleCondition cond);
++
++ void writeRelocation(BufferOffset src) {
++ jumpRelocations_.writeUnsigned(src.getOffset());
++ }
++
++ void writeDataRelocation(ImmGCPtr ptr) {
++ if (ptr.value) {
++ if (gc::IsInsideNursery(ptr.value)) {
++ embedsNurseryPointers_ = true;
++ }
++ dataRelocations_.writeUnsigned(nextOffset().getOffset());
++ }
++ }
++ void writeDataRelocation(BufferOffset bo, ImmGCPtr ptr) {
++ if (ptr.value) {
++ if (gc::IsInsideNursery(ptr.value)) {
++ embedsNurseryPointers_ = true;
++ }
++ dataRelocations_.writeUnsigned(bo.getOffset());
++ }
++ }
++
++ void assertNoGCThings() const {
++#ifdef DEBUG
++ MOZ_ASSERT(dataRelocations_.length() == 0);
++ for (auto& j : jumps_) {
++ MOZ_ASSERT(j.kind == RelocationKind::HARDCODED);
++ }
++#endif
++ }
++
++ bool oom() const;
++
++ void setPrinter(Sprinter* sp) {
++#ifdef JS_JITSPEW
++ printer = sp;
++#endif
++ }
++
++#ifdef JS_JITSPEW
++ inline void spew(const char* fmt, ...) MOZ_FORMAT_PRINTF(2, 3) {
++ if (MOZ_UNLIKELY(printer || JitSpewEnabled(JitSpew_Codegen))) {
++ va_list va;
++ va_start(va, fmt);
++ spewVA(fmt, va);
++ va_end(va);
++ }
++ }
++ MOZ_COLD void spewVA(const char* fmt, va_list va) MOZ_FORMAT_PRINTF(2, 0) {
++ char buf[200];
++ int i = VsprintfLiteral(buf, fmt, va);
++ if (i > -1) {
++ if (printer) {
++ printer->printf("%s\n", buf);
++ }
++ js::jit::JitSpew(js::jit::JitSpew_Codegen, "%s", buf);
++ }
++ }
++#else
++ MOZ_ALWAYS_INLINE void spew(const char* fmt, ...) MOZ_FORMAT_PRINTF(2, 3) {}
++#endif
++
++ Register getStackPointer() const { return StackPointer; }
++
++ protected:
++ bool isFinished;
++
++ public:
++ static uintptr_t GetPointer(uint8_t*);
++ void flush() {
++ MOZ_ASSERT(!isFinished);
++ m_buffer.flushPool();
++ }
++ // Inhibit pool flushes for the next maxInst instructions. Mirrors the
++ // ARM/ARM64 wrappers; lets shared code (e.g. WasmFrameIter epilogues
++ // that need static byte distances between currentOffset() captures)
++ // fence a small instruction window without reaching into m_buffer.
++ void enterNoPool(size_t maxInst) { m_buffer.enterNoPool(maxInst); }
++ void leaveNoPool() { m_buffer.leaveNoPool(); }
++ void finish();
++ bool appendRawCode(const uint8_t* code, size_t numBytes);
++ bool reserve(size_t size);
++ bool swapBuffer(wasm::Bytes& bytes);
++ void executableCopy(void* buffer);
++ void copyJumpRelocationTable(uint8_t* dest);
++ void copyDataRelocationTable(uint8_t* dest);
++
++ size_t size() const;
++ size_t jumpRelocationTableBytes() const;
++ size_t dataRelocationTableBytes() const;
++ size_t bytesNeeded() const;
++
++ BufferOffset writeInst(uint32_t x, uint32_t* dest = nullptr);
++ static void WriteInstStatic(uint32_t x, uint32_t* dest);
++
++ public:
++ BufferOffset haltingAlign(int alignment);
++ BufferOffset nopAlign(int alignment);
++ BufferOffset as_nop();
++
++ // --- Instruction emission (declarations only, implemented in later commits)
++
++ // Branch instructions.
++ uint16_t computeConditionCode(Condition op, CRegisterID cr = cr0);
++ uint16_t computeConditionCode(DoubleCondition cond, CRegisterID cr = cr0);
++ BufferOffset as_b(JOffImm26 off, BranchAddressType bat = RelativeBranch,
++ LinkBit lb = DontLinkB);
++ BufferOffset as_b(int32_t off, BranchAddressType bat = RelativeBranch,
++ LinkBit lb = DontLinkB);
++ BufferOffset as_blr(LinkBit lb = DontLinkB);
++ BufferOffset as_bctr(LinkBit lb = DontLinkB);
++ BufferOffset as_bc(BOffImm16 off, Condition cond, CRegisterID cr = cr0,
++ LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++ BufferOffset as_bc(int16_t off, Condition cond, CRegisterID cr = cr0,
++ LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++ BufferOffset as_bc(BOffImm16 off, DoubleCondition cond, CRegisterID cr = cr0,
++ LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++ BufferOffset as_bc(int16_t off, DoubleCondition cond, CRegisterID cr = cr0,
++ LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++ BufferOffset as_bcctr(Condition cond, CRegisterID cr = cr0,
++ LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++ BufferOffset as_bcctr(DoubleCondition cond, CRegisterID cr = cr0,
++ LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
++ BufferOffset as_bc(int16_t off, uint16_t op, LikelyBit lkb = NotLikelyB,
++ LinkBit lb = DontLinkB);
++ BufferOffset as_bcctr(uint16_t op, LikelyBit lkb = NotLikelyB,
++ LinkBit lb = DontLinkB);
++
++ // SPR operations.
++ BufferOffset as_mtspr(SPRegisterID spr, Register ra);
++ BufferOffset as_mfspr(Register rd, SPRegisterID spr);
++
++ // CR operations.
++ BufferOffset as_crand(uint8_t t, uint8_t a, uint8_t b);
++ BufferOffset as_crandc(uint8_t t, uint8_t a, uint8_t b);
++ BufferOffset as_cror(uint8_t t, uint8_t a, uint8_t b);
++ BufferOffset as_crorc(uint8_t t, uint8_t a, uint8_t b);
++ BufferOffset as_crxor(uint8_t t, uint8_t a, uint8_t b);
++ BufferOffset as_mtcrf(uint32_t mask, Register rs);
++ BufferOffset as_mfocrf(Register rd, CRegisterID crfs);
++ BufferOffset as_mcrxrx(CRegisterID crt);
++
++ // Compare instructions.
++ BufferOffset as_cmpd(CRegisterID cr, Register ra, Register rb);
++ BufferOffset as_cmpdi(CRegisterID cr, Register ra, int16_t im);
++ BufferOffset as_cmpld(CRegisterID cr, Register ra, Register rb);
++ BufferOffset as_cmpldi(CRegisterID cr, Register ra, int16_t im);
++ BufferOffset as_cmpw(CRegisterID cr, Register ra, Register rb);
++ BufferOffset as_cmpwi(CRegisterID cr, Register ra, int16_t im);
++ BufferOffset as_cmplw(CRegisterID cr, Register ra, Register rb);
++ BufferOffset as_cmplwi(CRegisterID cr, Register ra, int16_t im);
++ BufferOffset as_cmpd(Register ra, Register rb);
++ BufferOffset as_cmpdi(Register ra, int16_t im);
++ BufferOffset as_cmpld(Register ra, Register rb);
++ BufferOffset as_cmpldi(Register ra, int16_t im);
++ BufferOffset as_cmpw(Register ra, Register rb);
++ BufferOffset as_cmpwi(Register ra, int16_t im);
++ BufferOffset as_cmplw(Register ra, Register rb);
++ BufferOffset as_cmplwi(Register ra, int16_t im);
++
++ // ALU (three-register).
++ BufferOffset as_add(Register rd, Register ra, Register rb);
++ BufferOffset as_addc(Register rd, Register ra, Register rb);
++ BufferOffset as_adde(Register rd, Register ra, Register rb);
++ BufferOffset as_subf(Register rd, Register ra, Register rb);
++ BufferOffset as_subfc(Register rd, Register ra, Register rb);
++ BufferOffset as_subfe(Register rd, Register ra, Register rb);
++ BufferOffset as_neg(Register rd, Register rs);
++
++ BufferOffset as_mulld(Register rd, Register ra, Register rb);
++ BufferOffset as_mulhd(Register rd, Register ra, Register rb);
++ BufferOffset as_mulhdu(Register rd, Register ra, Register rb);
++ BufferOffset as_mulldo(Register rd, Register ra, Register rb);
++ BufferOffset as_mullw(Register rd, Register ra, Register rb);
++ BufferOffset as_mulhwu(Register rd, Register ra, Register rb);
++
++ BufferOffset as_divd(Register rd, Register ra, Register rb);
++ BufferOffset as_divdu(Register rd, Register ra, Register rb);
++ BufferOffset as_divw(Register rd, Register ra, Register rb);
++ BufferOffset as_divwu(Register rd, Register ra, Register rb);
++ // POWER9 modulo.
++ BufferOffset as_modsd(Register rd, Register ra, Register rb);
++ BufferOffset as_modsw(Register rd, Register ra, Register rb);
++ BufferOffset as_modud(Register rd, Register ra, Register rb);
++ BufferOffset as_moduw(Register rd, Register ra, Register rb);
++
++ // ALU immediate.
++ BufferOffset as_addi(Register rd, Register ra, int16_t im,
++ bool actually_li = false);
++ BufferOffset as_addis(Register rd, Register ra, int16_t im,
++ bool actually_lis = false);
++ BufferOffset as_mulli(Register rd, Register ra, int16_t im);
++ BufferOffset as_subfic(Register rd, Register ra, int16_t im);
++
++ // ALU unary/extended.
++ BufferOffset as_cntlzw(Register rd, Register ra);
++ BufferOffset as_cntlzd(Register rd, Register ra);
++ BufferOffset as_cnttzd(Register rd, Register ra);
++ BufferOffset as_cnttzw(Register rd, Register ra);
++ BufferOffset as_popcntd(Register ra, Register rs);
++ BufferOffset as_popcntw(Register ra, Register rs);
++ // POWER10 byte-reverse doubleword: ra = bswap64(rs). 1 insn replacing the
++ // POWER9 mtvsrd / xxbrd / mfvsrd round-trip in byteSwap64.
++ BufferOffset as_brd(Register ra, Register rs);
++ // POWER10 byte-reverse each halfword (4 halfwords) / each word (2 words)
++ // in the 64-bit doubleword. The wasm/asm caller usually masks or
++ // sign-extends the low halfword/word afterwards.
++ BufferOffset as_brh(Register ra, Register rs);
++ BufferOffset as_brw(Register ra, Register rs);
++
++ // Bit operations (logical, three-register).
++ BufferOffset as_and_(Register rd, Register rs, Register rb);
++ BufferOffset as_and__rc(Register rd, Register rs, Register rb);
++ BufferOffset as_nor(Register rd, Register rs, Register rb);
++ BufferOffset as_or_(Register rd, Register rs, Register rb);
++ BufferOffset as_xor_(Register rd, Register rs, Register rb);
++ BufferOffset as_slw(Register rd, Register rs, Register rb);
++ BufferOffset as_srw(Register rd, Register rs, Register rb);
++ BufferOffset as_sraw(Register rd, Register rs, Register rb);
++ BufferOffset as_sld(Register rd, Register rs, Register rb);
++ BufferOffset as_srd(Register rd, Register rs, Register rb);
++ BufferOffset as_srad(Register rd, Register rs, Register rb);
++
++ // Bit operations (logical, immediate).
++ BufferOffset as_ori(Register rd, Register ra, uint16_t im);
++ BufferOffset as_oris(Register rd, Register ra, uint16_t im);
++ BufferOffset as_xori(Register rd, Register ra, uint16_t im);
++ BufferOffset as_xoris(Register rd, Register ra, uint16_t im);
++ BufferOffset as_andi_rc(Register rd, Register ra, uint16_t im);
++
++ // Sign extension.
++ BufferOffset as_extsb(Register rd, Register rs);
++ BufferOffset as_extsh(Register rd, Register rs);
++ BufferOffset as_extsw(Register rd, Register rs);
++ BufferOffset as_extsw_rc(Register rd, Register rs);
++
++ // Shift/rotate with immediates.
++ BufferOffset as_srawi(Register id, Register rs, uint8_t n);
++ BufferOffset as_sradi(Register rd, Register rs, int n);
++ BufferOffset as_rldcl(Register ra, Register rs, Register rb, uint8_t mb);
++ BufferOffset as_rldicl(Register ra, Register rs, uint8_t sh, uint8_t mb);
++ BufferOffset as_rldicl_rc(Register ra, Register rs, uint8_t sh, uint8_t mb);
++ BufferOffset as_rldicr(Register ra, Register rs, uint8_t sh, uint8_t mb);
++ BufferOffset as_rldicr_rc(Register ra, Register rs, uint8_t sh, uint8_t mb);
++ BufferOffset as_rlwinm(Register rd, Register rs, uint8_t sh, uint8_t mb,
++ uint8_t me);
++ BufferOffset as_rlwinm_rc(Register rd, Register rs, uint8_t sh, uint8_t mb,
++ uint8_t me);
++ BufferOffset as_rlwimi(Register rd, Register rs, uint8_t sh, uint8_t mb,
++ uint8_t me);
++ BufferOffset as_rldimi(Register rd, Register rs, uint8_t sh, uint8_t mb);
++ BufferOffset as_rlwnm(Register rd, Register rs, Register rb, uint8_t mb,
++ uint8_t me);
++
++ // Integer loads (D-form).
++ BufferOffset as_lbz(Register rd, Register rb, int16_t off);
++ BufferOffset as_lha(Register rd, Register rb, int16_t off);
++ BufferOffset as_lhz(Register rd, Register rb, int16_t off);
++ BufferOffset as_lwa(Register rd, Register rb, int16_t off);
++ BufferOffset as_lwz(Register rd, Register rb, int16_t off);
++ BufferOffset as_ld(Register rd, Register rb, int16_t off);
++
++ // Integer stores (D-form).
++ BufferOffset as_stb(Register rd, Register rb, int16_t off);
++ BufferOffset as_sth(Register rd, Register rb, int16_t off);
++ BufferOffset as_stw(Register rd, Register rb, int16_t off);
++ BufferOffset as_std(Register rd, Register rb, int16_t off);
++ BufferOffset as_stdu(Register rd, Register rb, int16_t off);
++
++ // Integer loads/stores (X-form, indexed).
++ BufferOffset as_lbzx(Register rd, Register ra, Register rb);
++ BufferOffset as_lhax(Register rd, Register ra, Register rb);
++ BufferOffset as_lhzx(Register rd, Register ra, Register rb);
++ BufferOffset as_lwzx(Register rd, Register ra, Register rb);
++ // X-form sign-extending word load. Single-insn equivalent of lwzx + extsw.
++ BufferOffset as_lwax(Register rd, Register ra, Register rb);
++ BufferOffset as_lwarx(Register rd, Register ra, Register rb);
++ BufferOffset as_lbarx(Register rd, Register ra, Register rb);
++ BufferOffset as_lharx(Register rd, Register ra, Register rb);
++ BufferOffset as_ldx(Register rd, Register ra, Register rb);
++ BufferOffset as_ldarx(Register rd, Register ra, Register rb);
++ BufferOffset as_stbx(Register rd, Register ra, Register rb);
++ BufferOffset as_stbcx(Register rd, Register ra, Register rb);
++ BufferOffset as_stwx(Register rd, Register ra, Register rb);
++ BufferOffset as_stwbrx(Register rd, Register ra, Register rb);
++ BufferOffset as_sthx(Register rd, Register ra, Register rb);
++ BufferOffset as_sthcx(Register rd, Register ra, Register rb);
++ BufferOffset as_stdx(Register rd, Register ra, Register rb);
++ BufferOffset as_stdcx(Register rd, Register ra, Register rb);
++ BufferOffset as_stwcx(Register rd, Register ra, Register rb);
++
++ // Integer select.
++ // POWER10 (ISA 3.1). Set RT = 1/0 based on a CR bit.
++ BufferOffset as_setbc(Register rt, uint16_t bc, CRegisterID cr);
++ BufferOffset as_setbcr(Register rt, uint16_t bc, CRegisterID cr);
++ BufferOffset as_isel(Register rt, Register ra, Register rb, uint16_t rc,
++ CRegisterID cr = cr0);
++ BufferOffset as_isel0(Register rt, Register ra, Register rb, uint16_t rc,
++ CRegisterID cr = cr0);
++
++ // FP compare.
++ BufferOffset as_fcmpu(CRegisterID cr, FloatRegister ra, FloatRegister rb);
++ BufferOffset as_fcmpu(FloatRegister ra, FloatRegister rb);
++
++ // FP arithmetic (two-source).
++ BufferOffset as_fadd(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fadds(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fsub(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fsubs(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fdiv(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fdivs(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fmul(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fmuls(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ BufferOffset as_fcpsgn(FloatRegister rd, FloatRegister ra, FloatRegister rc);
++ // FP unary.
++ BufferOffset as_fabs(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fneg(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fmr(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fsqrt(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fsqrts(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_frsp(FloatRegister rd, FloatRegister rs);
++
++ // FP conversions.
++ BufferOffset as_fcfid(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fcfids(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fcfidu(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fcfidus(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fctid(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fctidz(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fctiduz(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_fctiwz(FloatRegister rd, FloatRegister rs);
++
++ // FP rounding.
++ BufferOffset as_frim(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_frip(FloatRegister rd, FloatRegister rs);
++ BufferOffset as_friz(FloatRegister rd, FloatRegister rs);
++
++ // FP loads (D-form).
++ BufferOffset as_lfd(FloatRegister rd, Register rb, int16_t off);
++ BufferOffset as_lfs(FloatRegister rd, Register rb, int16_t off);
++
++ // FP stores (D-form).
++ BufferOffset as_stfd(FloatRegister rd, Register rb, int16_t off);
++ BufferOffset as_stfs(FloatRegister rd, Register rb, int16_t off);
++ BufferOffset as_stfdu(FloatRegister rd, Register rb, int16_t off);
++ BufferOffset as_stfsu(FloatRegister rd, Register rb, int16_t off);
++
++ // FP loads/stores (X-form, indexed).
++ BufferOffset as_lfdx(FloatRegister rd, Register ra, Register rb);
++ BufferOffset as_lfsx(FloatRegister rd, Register ra, Register rb);
++ BufferOffset as_lfiwax(FloatRegister rd, Register ra, Register rb);
++ BufferOffset as_stfdx(FloatRegister rd, Register ra, Register rb);
++ BufferOffset as_stfsx(FloatRegister rd, Register ra, Register rb);
++
++ // FPSCR operations.
++ BufferOffset as_mtfsb0(uint8_t bt);
++ BufferOffset as_mcrfs(CRegisterID bf, uint8_t bfa);
++
++ // VSX (FPR-only subset).
++ BufferOffset as_mfvsrd(Register ra, FloatRegister xs);
++ BufferOffset as_mtvsrd(FloatRegister xs, Register ra);
++ // POWER8+ (ISA 2.07). Sign-extending move of RA's low 32 bits to FPR.
++ BufferOffset as_mtvsrwa(FloatRegister xs, Register ra);
++ BufferOffset as_mtvsrwz(FloatRegister xs, Register ra);
++ BufferOffset as_mtvsrws(FloatRegister xs, Register ra);
++ BufferOffset as_xxbrd(FloatRegister xt, FloatRegister xb);
++ // POWER9 scalar VSX max/min with Java/JavaScript semantics (matches
++ // ECMA-262 Math.max / Math.min). Operate on FPR-space (encoding 0..31).
++ BufferOffset as_xsmaxjdp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xsminjdp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xscvdpspn(FloatRegister xt, FloatRegister xb);
++ BufferOffset as_xscvspdpn(FloatRegister xt, FloatRegister xb);
++ // POWER9 (ISA 3.0) scalar FP16 conversions.
++ BufferOffset as_xscvdphp(FloatRegister xt, FloatRegister xb);
++ BufferOffset as_xscvhpdp(FloatRegister xt, FloatRegister xb);
++ // POWER9 (ISA 3.0) scalar extract biased exponent.
++ BufferOffset as_xsxexpdp(FloatRegister xt, FloatRegister xb);
++ // POWER9 (ISA 3.0) scalar FP16 load/store, X-form indexed.
++ BufferOffset as_lxsihzx(FloatRegister xt, Register ra, Register rb);
++ BufferOffset as_stxsihx(FloatRegister xs, Register ra, Register rb);
++
++ // VSX SIMD load/store (X-form, indexed).
++ BufferOffset as_lxvx(FloatRegister xt, Register ra, Register rb);
++ BufferOffset as_stxvx(FloatRegister xs, Register ra, Register rb);
++ BufferOffset as_lxvd2x(FloatRegister xt, Register ra, Register rb);
++ BufferOffset as_stxvd2x(FloatRegister xs, Register ra, Register rb);
++
++ // VMX SIMD load/store (X-form, indexed). Take a raw VR number (0-31)
++ // because VR20-VR31 are outside the FloatRegister encoding (which only
++ // covers VSR0-31 = f0-f31). Used by the JIT trampoline to save/restore
++ // the ELFv2 callee-saved VR20-VR31. EA is force-aligned to 16 bytes
++ // (low 4 bits of the address are ignored), so the slot's alignment
++ // matters for layout but not for trap avoidance.
++ BufferOffset as_lvx(uint8_t vrt, Register ra, Register rb);
++ BufferOffset as_stvx(uint8_t vrs, Register ra, Register rb);
++
++ // VSX SIMD register operations (XX3-form / XX1-form / XX2-form).
++ BufferOffset as_xxlor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++
++ // VSX bitwise operations (XX3-form).
++ BufferOffset as_xxland(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++ BufferOffset as_xxlxor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++ BufferOffset as_xxlnor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++ BufferOffset as_xxlandc(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++ BufferOffset as_xxsel(FloatRegister xt, FloatRegister xa, FloatRegister xb,
++ FloatRegister xc);
++
++ // VMX integer arithmetic (VR0-31 = VSR32-63 only).
++ // Callers must ensure operands are in VR space.
++ BufferOffset as_vaddubm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vadduhm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vadduwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vaddudm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsububm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsubuhm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsubuwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsubudm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vaddsbs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vaddshs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vaddubs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vadduhs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsubsbs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsubshs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsububs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsubuhs(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vminsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vminsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vminsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxsd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vminub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vminuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vminuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmaxuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ // POWER9 (ISA 3.0): per-lane integer negate.
++ BufferOffset as_vnegw(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vnegd(uint8_t vrt, uint8_t vrb);
++ // POWER9 (ISA 3.0): addpcis rT, D. Computes rT = (CIA + 4) + (D << 16).
++ // D is a 16-bit signed immediate; DX-form splits D across three instruction
++ // fields (d0[16..25] ∥ d1[11..15] ∥ d2[31]). No LR clobber, no RAS hazard.
++ BufferOffset as_addpcis(Register rt, int16_t d);
++ // POWER10 (ISA 3.1) prefixed instructions. Each emits 8 bytes (prefix +
++ // suffix) with a single nop inserted before iff the prefix would
++ // straddle a 64-byte block. Caller must guarantee HasPOWER10().
++ // imm34 is signed 34-bit; R=true selects PC-relative form (RA must be r0).
++ // Returns the offset of the prefix word.
++ BufferOffset as_paddi(Register rt, Register ra, int64_t imm34, bool R);
++ BufferOffset as_pld(Register rt, Register ra, int64_t imm34, bool R);
++ BufferOffset as_plxv(uint8_t xt, Register ra, int64_t imm34, bool R);
++ // FP-target prefixed loads: plfd/plfs are MLS (Type=2) with suffix
++ // opcodes 50 and 48. plfs widens single → double in the FPR
++ // (matches non-prefixed lfs semantics).
++ BufferOffset as_plfd(FloatRegister frt, Register ra, int64_t imm34,
++ bool R);
++ BufferOffset as_plfs(FloatRegister frt, Register ra, int64_t imm34,
++ bool R);
++ // Prefixed-store counterparts. Same prefix shape; suffix opcodes are
++ // the D-form variants of std/stxv/stfd/stfs (61, 27, 54, 52).
++ BufferOffset as_pstd(Register rs, Register ra, int64_t imm34, bool R);
++ BufferOffset as_pstxv(uint8_t xs, Register ra, int64_t imm34, bool R);
++ BufferOffset as_pstfd(FloatRegister frs, Register ra, int64_t imm34,
++ bool R);
++ BufferOffset as_pstfs(FloatRegister frs, Register ra, int64_t imm34,
++ bool R);
++
++ private:
++ // Emit a nop before a prefixed instruction iff the prefix would otherwise
++ // start at offset 60 (mod 64) and the suffix would land in the next block.
++ void ensurePrefixedAlignment();
++
++ public:
++ BufferOffset as_vavgub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vavguh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmuluwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulld(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ // VMX shift (VR0-31 only).
++ BufferOffset as_vslb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vslh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vslw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsld(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrab(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrah(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsraw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsrad(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vslo(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vsro(uint8_t vrt, uint8_t vra, uint8_t vrb);
++
++ // VMX integer compare (VR0-31 only).
++ BufferOffset as_vcmpequb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpequh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpequw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpequd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ // Record forms set CR6: LT = all-true, EQ = none-true.
++ BufferOffset as_vcmpequb_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpequh_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpequw_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpequd_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtsd(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpgtud(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ // POWER9 (ISA 3.0). NotEqual compare; no doubleword variant.
++ BufferOffset as_vcmpneb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpneh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vcmpnew(uint8_t vrt, uint8_t vra, uint8_t vrb);
++
++ // VSX float compare (XX3-form, VSR0-63).
++ BufferOffset as_xvcmpeqsp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xvcmpgtsp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xvcmpgesp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xvcmpeqdp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xvcmpgtdp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++ BufferOffset as_xvcmpgedp(FloatRegister xt, FloatRegister xa,
++ FloatRegister xb);
++
++ // VSX float arithmetic (XX3-form binary, XX2-form unary).
++#define DECL_VSX_BIN(op) \
++ BufferOffset as_##op(FloatRegister xt, FloatRegister xa, FloatRegister xb);
++ DECL_VSX_BIN(xvaddsp)
++ DECL_VSX_BIN(xvadddp) DECL_VSX_BIN(xvsubsp) DECL_VSX_BIN(
++ xvsubdp) DECL_VSX_BIN(xvmulsp) DECL_VSX_BIN(xvmuldp) DECL_VSX_BIN(xvdivsp)
++ DECL_VSX_BIN(xvdivdp) DECL_VSX_BIN(xvminsp) DECL_VSX_BIN(
++ xvmindp) DECL_VSX_BIN(xvmaxsp) DECL_VSX_BIN(xvmaxdp)
++ DECL_VSX_BIN(xvmaddasp) DECL_VSX_BIN(xvmaddadp) DECL_VSX_BIN(
++ xvnmsubasp) DECL_VSX_BIN(xvnmsubadp)
++#undef DECL_VSX_BIN
++#define DECL_VSX_UN(op) \
++ BufferOffset as_##op(FloatRegister xt, FloatRegister xb);
++ DECL_VSX_UN(xvabssp) DECL_VSX_UN(xvabsdp) DECL_VSX_UN(xvnegsp)
++ DECL_VSX_UN(xvnegdp) DECL_VSX_UN(xvsqrtsp) DECL_VSX_UN(
++ xvsqrtdp) DECL_VSX_UN(xvrspip) DECL_VSX_UN(xvrdpip)
++ DECL_VSX_UN(xvrspim) DECL_VSX_UN(xvrdpim) DECL_VSX_UN(
++ xvrspiz) DECL_VSX_UN(xvrdpiz) DECL_VSX_UN(xvrspic)
++ DECL_VSX_UN(xvrdpic) DECL_VSX_UN(xvcvsxwsp)
++ DECL_VSX_UN(xvcvuxwsp) DECL_VSX_UN(xvcvsxwdp)
++ DECL_VSX_UN(xvcvuxwdp) DECL_VSX_UN(xvcvspsxws)
++ DECL_VSX_UN(xvcvspuxws)
++ DECL_VSX_UN(xvcvdpsxws)
++ DECL_VSX_UN(xvcvdpuxws)
++ DECL_VSX_UN(xvcvdpsp)
++ DECL_VSX_UN(xvcvspdp)
++#undef DECL_VSX_UN
++
++ // VMX widen/narrow/merge/pack (VR0-31 only).
++ BufferOffset as_vupkhsb(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vupklsb(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vupkhsh(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vupklsh(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vupkhsw(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vupklsw(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vpkshss(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vpkswss(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vpkshus(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vpkswus(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmrghb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmrghh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmrghw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmrglb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmrglh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmrglw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++
++ // VMX extended multiply (VR0-31 only).
++ BufferOffset as_vmulesb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulosb(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmuleub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmuloub(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulesh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulosh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmuleuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulouh(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulesw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulosw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmuleuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vmulouw(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ BufferOffset as_vpopcntb(uint8_t vrt, uint8_t vrb);
++ BufferOffset as_vperm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++ // POWER8+ (ISA 2.07). VX-form bit-permute. See PPC_vbpermq comment.
++ BufferOffset as_vbpermq(uint8_t vrt, uint8_t vra, uint8_t vrb);
++ // POWER10 (ISA 3.1) Vector Extract Mask. RT is a GPR.
++ BufferOffset as_vextractbm(Register rt, FloatRegister vrb);
++ BufferOffset as_vextracthm(Register rt, FloatRegister vrb);
++ BufferOffset as_vextractwm(Register rt, FloatRegister vrb);
++ BufferOffset as_vextractdm(Register rt, FloatRegister vrb);
++ // POWER10 (ISA 3.1) Vector Insert from GPR at immediate byte offset.
++ // UIM range: vinsw 0..12, vinsd 0..8 (caller must enforce).
++ BufferOffset as_vinsw(FloatRegister vrt, Register rb, uint8_t uim);
++ BufferOffset as_vinsd(FloatRegister vrt, Register rb, uint8_t uim);
++ // POWER10 (ISA 3.1) Vector Insert byte / halfword from GPR with the
++ // byte position supplied by another GPR (RA & 0xF for vinsbrx,
++ // RA & 0xE for vinshrx). "rx" = right-indexed = LE-natural.
++ BufferOffset as_vinsbrx(FloatRegister vrt, Register ra, Register rb);
++ BufferOffset as_vinshrx(FloatRegister vrt, Register ra, Register rb);
++ // POWER9 (ISA 3.0) Vector Insert byte / halfword from VR at immediate
++ // byte position. UIM range: vinsertb 0..15, vinserth 0..14
++ // (caller must enforce; vinserth UIM is in bytes, even-aligned).
++ BufferOffset as_vinsertb(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++ BufferOffset as_vinserth(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++ // POWER9 (ISA 3.0) Vector Extract byte / halfword from VR at immediate
++ // BE byte position. UIM range: vextractub 0..15, vextractuh 0..14
++ // (caller must enforce; vextractuh UIM is in bytes, even-aligned). The
++ // extracted byte/halfword lands at BE byte 7 of VRT, with the rest
++ // zeroed — so a subsequent mfvsrd reads it as the low byte/halfword
++ // of the GPR with implicit zero-extension.
++ BufferOffset as_vextractub(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++ BufferOffset as_vextractuh(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++ // VX-form with 5-bit signed immediate splat: each lane of VRT is
++ // set to sign_extend(SIMM5) (range [-16, 15]) at byte/halfword/word granularity.
++ BufferOffset as_vspltisb(uint8_t vrt, int8_t simm5);
++ BufferOffset as_vspltish(uint8_t vrt, int8_t simm5);
++ BufferOffset as_vspltisw(uint8_t vrt, int8_t simm5);
++
++ // VA-form ternary VMX instructions.
++ BufferOffset as_vmladduhm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++ BufferOffset as_vmhraddshs(uint8_t vrt, uint8_t vra, uint8_t vrb,
++ uint8_t vrc);
++ BufferOffset as_vmsumshm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++ BufferOffset as_vmsumuhm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
++ BufferOffset as_xxpermdi(FloatRegister xt, FloatRegister xa, FloatRegister xb,
++ uint8_t dm);
++ BufferOffset as_xxspltw(FloatRegister xt, FloatRegister xb, uint8_t uim);
++ // POWER9 (ISA 3.0). Splat 8-bit immediate to all 16 bytes of an FPR-encoded
++ // VSR (TX bit forced 0). XX1-form, no Rc.
++ BufferOffset as_xxspltib(FloatRegister xt, uint8_t imm8);
++ BufferOffset as_xxinsertw(FloatRegister xt, FloatRegister xb, uint8_t uim);
++ BufferOffset as_xxextractuw(FloatRegister xt, FloatRegister xb, uint8_t uim);
++ BufferOffset as_mtvsrdd(FloatRegister xt, Register ra, Register rb);
++ BufferOffset as_mfvsrld(Register rt, FloatRegister xs);
++
++ // VMX vector operations.
++ BufferOffset as_vspltb(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++ BufferOffset as_vsplth(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
++ BufferOffset as_vsldoi(FloatRegister vrt, FloatRegister vra,
++ FloatRegister vrb, uint8_t shb);
++
++ // Barrier and sync instructions.
++ BufferOffset as_lwsync();
++ BufferOffset as_sync();
++ BufferOffset as_isync();
++
++ // Convenience pseudo-instructions.
++ BufferOffset xs_trap();
++ BufferOffset xs_trap_tagged(TrapTag tag);
++ BufferOffset xs_mr(Register rd, Register ra);
++ BufferOffset xs_mtctr(Register ra);
++ BufferOffset xs_mtlr(Register ra);
++ BufferOffset xs_mflr(Register rd);
++ BufferOffset xs_mtcr(Register rs);
++ BufferOffset xs_mfxer(Register ra);
++ BufferOffset xs_mtxer(Register ra);
++ BufferOffset xs_li(Register rd, int16_t im);
++ BufferOffset xs_lis(Register rd, int16_t im);
++ BufferOffset x_subi(Register rd, Register ra, int16_t im);
++ BufferOffset x_not(Register rd, Register ra);
++ BufferOffset x_slwi(Register rd, Register rs, int n);
++ BufferOffset x_sldi(Register rd, Register rs, int n);
++ BufferOffset x_srwi(Register rd, Register rs, int n);
++ BufferOffset x_srdi(Register rd, Register rs, int n);
++ BufferOffset x_insertbits0_15(Register rd, Register rs);
++ BufferOffset x_bit_value(Register rd, Register rs, unsigned bit);
++ BufferOffset x_sr_mulli(Register rd, Register ra, int16_t im);
++
++ // --- Label operations.
++ void bind(Label* label) { bind(label, nextOffset()); }
++ void bind(Label* label, BufferOffset boff);
++ void bind(InstImm* inst, uintptr_t branch, uintptr_t target);
++ void bind(CodeLabel* label) { label->target()->bind(currentOffset()); }
++ uint32_t currentOffset() { return nextOffset().getOffset(); }
++ void retarget(Label* label, Label* target);
++ void call(Label* label);
++ void call(void* target);
++
++ void as_break(uint32_t code);
++
++ // --- Static capability queries.
++ static bool SupportsFloatingPoint() { return true; }
++ static bool SupportsWasmSimd() { return true; }
++ static bool SupportsUnalignedAccesses() { return true; }
++ static bool SupportsFastUnalignedFPAccesses() { return true; }
++ // POWER9 has scalar FP16 hardware (xscvdphp/xscvhpdp); POWER8 doesn't.
++ // Runtime-gate like x86's SupportsFloat32To16 (which keys off F16C).
++ static bool SupportsFloat64To16() { return HasPOWER9(); }
++ static bool SupportsFloat32To16() { return HasPOWER9(); }
++ static bool HasRoundInstruction(RoundingMode mode) {
++ // PPC64 has friz (trunc), frip (ceil), frim (floor), which are all correct.
++ // frin (round-to-nearest) does NOT implement proper IEEE banker's rounding
++ // (ties to even), so NearestTiesToEven is not supported.
++ return mode == RoundingMode::TowardsZero || mode == RoundingMode::Up ||
++ mode == RoundingMode::Down;
++ }
++
++ protected:
++ InstImm invertBranch(InstImm branch, BOffImm16 skipOffset);
++ void addPendingJump(BufferOffset src, ImmPtr target, RelocationKind kind) {
++ enoughMemory_ &= jumps_.append(RelativePatch(src, target.value, kind));
++ if (kind == RelocationKind::JITCODE) {
++ writeRelocation(src);
++ }
++ }
++ void addLongJump(BufferOffset src, BufferOffset dst) {
++ CodeLabel cl;
++ cl.patchAt()->bind(src.getOffset());
++ cl.target()->bind(dst.getOffset());
++ cl.setLinkMode(CodeLabel::JumpImmediate);
++ addCodeLabel(std::move(cl));
++ }
++
++ public:
++ void flushBuffer() { m_buffer.flushPool(); }
++ void comment(const char* msg) { spew("; %s", msg); }
++ static uint32_t NopSize() { return 4; }
++
++ // --- Static patching API.
++ static uint64_t ExtractLoad64Value(Instruction* inst0);
++ static void UpdateLoad64Value(Instruction* inst0, uint64_t value);
++ static void WriteLoad64Instructions(Instruction* inst0, Register reg,
++ uint64_t value);
++
++ static void PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm);
++ static uint8_t* NextInstruction(uint8_t* instruction,
++ uint32_t* count = nullptr);
++ static void ToggleToJmp(CodeLocationLabel inst_);
++ static void ToggleToCmp(CodeLocationLabel inst_);
++
++ void verifyHeapAccessDisassembly(uint32_t begin, uint32_t end,
++ const Disassembler::HeapAccess& ha) {}
++
++ // --- Public patching API (required by shared code).
++ static void Bind(uint8_t* rawCode, const CodeLabel& label);
++ void processCodeLabels(uint8_t* rawCode);
++
++ static void TraceJumpRelocations(JSTracer* trc, JitCode* code,
++ CompactBufferReader& reader);
++ static void TraceDataRelocations(JSTracer* trc, JitCode* code,
++ CompactBufferReader& reader);
++
++ void executableCopy(uint8_t* buffer);
++
++ static uint32_t PatchWrite_NearCallSize();
++ static void PatchWrite_NearCall(CodeLocationLabel start,
++ CodeLocationLabel toCall);
++ static void PatchDataWithValueCheck(CodeLocationLabel label, ImmPtr newValue,
++ ImmPtr expectedValue);
++ static void PatchDataWithValueCheck(CodeLocationLabel label,
++ PatchedImmPtr newValue,
++ PatchedImmPtr expectedValue);
++ static void ToggleCall(CodeLocationLabel inst_, bool enabled);
++
++ private:
++ GeneralRegisterSet scratch_register_list_;
++
++ public:
++ GeneralRegisterSet* GetScratchRegisterList() {
++ return &scratch_register_list_;
++ }
++}; // Assembler
++
++inline bool IsUnaligned(const wasm::MemoryAccessDesc& access) {
++ if (!access.align()) {
++ return false;
++ }
++ return access.align() < access.byteSize();
++}
++
++} // namespace jit
++} // namespace js
++
++// Whether an Imm32 fits in an unsigned 16-bit immediate.
++#define PPC_IMM_OK_U(x) (MOZ_LIKELY(((x).value & 0xffff0000) == 0))
++
++// Whether an Imm32 fits in a signed 16-bit immediate.
++#define PPC_IMM_OK_S(x) \
++ (MOZ_LIKELY(((x).value & 0xffff8000) == 0 || \
++ ((x).value & 0xffff8000) == 0xffff8000))
++
++// Whether the offset part of an Address fits in a signed 16-bit immediate.
++#define PPC_OFFS_OK(x) \
++ (MOZ_LIKELY(((x).offset & 0xffff8000) == 0 || \
++ ((x).offset & 0xffff8000) == 0xffff8000))
++
++// Same test but checking a bit ahead (for paired loads).
++#define PPC_OFFS_INCR_OK(x, incr) \
++ (MOZ_LIKELY((((x).offset + (incr)) & 0xffff8000) == 0 || \
++ (((x).offset + (incr)) & 0xffff8000) == 0xffff8000))
++
++#endif /* jit_ppc64_Assembler_ppc64_h */
+diff --git a/js/src/jit/ppc64/CodeGenerator-ppc64.cpp b/js/src/jit/ppc64/CodeGenerator-ppc64.cpp
+new file mode 100644
+index 000000000000..0a436fb1201a
+--- /dev/null
++++ b/js/src/jit/ppc64/CodeGenerator-ppc64.cpp
+@@ -0,0 +1,3647 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/CodeGenerator-ppc64.h"
++
++#include "mozilla/MathAlgorithms.h"
++
++#include <bit>
++
++#include "builtin/Number.h"
++#include "jit/CodeGenerator.h"
++#include "jit/InlineScriptTree.h"
++#include "jit/JitRuntime.h"
++#include "jit/MIR-wasm.h"
++#include "jit/MIR.h"
++#include "jit/MIRGraph.h"
++#include "vm/JSContext.h"
++#include "vm/Realm.h"
++#include "vm/Shape.h"
++
++#include "jit/shared/CodeGenerator-shared-inl.h"
++#include "vm/JSScript-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++using JS::GenericNaN;
++using mozilla::NegativeInfinity;
++
++namespace js {
++namespace jit {
++
++CodeGeneratorPPC64::CodeGeneratorPPC64(MIRGenerator* gen, LIRGraph* graph,
++ MacroAssembler* masm,
++ const wasm::CodeMetadata* codeMeta)
++ : CodeGeneratorShared(gen, graph, masm, codeMeta) {}
++
++Operand CodeGeneratorPPC64::ToOperand(const LAllocation& a) {
++ if (a.isGeneralReg()) {
++ return Operand(a.toGeneralReg()->reg());
++ }
++ if (a.isFloatReg()) {
++ return Operand(a.toFloatReg()->reg());
++ }
++ return Operand(ToAddress(a));
++}
++
++Operand CodeGeneratorPPC64::ToOperand(const LAllocation* a) {
++ return ToOperand(*a);
++}
++
++MoveOperand CodeGeneratorPPC64::toMoveOperand(LAllocation a) const {
++ if (a.isGeneralReg()) {
++ return MoveOperand(ToRegister(a));
++ }
++ if (a.isFloatReg()) {
++ return MoveOperand(ToFloatRegister(a));
++ }
++ MoveOperand::Kind kind = a.isStackArea() ? MoveOperand::Kind::EffectiveAddress
++ : MoveOperand::Kind::Memory;
++ Address address = ToAddress(a);
++ MOZ_ASSERT((address.offset & 3) == 0);
++ return MoveOperand(address, kind);
++}
++
++void CodeGeneratorPPC64::bailoutFrom(Label* label, LSnapshot* snapshot) {
++ MOZ_ASSERT_IF(!masm.oom(), label->used());
++ MOZ_ASSERT_IF(!masm.oom(), !label->bound());
++
++ encode(snapshot);
++
++ InlineScriptTree* tree = snapshot->mir()->block()->trackedTree();
++ auto* ool = new (alloc()) LambdaOutOfLineCode([=, this](OutOfLineCode& ool) {
++ // Push snapshotOffset and make sure stack is aligned.
++ masm.subPtr(Imm32(sizeof(Value)), StackPointer);
++ masm.storePtr(ImmWord(snapshot->snapshotOffset()),
++ Address(StackPointer, 0));
++ masm.jump(&deoptLabel_);
++ });
++ addOutOfLineCode(ool,
++ new (alloc()) BytecodeSite(tree, tree->script()->code()));
++
++ masm.retarget(label, ool->entry());
++}
++
++void CodeGeneratorPPC64::bailout(LSnapshot* snapshot) {
++ Label label;
++ masm.jump(&label);
++ bailoutFrom(&label, snapshot);
++}
++
++void CodeGeneratorPPC64::bailoutIfFalseBool(Register lhs, LSnapshot* snapshot) {
++ Label bail;
++ masm.branchTest32(Assembler::Zero, lhs, Imm32(0xFF), &bail);
++ bailoutFrom(&bail, snapshot);
++}
++
++bool CodeGeneratorPPC64::generateOutOfLineCode() {
++ if (!CodeGeneratorShared::generateOutOfLineCode()) {
++ return false;
++ }
++
++ if (deoptLabel_.used()) {
++ masm.bind(&deoptLabel_);
++
++ // Frame size is stored in LR and pushed by GenerateBailoutThunk
++ // (via PushBailoutFrame -> pushReturnAddress -> mflr).
++ {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.movePtr(ImmWord(frameSize()), scratch);
++ masm.xs_mtlr(scratch);
++ }
++
++ TrampolinePtr handler = gen->jitRuntime()->getGenericBailoutHandler();
++ masm.jump(handler);
++ }
++
++ return !masm.oom();
++}
++
++void CodeGeneratorPPC64::branchToBlock(MBasicBlock* block) {
++ Label* label = skipTrivialBlocks(block)->lir()->label();
++ masm.jump(label);
++}
++
++void CodeGeneratorPPC64::branchToBlock(Assembler::DoubleCondition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ MBasicBlock* mir) {
++ Label* label = skipTrivialBlocks(mir)->lir()->label();
++ masm.branchDouble(cond, lhs, rhs, label);
++}
++
++void CodeGeneratorPPC64::branchToBlock(Assembler::FloatFormat fmt,
++ Assembler::DoubleCondition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ MBasicBlock* mir) {
++ Label* label = skipTrivialBlocks(mir)->lir()->label();
++ if (fmt == Assembler::DoubleFloat) {
++ masm.branchDouble(cond, lhs, rhs, label);
++ } else {
++ masm.branchFloat(cond, lhs, rhs, label);
++ }
++}
++
++class OutOfLineTableSwitch : public OutOfLineCodeBase<CodeGeneratorPPC64> {
++ MTableSwitch* mir_;
++ CodeLabel jumpLabel_;
++
++ void accept(CodeGeneratorPPC64* codegen) {
++ codegen->visitOutOfLineTableSwitch(this);
++ }
++
++ public:
++ explicit OutOfLineTableSwitch(MTableSwitch* mir) : mir_(mir) {}
++
++ MTableSwitch* mir() const { return mir_; }
++ CodeLabel* jumpLabel() { return &jumpLabel_; }
++};
++
++void CodeGeneratorPPC64::emitTableSwitchDispatch(MTableSwitch* mir,
++ Register index,
++ Register base) {
++ Label* defaultcase = skipTrivialBlocks(mir->getDefault())->lir()->label();
++
++ if (mir->low() != 0) {
++ masm.subPtr(Imm32(mir->low()), index);
++ }
++
++ int32_t cases = mir->numCases();
++ masm.branchPtr(Assembler::AboveOrEqual, index, ImmWord(cases), defaultcase);
++
++ OutOfLineTableSwitch* ool = new (alloc()) OutOfLineTableSwitch(mir);
++ addOutOfLineCode(ool, mir);
++
++ masm.mov(ool->jumpLabel(), base);
++
++ BaseIndex pointer(base, index, ScalePointer);
++ masm.branchToComputedAddress(pointer);
++}
++
++void CodeGeneratorPPC64::generateInvalidateEpilogue() {
++ // Pad with enough nops so that PatchWrite_NearCall on the last OSI point
++ // cannot overlap the invalidation epilogue. The patch area is
++ // PatchWrite_NearCallSize (40) bytes; the last OSI point could be right
++ // before this epilogue.
++ for (size_t i = 0; i < Assembler::PatchWrite_NearCallSize();
++ i += Assembler::NopSize()) {
++ masm.nop();
++ }
++
++ masm.bind(&invalidate_);
++
++ // Push the return address (LR) onto the stack.
++ masm.pushReturnAddress();
++
++ invalidateEpilogueData_ = masm.pushWithPatch(ImmWord(uintptr_t(-1)));
++
++ TrampolinePtr thunk = gen->jitRuntime()->getInvalidationThunk();
++ masm.jump(thunk);
++}
++
++void CodeGeneratorPPC64::visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool) {
++ MTableSwitch* mir = ool->mir();
++
++ masm.haltingAlign(sizeof(void*));
++ masm.bind(ool->jumpLabel());
++ masm.addCodeLabel(*ool->jumpLabel());
++
++ for (size_t i = 0; i < mir->numCases(); i++) {
++ LBlock* caseblock = skipTrivialBlocks(mir->getCase(i))->lir();
++ Label* caseheader = caseblock->label();
++ uint32_t caseoffset = caseheader->offset();
++
++ CodeLabel cl;
++ masm.writeCodePointer(&cl);
++ cl.target()->bind(caseoffset);
++ masm.addCodeLabel(cl);
++ }
++}
++
++void CodeGeneratorPPC64::visitOutOfLineWasmTruncateCheck(
++ OutOfLineWasmTruncateCheck* ool) {
++ if (ool->toType() == MIRType::Int32) {
++ masm.outOfLineWasmTruncateToInt32Check(ool->input(), ool->output(),
++ ool->fromType(), ool->flags(),
++ ool->rejoin(), ool->trapSiteDesc());
++ } else {
++ MOZ_ASSERT(ool->toType() == MIRType::Int64);
++ masm.outOfLineWasmTruncateToInt64Check(ool->input(), ool->output64(),
++ ool->fromType(), ool->flags(),
++ ool->rejoin(), ool->trapSiteDesc());
++ }
++}
++
++void CodeGeneratorPPC64::emitBigIntPtrDiv(LBigIntPtrDiv* ins, Register dividend,
++ Register divisor, Register output) {
++ masm.as_divd(output, dividend, divisor);
++}
++
++void CodeGeneratorPPC64::emitBigIntPtrMod(LBigIntPtrMod* ins, Register dividend,
++ Register divisor, Register output) {
++ if (HasPOWER9()) {
++ masm.as_modsd(output, dividend, divisor);
++ } else {
++ masm.as_divd(output, dividend, divisor);
++ masm.as_mulld(output, output, divisor);
++ masm.as_subf(output, output, dividend);
++ }
++}
++
++// ===============================================================
++// Visitors: Box/Unbox
++
++void CodeGenerator::visitBox(LBox* box) {
++ const LAllocation* in = box->getOperand(0);
++ ValueOperand result = ToOutValue(box);
++
++ masm.moveValue(TypedOrValueRegister(box->type(), ToAnyRegister(in)), result);
++}
++
++void CodeGenerator::visitUnbox(LUnbox* unbox) {
++ MUnbox* mir = unbox->mir();
++
++ Register result = ToRegister(unbox->output());
++
++ if (mir->fallible()) {
++ ValueOperand value = ToValue(unbox->input());
++ Label bail;
++ switch (mir->type()) {
++ case MIRType::Int32:
++ masm.fallibleUnboxInt32(value, result, &bail);
++ break;
++ case MIRType::Boolean:
++ masm.fallibleUnboxBoolean(value, result, &bail);
++ break;
++ case MIRType::Object:
++ masm.fallibleUnboxObject(value, result, &bail);
++ break;
++ case MIRType::String:
++ masm.fallibleUnboxString(value, result, &bail);
++ break;
++ case MIRType::Symbol:
++ masm.fallibleUnboxSymbol(value, result, &bail);
++ break;
++ case MIRType::BigInt:
++ masm.fallibleUnboxBigInt(value, result, &bail);
++ break;
++ default:
++ MOZ_CRASH("Given MIRType cannot be unboxed.");
++ }
++ bailoutFrom(&bail, unbox->snapshot());
++ return;
++ }
++
++ LAllocation* input = unbox->getOperand(LUnbox::Input);
++ if (input->isGeneralReg()) {
++ Register inputReg = ToRegister(input);
++ switch (mir->type()) {
++ case MIRType::Int32:
++ masm.unboxInt32(ValueOperand(inputReg), result);
++ break;
++ case MIRType::Boolean:
++ masm.unboxBoolean(ValueOperand(inputReg), result);
++ break;
++ case MIRType::Object:
++ masm.unboxObject(ValueOperand(inputReg), result);
++ break;
++ case MIRType::String:
++ masm.unboxString(ValueOperand(inputReg), result);
++ break;
++ case MIRType::Symbol:
++ masm.unboxSymbol(ValueOperand(inputReg), result);
++ break;
++ case MIRType::BigInt:
++ masm.unboxBigInt(ValueOperand(inputReg), result);
++ break;
++ default:
++ MOZ_CRASH("Given MIRType cannot be unboxed.");
++ }
++ return;
++ }
++
++ Address inputAddr = ToAddress(input);
++ switch (mir->type()) {
++ case MIRType::Int32:
++ masm.unboxInt32(inputAddr, result);
++ break;
++ case MIRType::Boolean:
++ masm.unboxBoolean(inputAddr, result);
++ break;
++ case MIRType::Object:
++ masm.unboxObject(inputAddr, result);
++ break;
++ case MIRType::String:
++ masm.unboxString(inputAddr, result);
++ break;
++ case MIRType::Symbol:
++ masm.unboxSymbol(inputAddr, result);
++ break;
++ case MIRType::BigInt:
++ masm.unboxBigInt(inputAddr, result);
++ break;
++ default:
++ MOZ_CRASH("Given MIRType cannot be unboxed.");
++ }
++}
++
++// ===============================================================
++// Visitors: Integer Arithmetic
++
++void CodeGenerator::visitAddI(LAddI* ins) {
++ LAllocation* lhs = ins->getOperand(0);
++ LAllocation* rhs = ins->getOperand(1);
++ Register dest = ToRegister(ins->getDef(0));
++
++ if (rhs->isConstant()) {
++ Imm32 imm(ToInt32(rhs));
++ if (ins->snapshot()) {
++ masm.move32(ToRegister(lhs), dest);
++ Label overflow;
++ masm.branchAdd32(Assembler::Overflow, imm, dest, &overflow);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.add32(imm, ToRegister(lhs), dest);
++ }
++ } else {
++ Register rhsReg = ToRegister(rhs);
++ if (ins->snapshot()) {
++ // Use 3-operand add to avoid clobbering rhs when rhs == dest.
++ masm.as_add(dest, ToRegister(lhs), rhsReg);
++ // Check 32-bit overflow: sign-extend lower 32 and compare.
++ masm.as_extsw(SecondScratchReg, dest);
++ Label overflow;
++ masm.as_cmpd(dest, SecondScratchReg);
++ masm.ma_b(Assembler::NotEqual, &overflow);
++ masm.as_extsw(dest, dest);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.as_add(dest, ToRegister(lhs), rhsReg);
++ masm.as_extsw(dest, dest);
++ }
++ }
++}
++
++void CodeGenerator::visitAddIntPtr(LAddIntPtr* ins) {
++ Register dest = ToRegister(ins->getDef(0));
++ Register lhs = ToRegister(ins->getOperand(0));
++ const LAllocation* rhs = ins->getOperand(1);
++
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.addPtr(ImmWord(ToIntPtr(rhs)), dest);
++ } else {
++ masm.as_add(dest, lhs, ToRegister(rhs));
++ }
++}
++
++void CodeGenerator::visitAddI64(LAddI64* lir) {
++ Register dest = ToRegister(lir->getDef(0));
++ Register lhs = ToRegister(lir->getOperand(0));
++ const LAllocation* rhs = lir->getOperand(1);
++
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.addPtr(ImmWord(ToInt64(rhs)), dest);
++ } else {
++ masm.as_add(dest, lhs, ToRegister(rhs));
++ }
++}
++
++void CodeGenerator::visitSubI(LSubI* ins) {
++ LAllocation* lhs = ins->getOperand(0);
++ LAllocation* rhs = ins->getOperand(1);
++ Register dest = ToRegister(ins->getDef(0));
++
++ if (rhs->isConstant()) {
++ Imm32 imm(ToInt32(rhs));
++ if (ins->snapshot()) {
++ masm.move32(ToRegister(lhs), dest);
++ Label overflow;
++ masm.branchSub32(Assembler::Overflow, imm, dest, &overflow);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.move32(ToRegister(lhs), dest);
++ masm.sub32(imm, dest);
++ }
++ } else {
++ Register rhsReg = ToRegister(rhs);
++ if (ins->snapshot()) {
++ // as_subf(d, a, b) computes d = b - a, so subf(dest, rhs, lhs) = lhs -
++ // rhs
++ masm.as_subf(dest, rhsReg, ToRegister(lhs));
++ masm.as_extsw(SecondScratchReg, dest);
++ Label overflow;
++ masm.as_cmpd(dest, SecondScratchReg);
++ masm.ma_b(Assembler::NotEqual, &overflow);
++ masm.as_extsw(dest, dest);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.as_subf(dest, rhsReg, ToRegister(lhs));
++ masm.as_extsw(dest, dest);
++ }
++ }
++}
++
++void CodeGenerator::visitSubIntPtr(LSubIntPtr* ins) {
++ Register dest = ToRegister(ins->getDef(0));
++ Register lhs = ToRegister(ins->getOperand(0));
++ const LAllocation* rhs = ins->getOperand(1);
++
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.subPtr(Imm32(ToIntPtr(rhs)), dest);
++ } else {
++ // as_subf(d, a, b) = b - a
++ masm.as_subf(dest, ToRegister(rhs), lhs);
++ }
++}
++
++void CodeGenerator::visitSubI64(LSubI64* lir) {
++ Register dest = ToRegister(lir->getDef(0));
++ Register lhs = ToRegister(lir->getOperand(0));
++ const LAllocation* rhs = lir->getOperand(1);
++
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.sub64(Imm64(ToInt64(rhs)), Register64(dest));
++ } else {
++ // as_subf(d, a, b) = b - a
++ masm.as_subf(dest, ToRegister(rhs), lhs);
++ }
++}
++
++void CodeGenerator::visitMulI(LMulI* ins) {
++ Register dest = ToRegister(ins->getDef(0));
++ Register lhs = ToRegister(ins->getOperand(0));
++ const LAllocation* rhs = ins->getOperand(1);
++ MMul* mul = ins->mir();
++
++ if (rhs->isConstant()) {
++ int32_t constant = ToInt32(rhs);
++ Register src = lhs;
++
++ // Bailout on -0.0 before the special-case handling below, since cases
++ // like -1 and 0 return early and would skip the check.
++ if (mul->canBeNegativeZero() && constant <= 0) {
++ Assembler::Condition cond =
++ (constant == 0) ? Assembler::Signed : Assembler::Equal;
++ bailoutCmp32(cond, src, Imm32(0), ins->snapshot());
++ }
++
++ switch (constant) {
++ case -1:
++ if (mul->canOverflow()) {
++ Label ok;
++ masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &ok);
++ bailout(ins->snapshot());
++ masm.bind(&ok);
++ }
++ masm.as_neg(dest, src);
++ masm.as_extsw(dest, dest);
++ return;
++ case 0:
++ masm.move32(Imm32(0), dest);
++ return;
++ case 1:
++ masm.move32(src, dest);
++ return;
++ case 2:
++ if (mul->canOverflow()) {
++ masm.move32(src, dest);
++ Label overflow;
++ masm.branchAdd32(Assembler::Overflow, dest, dest, &overflow);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.move32(src, dest);
++ masm.add32(dest, dest);
++ }
++ return;
++ default:
++ break;
++ }
++
++ // Check for power of 2 (positive).
++ uint32_t absCst = mozilla::Abs(constant);
++ if (absCst > 0 && (absCst & (absCst - 1)) == 0 && !mul->canOverflow()) {
++ uint32_t shift = mozilla::FloorLog2(absCst);
++ masm.x_slwi(dest, src, shift);
++ if (constant < 0) {
++ masm.as_neg(dest, dest);
++ }
++ masm.as_extsw(dest, dest);
++ return;
++ }
++
++ // General case.
++ if (mul->canOverflow()) {
++ masm.move32(src, dest);
++ Label overflow;
++ masm.branchMul32(Assembler::Overflow, Imm32(constant), dest, &overflow);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.move32(src, dest);
++ masm.mul32(Imm32(constant), dest);
++ }
++
++ // Check for negative zero (for constants not handled above).
++ if (mul->canBeNegativeZero() && constant < 0) {
++ Label ok;
++ masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &ok);
++ bailoutCmp32(Assembler::Signed, src, src, ins->snapshot());
++ masm.bind(&ok);
++ }
++ return;
++ }
++
++ Register rhsReg = ToRegister(rhs);
++
++ if (mul->canOverflow()) {
++ // Use 64-bit multiply so the full result is deterministic, then check
++ // whether truncating to 32 bits changes the value. Match the
++ // visitAddI/visitSubI ordering: branch first, truncate only on the
++ // success path (the bailout discards dest anyway). extsw is
++ // non-recording (ISA v3.0B) so it doesn't disturb CR0
++ // either way; the choice is for consistency.
++ masm.as_mulld(dest, lhs, rhsReg);
++ masm.as_extsw(SecondScratchReg, dest);
++ Label overflow;
++ masm.as_cmpd(dest, SecondScratchReg);
++ masm.ma_b(Assembler::NotEqual, &overflow);
++ masm.as_extsw(dest, dest);
++ bailoutFrom(&overflow, ins->snapshot());
++ } else {
++ masm.as_mullw(dest, lhs, rhsReg);
++ masm.as_extsw(dest, dest);
++ }
++
++ if (mul->canBeNegativeZero()) {
++ Label done;
++ masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &done);
++ // Result is 0. Check if lhs|rhs was negative.
++ {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.as_or_(scratch, lhs, rhsReg);
++ bailoutCmp32(Assembler::Signed, scratch, scratch, ins->snapshot());
++ }
++ masm.bind(&done);
++ }
++}
++
++void CodeGenerator::visitMulIntPtr(LMulIntPtr* ins) {
++ Register dest = ToRegister(ins->getDef(0));
++ Register lhs = ToRegister(ins->getOperand(0));
++ const LAllocation* rhs = ins->getOperand(1);
++
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.mulPtr(ImmWord(ToIntPtr(rhs)), dest);
++ } else {
++ masm.as_mulld(dest, lhs, ToRegister(rhs));
++ }
++}
++
++void CodeGenerator::visitMulI64(LMulI64* lir) {
++ Register dest = ToRegister(lir->getDef(0));
++ Register lhs = ToRegister(lir->getOperand(0));
++ const LAllocation* rhs = lir->getOperand(1);
++
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.mulPtr(ImmWord(ToInt64(rhs)), dest);
++ } else {
++ masm.as_mulld(dest, lhs, ToRegister(rhs));
++ }
++}
++
++void CodeGenerator::visitDivI(LDivI* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ Register rhs = ToRegister(ins->rhs());
++ Register dest = ToRegister(ins->output());
++ Register temp = ToRegister(ins->temp0());
++ MDiv* mir = ins->mir();
++
++ Label done;
++
++ // Handle divide by zero.
++ if (mir->canBeDivideByZero()) {
++ if (mir->trapOnError()) {
++ Label nonZero;
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++ masm.wasmTrap(wasm::Trap::IntegerDivideByZero, mir->trapSiteDesc());
++ masm.bind(&nonZero);
++ } else if (mir->canTruncateInfinities()) {
++ Label nonZero;
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++ masm.move32(Imm32(0), dest);
++ masm.jump(&done);
++ masm.bind(&nonZero);
++ } else {
++ MOZ_ASSERT(mir->fallible());
++ bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
++ }
++ }
++
++ // Handle INT32_MIN / -1 overflow.
++ if (mir->canBeNegativeOverflow()) {
++ Label notMinInt;
++ masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), ¬MinInt);
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinInt);
++
++ if (mir->trapOnError()) {
++ masm.wasmTrap(wasm::Trap::IntegerOverflow, mir->trapSiteDesc());
++ } else if (mir->canTruncateOverflow()) {
++ masm.move32(Imm32(INT32_MIN), dest);
++ masm.jump(&done);
++ } else {
++ MOZ_ASSERT(mir->fallible());
++ bailout(ins->snapshot());
++ }
++ masm.bind(¬MinInt);
++ }
++
++ // Handle negative zero.
++ if (!mir->canTruncateNegativeZero() && mir->canBeNegativeZero()) {
++ Label ok;
++ masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(0), &ok);
++ bailoutCmp32(Assembler::LessThan, rhs, Imm32(0), ins->snapshot());
++ masm.bind(&ok);
++ }
++
++ // Perform the division.
++ masm.as_divw(dest, lhs, rhs);
++ masm.as_extsw(dest, dest);
++
++ // Check remainder if not truncatable.
++ if (!mir->canTruncateRemainder()) {
++ // Compute remainder: temp = lhs - (dest * rhs)
++ masm.as_mullw(temp, dest, rhs);
++ masm.as_subf(temp, temp, lhs); // temp = lhs - temp
++ bailoutCmp32(Assembler::NotEqual, temp, Imm32(0), ins->snapshot());
++ }
++
++ masm.bind(&done);
++}
++
++void CodeGenerator::visitDivPowTwoI(LDivPowTwoI* ins) {
++ Register lhs = ToRegister(ins->numerator());
++ Register dest = ToRegister(ins->output());
++ UseScratchRegisterScope temps(masm);
++ Register tmp = temps.Acquire();
++ int32_t shift = ins->shift();
++
++ if (shift != 0) {
++ MDiv* mir = ins->mir();
++
++ if (!mir->isTruncated()) {
++ // If remainder != 0, bailout (check lower 'shift' bits).
++ masm.x_slwi(tmp, lhs, 32 - shift);
++ bailoutCmp32(Assembler::NotEqual, tmp, Imm32(0), ins->snapshot());
++ }
++
++ if (!mir->canBeNegativeDividend()) {
++ // Non-negative dividend: simple right shift.
++ masm.as_srawi(dest, lhs, shift);
++ } else {
++ // Need rounding adjustment for negative numbers.
++ // Add (1 << shift) - 1 if lhs is negative.
++ if (shift > 1) {
++ masm.as_srawi(tmp, lhs, 31);
++ masm.as_rlwinm(tmp, tmp, 0, 32 - shift, 31);
++ } else {
++ // shift == 1: extract sign bit into bit 31
++ masm.as_rlwinm(tmp, lhs, 1, 31, 31);
++ }
++ masm.add32(lhs, tmp);
++ masm.as_srawi(dest, tmp, shift);
++ }
++ } else {
++ masm.move32(lhs, dest);
++ }
++}
++
++void CodeGenerator::visitModI(LModI* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ Register rhs = ToRegister(ins->rhs());
++ Register dest = ToRegister(ins->output());
++ UseScratchRegisterScope temps(masm);
++ Register temp = temps.Acquire();
++ MMod* mir = ins->mir();
++ Label done;
++
++ // Handle divide by zero.
++ if (mir->canBeDivideByZero()) {
++ if (mir->isTruncated()) {
++ if (mir->trapOnError()) {
++ Label nonZero;
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++ masm.wasmTrap(wasm::Trap::IntegerDivideByZero, mir->trapSiteDesc());
++ masm.bind(&nonZero);
++ } else {
++ // Truncated division by zero yields integer zero.
++ masm.move32(rhs, dest);
++ Label nonZero;
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++ masm.jump(&done);
++ masm.bind(&nonZero);
++ }
++ } else {
++ MOZ_ASSERT(mir->fallible());
++ bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
++ }
++ }
++
++ // Handle INT32_MIN % -1.
++ // PPC64 divw is undefined for INT32_MIN / -1 (quotient overflows), so we
++ // must return 0 explicitly. The wasm spec also defines rem_s(MIN, -1) = 0.
++ if (!mir->isUnsigned()) {
++ Label notMinOverflow;
++ masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN),
++ ¬MinOverflow);
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinOverflow);
++ masm.move32(Imm32(0), dest);
++ masm.jump(&done);
++ masm.bind(¬MinOverflow);
++ }
++
++ if (HasPOWER9()) {
++ masm.as_modsw(dest, lhs, rhs);
++ } else {
++ masm.as_divw(temp, lhs, rhs);
++ masm.as_mullw(temp, temp, rhs);
++ masm.as_subf(dest, temp, lhs);
++ }
++ masm.as_extsw(dest, dest);
++
++ // If X%Y == 0 and X < 0, the result is -0, and we need to bail out.
++ if (mir->canBeNegativeDividend() && !mir->isTruncated()) {
++ MOZ_ASSERT(mir->fallible());
++ Label ok;
++ masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &ok);
++ bailoutCmp32(Assembler::Signed, lhs, Imm32(0), ins->snapshot());
++ masm.bind(&ok);
++ }
++
++ masm.bind(&done);
++}
++
++void CodeGenerator::visitModPowTwoI(LModPowTwoI* ins) {
++ Register in = ToRegister(ins->getOperand(0));
++ Register out = ToRegister(ins->getDef(0));
++ MMod* mir = ins->mir();
++ int32_t shift = ins->shift();
++ uint32_t mask = (uint32_t(1) << shift) - 1;
++
++ if (mir->canBeNegativeDividend() && !mir->isTruncated()) {
++ Label nonNeg;
++ masm.branchPtr(Assembler::NotEqual, in, ImmWord(0), &nonNeg);
++ // in == 0: mod is 0, check for negative zero.
++ bailoutCmp32(Assembler::Signed, in, in, ins->snapshot());
++ masm.bind(&nonNeg);
++ }
++
++ Label negative, done;
++ masm.branch32(Assembler::Signed, in, in, &negative);
++
++ // Positive case: just mask.
++ masm.and32(Imm32(mask), in, out);
++ masm.jump(&done);
++
++ // Negative case: negate, mask, negate back.
++ masm.bind(&negative);
++ masm.as_neg(out, in);
++ masm.and32(Imm32(mask), out);
++ masm.as_neg(out, out);
++ masm.as_extsw(out, out);
++
++ if (!mir->isTruncated() && mir->canBeNegativeDividend()) {
++ Label ok;
++ masm.branchPtr(Assembler::NotEqual, out, ImmWord(0), &ok);
++ bailout(ins->snapshot());
++ masm.bind(&ok);
++ }
++
++ masm.bind(&done);
++}
++
++void CodeGenerator::visitModMaskI(LModMaskI* ins) {
++ Register src = ToRegister(ins->input());
++ Register dest = ToRegister(ins->output());
++ Register tmp0 = ToRegister(ins->temp0());
++ Register tmp1 = ToRegister(ins->temp1());
++ MMod* mir = ins->mir();
++
++ if (!mir->isTruncated() && mir->canBeNegativeDividend()) {
++ MOZ_ASSERT(mir->fallible());
++
++ Label bail;
++ masm.ma_mod_mask(src, dest, tmp0, tmp1, ins->shift(), &bail);
++ bailoutFrom(&bail, ins->snapshot());
++ } else {
++ masm.ma_mod_mask(src, dest, tmp0, tmp1, ins->shift(), nullptr);
++ }
++}
++
++void CodeGenerator::visitNegI(LNegI* ins) {
++ Register input = ToRegister(ins->input());
++ Register output = ToRegister(ins->output());
++ masm.as_neg(output, input);
++ masm.as_extsw(output, output);
++}
++
++void CodeGenerator::visitNegI64(LNegI64* ins) {
++ Register input = ToRegister64(ins->input()).reg;
++ Register output = ToOutRegister64(ins).reg;
++ masm.as_neg(output, input);
++}
++
++void CodeGenerator::visitUDivOrMod(LUDivOrMod* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ Register rhs = ToRegister(ins->rhs());
++ Register output = ToRegister(ins->output());
++ UseScratchRegisterScope temps(masm);
++ Register temp = temps.Acquire();
++ Label done;
++
++ // Division by zero check.
++ if (ins->canBeDivideByZero()) {
++ if (ins->mir()->isTruncated()) {
++ if (ins->trapOnError()) {
++ Label nonZero;
++ masm.branch32(Assembler::NotEqual, rhs, Imm32(0), &nonZero);
++ masm.wasmTrap(wasm::Trap::IntegerDivideByZero, ins->trapSiteDesc());
++ masm.bind(&nonZero);
++ } else {
++ Label nonZero;
++ masm.branch32(Assembler::NotEqual, rhs, Imm32(0), &nonZero);
++ masm.move32(Imm32(0), output);
++ masm.jump(&done);
++ masm.bind(&nonZero);
++ }
++ } else {
++ bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
++ }
++ }
++
++ // Zero-extend both operands to 64 bits for unsigned divide.
++ masm.move32To64ZeroExtend(lhs, Register64(lhs));
++ masm.move32To64ZeroExtend(rhs, Register64(rhs));
++
++ if (ins->mir()->isDiv()) {
++ // Division path: compute quotient. Check remainder if needed.
++ if (!ins->mir()->toDiv()->canTruncateRemainder()) {
++ if (HasPOWER9()) {
++ masm.as_moduw(temp, lhs, rhs);
++ } else {
++ masm.as_divwu(temp, lhs, rhs);
++ masm.as_mullw(temp, temp, rhs);
++ masm.as_subf(temp, temp, lhs);
++ }
++ bailoutCmp32(Assembler::NotEqual, temp, Imm32(0), ins->snapshot());
++ }
++ masm.as_divwu(output, lhs, rhs);
++ } else {
++ // Modulo path.
++ if (HasPOWER9()) {
++ masm.as_moduw(output, lhs, rhs);
++ } else {
++ masm.as_divwu(temp, lhs, rhs);
++ masm.as_mullw(temp, temp, rhs);
++ masm.as_subf(output, temp, lhs);
++ }
++ }
++
++ masm.as_extsw(output, output);
++
++ if (!ins->mir()->isTruncated()) {
++ bailoutCmp32(Assembler::LessThan, output, Imm32(0), ins->snapshot());
++ }
++
++ masm.bind(&done);
++}
++
++void CodeGenerator::visitDivOrModI64(LDivOrModI64* lir) {
++ Register lhs = ToRegister(lir->getOperand(0));
++ Register rhs = ToRegister(lir->getOperand(1));
++ Register output = ToRegister(lir->output());
++
++ Label done;
++
++ // Division by zero trap.
++ if (lir->canBeDivideByZero()) {
++ Label nonZero;
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++ masm.wasmTrap(wasm::Trap::IntegerDivideByZero, lir->trapSiteDesc());
++ masm.bind(&nonZero);
++ }
++
++ // INT64_MIN / -1 overflow trap (for div only).
++ if (lir->canBeNegativeOverflow()) {
++ Label notMinInt;
++ masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), ¬MinInt);
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinInt);
++ if (lir->mir()->isDiv()) {
++ masm.wasmTrap(wasm::Trap::IntegerOverflow, lir->trapSiteDesc());
++ } else {
++ masm.movePtr(ImmWord(0), output);
++ masm.jump(&done);
++ }
++ masm.bind(¬MinInt);
++ }
++
++ if (lir->mir()->isDiv()) {
++ masm.as_divd(output, lhs, rhs);
++ } else if (HasPOWER9()) {
++ masm.as_modsd(output, lhs, rhs);
++ } else {
++ masm.as_divd(output, lhs, rhs);
++ masm.as_mulld(output, output, rhs);
++ masm.as_subf(output, output, lhs);
++ }
++
++ masm.bind(&done);
++}
++
++void CodeGenerator::visitUDivOrModI64(LUDivOrModI64* lir) {
++ Register lhs = ToRegister(lir->getOperand(0));
++ Register rhs = ToRegister(lir->getOperand(1));
++ Register output = ToRegister(lir->output());
++
++ // Division by zero trap.
++ if (lir->canBeDivideByZero()) {
++ Label nonZero;
++ masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
++ masm.wasmTrap(wasm::Trap::IntegerDivideByZero, lir->trapSiteDesc());
++ masm.bind(&nonZero);
++ }
++
++ if (lir->mir()->isDiv()) {
++ masm.as_divdu(output, lhs, rhs);
++ } else if (HasPOWER9()) {
++ masm.as_modud(output, lhs, rhs);
++ } else {
++ masm.as_divdu(output, lhs, rhs);
++ masm.as_mulld(output, output, rhs);
++ masm.as_subf(output, output, lhs);
++ }
++}
++
++// ===============================================================
++// Visitors: Bitwise
++
++void CodeGenerator::visitBitNotI(LBitNotI* ins) {
++ Register input = ToRegister(ins->input());
++ Register dest = ToRegister(ins->output());
++ masm.as_nor(dest, input, input);
++ masm.as_extsw(dest, dest);
++}
++
++void CodeGenerator::visitBitNotI64(LBitNotI64* ins) {
++ Register input = ToRegister64(ins->input()).reg;
++ Register dest = ToOutRegister64(ins).reg;
++ masm.as_nor(dest, input, input);
++}
++
++void CodeGenerator::visitBitOpI(LBitOpI* ins) {
++ Register dest = ToRegister(ins->getDef(0));
++ Register lhs = ToRegister(ins->getOperand(0));
++ const LAllocation* rhs = ins->getOperand(1);
++
++ switch (ins->bitop()) {
++ case JSOp::BitOr:
++ if (rhs->isConstant()) {
++ masm.or32(Imm32(ToInt32(rhs)), lhs, dest);
++ } else {
++ masm.as_or_(dest, lhs, ToRegister(rhs));
++ masm.as_extsw(dest, dest);
++ }
++ break;
++ case JSOp::BitXor:
++ if (rhs->isConstant()) {
++ masm.xor32(Imm32(ToInt32(rhs)), lhs, dest);
++ } else {
++ masm.as_xor_(dest, lhs, ToRegister(rhs));
++ masm.as_extsw(dest, dest);
++ }
++ break;
++ case JSOp::BitAnd:
++ if (rhs->isConstant()) {
++ masm.and32(Imm32(ToInt32(rhs)), lhs, dest);
++ } else {
++ masm.as_and_(dest, lhs, ToRegister(rhs));
++ masm.as_extsw(dest, dest);
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected binary opcode");
++ }
++}
++
++void CodeGenerator::visitBitOpI64(LBitOpI64* lir) {
++ Register dest = ToRegister(lir->getDef(0));
++ Register lhs = ToRegister(lir->getOperand(0));
++ const LAllocation* rhs = lir->getOperand(1);
++
++ switch (lir->bitop()) {
++ case JSOp::BitOr:
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.or64(Imm64(ToInt64(rhs)), Register64(dest));
++ } else {
++ masm.as_or_(dest, lhs, ToRegister(rhs));
++ }
++ break;
++ case JSOp::BitXor:
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.xor64(Imm64(ToInt64(rhs)), Register64(dest));
++ } else {
++ masm.as_xor_(dest, lhs, ToRegister(rhs));
++ }
++ break;
++ case JSOp::BitAnd:
++ if (rhs->isConstant()) {
++ if (lhs != dest) {
++ masm.movePtr(lhs, dest);
++ }
++ masm.and64(Imm64(ToInt64(rhs)), Register64(dest));
++ } else {
++ masm.as_and_(dest, lhs, ToRegister(rhs));
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected binary opcode");
++ }
++}
++
++void CodeGenerator::visitShiftI(LShiftI* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ const LAllocation* rhs = ins->rhs();
++ Register dest = ToRegister(ins->output());
++
++ if (rhs->isConstant()) {
++ int32_t shift = ToInt32(rhs) & 0x1f;
++ switch (ins->bitop()) {
++ case JSOp::Lsh:
++ if (shift) {
++ masm.lshift32(Imm32(shift), lhs, dest);
++ } else {
++ masm.move32(lhs, dest);
++ }
++ break;
++ case JSOp::Rsh:
++ if (shift) {
++ masm.rshift32Arithmetic(Imm32(shift), lhs, dest);
++ } else {
++ masm.move32(lhs, dest);
++ }
++ break;
++ case JSOp::Ursh:
++ if (shift) {
++ masm.rshift32(Imm32(shift), lhs, dest);
++ } else {
++ // x >>> 0 can produce values that need to be treated as unsigned.
++ masm.move32(lhs, dest);
++ }
++ if (ins->mir()->toUrsh()->fallible()) {
++ // x >>> 0 can produce values that don't fit in signed int32.
++ bailoutCmp32(Assembler::LessThan, dest, Imm32(0), ins->snapshot());
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected shift opcode");
++ }
++ } else {
++ Register shiftReg = ToRegister(rhs);
++ // PPC slw/srw/sraw use 6 bits of shift amount; JS requires mod 32.
++ UseScratchRegisterScope temps(masm);
++ Register masked = temps.Acquire();
++ masm.as_rlwinm(masked, shiftReg, 0, 27, 31);
++ switch (ins->bitop()) {
++ case JSOp::Lsh:
++ masm.as_slw(dest, lhs, masked);
++ masm.as_extsw(dest, dest);
++ break;
++ case JSOp::Rsh:
++ masm.as_sraw(dest, lhs, masked);
++ break;
++ case JSOp::Ursh:
++ masm.as_srw(dest, lhs, masked);
++ masm.as_extsw(dest, dest);
++ if (ins->mir()->toUrsh()->fallible()) {
++ bailoutCmp32(Assembler::LessThan, dest, Imm32(0), ins->snapshot());
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected shift opcode");
++ }
++ }
++}
++
++void CodeGenerator::visitShiftIntPtr(LShiftIntPtr* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ Register dest = ToRegister(ins->output());
++
++ if (ins->rhs()->isConstant()) {
++ // ShiftIntPtr's RHS constant is IntPtr- or Int32-typed, not Int64. Use
++ // ToIntPtr() which dispatches on the underlying MIRType (the previous
++ // MConstant::toInt64() call asserted when the constant wasn't Int64).
++ int32_t shift = int32_t(ToIntPtr(ins->rhs())) & 0x3f;
++ switch (ins->bitop()) {
++ case JSOp::Lsh:
++ if (shift) {
++ masm.lshiftPtr(Imm32(shift), lhs, dest);
++ } else {
++ masm.movePtr(lhs, dest);
++ }
++ break;
++ case JSOp::Rsh:
++ if (shift) {
++ masm.rshiftPtrArithmetic(Imm32(shift), lhs, dest);
++ } else {
++ masm.movePtr(lhs, dest);
++ }
++ break;
++ case JSOp::Ursh:
++ if (shift) {
++ masm.rshiftPtr(Imm32(shift), lhs, dest);
++ } else {
++ masm.movePtr(lhs, dest);
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected shift opcode");
++ }
++ } else {
++ Register shiftReg = ToRegister(ins->rhs());
++ // sld/srd/srad use the low 7 bits of the shift count: counts >= 64
++ // produce 0 (sign-fill for srad). Mask to 6 bits for mod-64 semantics.
++ UseScratchRegisterScope temps(masm);
++ Register masked = temps.Acquire();
++ masm.as_rldicl(masked, shiftReg, 0, 58);
++ switch (ins->bitop()) {
++ case JSOp::Lsh:
++ masm.as_sld(dest, lhs, masked);
++ break;
++ case JSOp::Rsh:
++ masm.as_srad(dest, lhs, masked);
++ break;
++ case JSOp::Ursh:
++ masm.as_srd(dest, lhs, masked);
++ break;
++ default:
++ MOZ_CRASH("unexpected shift opcode");
++ }
++ }
++}
++
++void CodeGenerator::visitShiftI64(LShiftI64* lir) {
++ Register lhs = ToRegister64(lir->lhs()).reg;
++ Register dest = ToOutRegister64(lir).reg;
++ const LAllocation* rhs = lir->rhs();
++
++ if (rhs->isConstant()) {
++ int32_t shift = int32_t(rhs->toConstant()->toInt64()) & 0x3f;
++ switch (lir->bitop()) {
++ case JSOp::Lsh:
++ if (shift) {
++ masm.lshiftPtr(Imm32(shift), lhs, dest);
++ } else {
++ masm.movePtr(lhs, dest);
++ }
++ break;
++ case JSOp::Rsh:
++ if (shift) {
++ masm.rshiftPtrArithmetic(Imm32(shift), lhs, dest);
++ } else {
++ masm.movePtr(lhs, dest);
++ }
++ break;
++ case JSOp::Ursh:
++ if (shift) {
++ masm.rshiftPtr(Imm32(shift), lhs, dest);
++ } else {
++ masm.movePtr(lhs, dest);
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected shift opcode");
++ }
++ } else {
++ Register shiftReg = ToRegister(rhs);
++ // Wasm i64 shifts require shift count modulo 64. PPC64 sld/srd/srad
++ // use a 7-bit shift field, so shifts >= 64 produce 0 (or sign-fill
++ // for srad). Mask to 6 bits first.
++ UseScratchRegisterScope temps(masm);
++ Register masked = temps.Acquire();
++ masm.as_rldicl(masked, shiftReg, 0, 58); // clrldi: keep low 6 bits
++ switch (lir->bitop()) {
++ case JSOp::Lsh:
++ masm.as_sld(dest, lhs, masked);
++ break;
++ case JSOp::Rsh:
++ masm.as_srad(dest, lhs, masked);
++ break;
++ case JSOp::Ursh:
++ masm.as_srd(dest, lhs, masked);
++ break;
++ default:
++ MOZ_CRASH("unexpected shift opcode");
++ }
++ }
++}
++
++void CodeGenerator::visitUrshD(LUrshD* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ const LAllocation* rhs = ins->rhs();
++ FloatRegister dest = ToFloatRegister(ins->output());
++
++ Register temp = ToRegister(ins->temp0());
++
++ if (rhs->isConstant()) {
++ int32_t shift = ToInt32(rhs) & 0x1f;
++ if (shift) {
++ masm.rshift32(Imm32(shift), lhs, temp);
++ } else {
++ masm.move32(lhs, temp);
++ }
++ } else {
++ masm.move32(lhs, temp);
++ masm.rshift32(ToRegister(rhs), temp);
++ }
++
++ masm.convertUInt32ToDouble(temp, dest);
++}
++
++// ===============================================================
++// Visitors: Floating-point arithmetic
++
++void CodeGenerator::visitMathD(LMathD* math) {
++ FloatRegister lhs = ToFloatRegister(math->lhs());
++ FloatRegister rhs = ToFloatRegister(math->rhs());
++ FloatRegister dest = ToFloatRegister(math->output());
++
++ switch (math->jsop()) {
++ case JSOp::Add:
++ masm.as_fadd(dest, lhs, rhs);
++ break;
++ case JSOp::Sub:
++ masm.as_fsub(dest, lhs, rhs);
++ break;
++ case JSOp::Mul:
++ masm.as_fmul(dest, lhs, rhs);
++ break;
++ case JSOp::Div:
++ masm.as_fdiv(dest, lhs, rhs);
++ break;
++ default:
++ MOZ_CRASH("unexpected double opcode");
++ }
++}
++
++void CodeGenerator::visitMathF(LMathF* math) {
++ FloatRegister lhs = ToFloatRegister(math->lhs());
++ FloatRegister rhs = ToFloatRegister(math->rhs());
++ FloatRegister dest = ToFloatRegister(math->output());
++
++ switch (math->jsop()) {
++ case JSOp::Add:
++ masm.as_fadds(dest, lhs, rhs);
++ break;
++ case JSOp::Sub:
++ masm.as_fsubs(dest, lhs, rhs);
++ break;
++ case JSOp::Mul:
++ masm.as_fmuls(dest, lhs, rhs);
++ break;
++ case JSOp::Div:
++ masm.as_fdivs(dest, lhs, rhs);
++ break;
++ default:
++ MOZ_CRASH("unexpected float32 opcode");
++ }
++}
++
++void CodeGenerator::visitMinMaxD(LMinMaxD* ins) {
++ FloatRegister first = ToFloatRegister(ins->first());
++ FloatRegister second = ToFloatRegister(ins->second());
++ mozilla::DebugOnly<FloatRegister> output = ToFloatRegister(ins->output());
++
++ MOZ_ASSERT(first == output);
++ if (ins->mir()->isMax()) {
++ masm.maxDouble(second, first, /* handleNaN = */ true);
++ } else {
++ masm.minDouble(second, first, /* handleNaN = */ true);
++ }
++}
++
++void CodeGenerator::visitMinMaxF(LMinMaxF* ins) {
++ FloatRegister first = ToFloatRegister(ins->first());
++ FloatRegister second = ToFloatRegister(ins->second());
++ mozilla::DebugOnly<FloatRegister> output = ToFloatRegister(ins->output());
++
++ MOZ_ASSERT(first == output);
++ if (ins->mir()->isMax()) {
++ masm.maxFloat32(second, first, /* handleNaN = */ true);
++ } else {
++ masm.minFloat32(second, first, /* handleNaN = */ true);
++ }
++}
++
++void CodeGenerator::visitNegD(LNegD* ins) {
++ FloatRegister input = ToFloatRegister(ins->input());
++ FloatRegister output = ToFloatRegister(ins->output());
++ masm.as_fneg(output, input);
++}
++
++void CodeGenerator::visitNegF(LNegF* ins) {
++ FloatRegister input = ToFloatRegister(ins->input());
++ FloatRegister output = ToFloatRegister(ins->output());
++ masm.as_fneg(output, input);
++}
++
++void CodeGenerator::visitPowHalfD(LPowHalfD* ins) {
++ FloatRegister input = ToFloatRegister(ins->input());
++ FloatRegister output = ToFloatRegister(ins->output());
++
++ Label done, skip;
++
++ // Check for -Infinity.
++ masm.loadConstantDouble(NegativeInfinity<double>(), ScratchDoubleReg);
++ masm.branchDouble(Assembler::DoubleNotEqualOrUnordered, input,
++ ScratchDoubleReg, &skip);
++ masm.loadConstantDouble(std::numeric_limits<double>::infinity(), output);
++ masm.jump(&done);
++
++ masm.bind(&skip);
++ // Add 0.0 to handle -0.
++ masm.loadConstantDouble(0.0, ScratchDoubleReg);
++ masm.as_fadd(output, input, ScratchDoubleReg);
++ masm.as_fsqrt(output, output);
++
++ masm.bind(&done);
++}
++
++void CodeGenerator::visitNotD(LNotD* ins) {
++ FloatRegister input = ToFloatRegister(ins->input());
++ Register dest = ToRegister(ins->output());
++
++ masm.loadConstantDouble(0.0, ScratchDoubleReg);
++ masm.as_fcmpu(input, ScratchDoubleReg);
++ masm.ma_cmp_set_dbl(dest, Assembler::DoubleEqualOrUnordered);
++}
++
++void CodeGenerator::visitNotF(LNotF* ins) {
++ FloatRegister input = ToFloatRegister(ins->input());
++ Register dest = ToRegister(ins->output());
++
++ masm.loadConstantFloat32(0.0f, ScratchFloat32Reg);
++ masm.as_fcmpu(input, ScratchFloat32Reg);
++ masm.ma_cmp_set_dbl(dest, Assembler::DoubleEqualOrUnordered);
++}
++
++// ===============================================================
++// Visitors: FP comparisons and branches
++
++void CodeGenerator::visitCompareD(LCompareD* comp) {
++ FloatRegister lhs = ToFloatRegister(comp->left());
++ FloatRegister rhs = ToFloatRegister(comp->right());
++ Register dest = ToRegister(comp->output());
++ Assembler::DoubleCondition cond =
++ comp->mir()->jsop() == JSOp::StrictEq ? Assembler::DoubleEqual
++ : comp->mir()->jsop() == JSOp::StrictNe
++ ? Assembler::DoubleNotEqualOrUnordered
++ : JSOpToDoubleCondition(comp->mir()->jsop());
++
++ masm.as_fcmpu(lhs, rhs);
++ masm.ma_cmp_set_dbl(dest, cond);
++}
++
++void CodeGenerator::visitCompareF(LCompareF* comp) {
++ FloatRegister lhs = ToFloatRegister(comp->left());
++ FloatRegister rhs = ToFloatRegister(comp->right());
++ Register dest = ToRegister(comp->output());
++ Assembler::DoubleCondition cond =
++ comp->mir()->jsop() == JSOp::StrictEq ? Assembler::DoubleEqual
++ : comp->mir()->jsop() == JSOp::StrictNe
++ ? Assembler::DoubleNotEqualOrUnordered
++ : JSOpToDoubleCondition(comp->mir()->jsop());
++
++ masm.as_fcmpu(lhs, rhs);
++ masm.ma_cmp_set_dbl(dest, cond);
++}
++
++void CodeGenerator::visitCompareDAndBranch(LCompareDAndBranch* comp) {
++ FloatRegister lhs = ToFloatRegister(comp->left());
++ FloatRegister rhs = ToFloatRegister(comp->right());
++
++ Assembler::DoubleCondition cond =
++ JSOpToDoubleCondition(comp->cmpMir()->jsop());
++ MBasicBlock* ifTrue = comp->ifTrue();
++ MBasicBlock* ifFalse = comp->ifFalse();
++
++ if (isNextBlock(ifFalse->lir())) {
++ branchToBlock(Assembler::DoubleFloat, cond, lhs, rhs, ifTrue);
++ } else {
++ branchToBlock(Assembler::DoubleFloat, Assembler::InvertCondition(cond), lhs,
++ rhs, ifFalse);
++ jumpToBlock(ifTrue);
++ }
++}
++
++void CodeGenerator::visitCompareFAndBranch(LCompareFAndBranch* comp) {
++ FloatRegister lhs = ToFloatRegister(comp->left());
++ FloatRegister rhs = ToFloatRegister(comp->right());
++
++ Assembler::DoubleCondition cond =
++ JSOpToDoubleCondition(comp->cmpMir()->jsop());
++ MBasicBlock* ifTrue = comp->ifTrue();
++ MBasicBlock* ifFalse = comp->ifFalse();
++
++ if (isNextBlock(ifFalse->lir())) {
++ branchToBlock(Assembler::SingleFloat, cond, lhs, rhs, ifTrue);
++ } else {
++ branchToBlock(Assembler::SingleFloat, Assembler::InvertCondition(cond), lhs,
++ rhs, ifFalse);
++ jumpToBlock(ifTrue);
++ }
++}
++
++void CodeGenerator::visitTestDAndBranch(LTestDAndBranch* test) {
++ FloatRegister input = ToFloatRegister(test->input());
++
++ MBasicBlock* ifTrue = test->ifTrue();
++ MBasicBlock* ifFalse = test->ifFalse();
++
++ masm.loadConstantDouble(0.0, ScratchDoubleReg);
++
++ if (isNextBlock(ifFalse->lir())) {
++ branchToBlock(Assembler::DoubleFloat, Assembler::DoubleNotEqual, input,
++ ScratchDoubleReg, ifTrue);
++ } else {
++ branchToBlock(Assembler::DoubleFloat, Assembler::DoubleEqualOrUnordered,
++ input, ScratchDoubleReg, ifFalse);
++ jumpToBlock(ifTrue);
++ }
++}
++
++void CodeGenerator::visitTestFAndBranch(LTestFAndBranch* test) {
++ FloatRegister input = ToFloatRegister(test->input());
++
++ MBasicBlock* ifTrue = test->ifTrue();
++ MBasicBlock* ifFalse = test->ifFalse();
++
++ masm.loadConstantFloat32(0.0f, ScratchFloat32Reg);
++
++ if (isNextBlock(ifFalse->lir())) {
++ branchToBlock(Assembler::SingleFloat, Assembler::DoubleNotEqual, input,
++ ScratchFloat32Reg, ifTrue);
++ } else {
++ branchToBlock(Assembler::SingleFloat, Assembler::DoubleEqualOrUnordered,
++ input, ScratchFloat32Reg, ifFalse);
++ jumpToBlock(ifTrue);
++ }
++}
++
++// ===============================================================
++// Visitors: Truncation
++
++void CodeGenerator::visitTruncateDToInt32(LTruncateDToInt32* ins) {
++ emitTruncateDouble(ToFloatRegister(ins->input()), ToRegister(ins->output()),
++ ins->mir());
++}
++
++void CodeGenerator::visitTruncateFToInt32(LTruncateFToInt32* ins) {
++ emitTruncateFloat32(ToFloatRegister(ins->input()), ToRegister(ins->output()),
++ ins->mir());
++}
++
++// ===============================================================
++// Visitors: Int64 / Wasm type conversions
++
++void CodeGenerator::visitExtendInt32ToInt64(LExtendInt32ToInt64* lir) {
++ Register input = ToRegister(lir->input());
++ Register output = ToRegister(lir->output());
++
++ if (lir->mir()->isUnsigned()) {
++ masm.move32To64ZeroExtend(input, Register64(output));
++ } else {
++ masm.as_extsw(output, input);
++ }
++}
++
++void CodeGenerator::visitWrapInt64ToInt32(LWrapInt64ToInt32* lir) {
++ const LInt64Allocation input = lir->input();
++ Register output = ToRegister(lir->output());
++
++ if (lir->mir()->bottomHalf()) {
++ if (input.value().isMemory()) {
++ masm.load32(ToAddress(input), output);
++ } else {
++ masm.move64To32(ToRegister64(input), output);
++ }
++ } else {
++ // The only producer of `bottomHalf=false` MWrapInt64ToInt32 in the
++ // current MIR pipeline is the GPR-pair argument splitter in
++ // WasmIonCompile.cpp, which is gated on JS_CODEGEN_REGISTER_PAIR
++ // (32-bit ARM only). PPC64 is 64-bit and never reaches this path.
++ // Matches the same defensive crash in x64 / ARM64 backends.
++ MOZ_CRASH("Not implemented.");
++ }
++}
++
++void CodeGenerator::visitSignExtendInt64(LSignExtendInt64* lir) {
++ Register64 input = ToRegister64(lir->input());
++ Register64 output = ToOutRegister64(lir);
++
++ switch (lir->mir()->mode()) {
++ case MSignExtendInt64::Byte:
++ masm.as_extsb(output.reg, input.reg);
++ break;
++ case MSignExtendInt64::Half:
++ masm.as_extsh(output.reg, input.reg);
++ break;
++ case MSignExtendInt64::Word:
++ masm.as_extsw(output.reg, input.reg);
++ break;
++ }
++}
++
++void CodeGenerator::visitWasmExtendU32Index(LWasmExtendU32Index* lir) {
++ Register input = ToRegister(lir->input());
++ Register output = ToRegister(lir->output());
++ masm.move32To64ZeroExtend(input, Register64(output));
++}
++
++void CodeGenerator::visitWasmWrapU32Index(LWasmWrapU32Index* lir) {
++ Register input = ToRegister(lir->input());
++ Register output = ToRegister(lir->output());
++ masm.move32(input, output);
++}
++
++void CodeGenerator::visitWasmTruncateToInt32(LWasmTruncateToInt32* lir) {
++ auto input = ToFloatRegister(lir->input());
++ auto output = ToRegister(lir->output());
++
++ MWasmTruncateToInt32* mir = lir->mir();
++ MIRType fromType = mir->input()->type();
++
++ MOZ_ASSERT(fromType == MIRType::Double || fromType == MIRType::Float32);
++
++ auto* ool = new (alloc()) OutOfLineWasmTruncateCheck(mir, input, output);
++ addOutOfLineCode(ool, mir);
++
++ Label* oolEntry = ool->entry();
++ if (mir->isUnsigned()) {
++ if (fromType == MIRType::Double) {
++ masm.wasmTruncateDoubleToUInt32(input, output, mir->isSaturating(),
++ oolEntry);
++ } else if (fromType == MIRType::Float32) {
++ masm.wasmTruncateFloat32ToUInt32(input, output, mir->isSaturating(),
++ oolEntry);
++ } else {
++ MOZ_CRASH("unexpected type");
++ }
++
++ masm.bind(ool->rejoin());
++ return;
++ }
++
++ if (fromType == MIRType::Double) {
++ masm.wasmTruncateDoubleToInt32(input, output, mir->isSaturating(),
++ oolEntry);
++ } else if (fromType == MIRType::Float32) {
++ masm.wasmTruncateFloat32ToInt32(input, output, mir->isSaturating(),
++ oolEntry);
++ } else {
++ MOZ_CRASH("unexpected type");
++ }
++
++ masm.bind(ool->rejoin());
++}
++
++void CodeGenerator::visitWasmTruncateToInt64(LWasmTruncateToInt64* lir) {
++ FloatRegister input = ToFloatRegister(lir->input());
++ Register64 output = ToOutRegister64(lir);
++
++ MWasmTruncateToInt64* mir = lir->mir();
++ MIRType fromType = mir->input()->type();
++
++ MOZ_ASSERT(fromType == MIRType::Double || fromType == MIRType::Float32);
++
++ auto* ool = new (alloc()) OutOfLineWasmTruncateCheck(mir, input, output);
++ addOutOfLineCode(ool, mir);
++
++ Label* oolEntry = ool->entry();
++ Label* oolRejoin = ool->rejoin();
++ bool isSaturating = mir->isSaturating();
++
++ if (fromType == MIRType::Double) {
++ if (mir->isUnsigned()) {
++ masm.wasmTruncateDoubleToUInt64(input, output, isSaturating, oolEntry,
++ oolRejoin, InvalidFloatReg);
++ } else {
++ masm.wasmTruncateDoubleToInt64(input, output, isSaturating, oolEntry,
++ oolRejoin, InvalidFloatReg);
++ }
++ } else {
++ if (mir->isUnsigned()) {
++ masm.wasmTruncateFloat32ToUInt64(input, output, isSaturating, oolEntry,
++ oolRejoin, InvalidFloatReg);
++ } else {
++ masm.wasmTruncateFloat32ToInt64(input, output, isSaturating, oolEntry,
++ oolRejoin, InvalidFloatReg);
++ }
++ }
++}
++
++void CodeGenerator::visitInt64ToFloatingPoint(LInt64ToFloatingPoint* lir) {
++ Register64 input = ToRegister64(lir->input());
++ FloatRegister output = ToFloatRegister(lir->output());
++ MIRType outputType = lir->mir()->type();
++
++ if (outputType == MIRType::Double) {
++ if (lir->mir()->isUnsigned()) {
++ masm.convertUInt64ToDouble(input, output, Register::Invalid());
++ } else {
++ masm.convertInt64ToDouble(input, output);
++ }
++ } else {
++ if (lir->mir()->isUnsigned()) {
++ masm.convertUInt64ToFloat32(input, output, Register::Invalid());
++ } else {
++ masm.convertInt64ToFloat32(input, output);
++ }
++ }
++}
++
++void CodeGenerator::visitWasmUint32ToDouble(LWasmUint32ToDouble* lir) {
++ Register input = ToRegister(lir->input());
++ FloatRegister output = ToFloatRegister(lir->output());
++ masm.convertUInt32ToDouble(input, output);
++}
++
++void CodeGenerator::visitWasmUint32ToFloat32(LWasmUint32ToFloat32* lir) {
++ Register input = ToRegister(lir->input());
++ FloatRegister output = ToFloatRegister(lir->output());
++ masm.convertUInt32ToFloat32(input, output);
++}
++
++void CodeGenerator::visitWasmBuiltinTruncateDToInt32(
++ LWasmBuiltinTruncateDToInt32* lir) {
++ emitTruncateDouble(ToFloatRegister(lir->getOperand(0)),
++ ToRegister(lir->getDef(0)), lir->mir());
++}
++
++void CodeGenerator::visitWasmBuiltinTruncateFToInt32(
++ LWasmBuiltinTruncateFToInt32* lir) {
++ emitTruncateFloat32(ToFloatRegister(lir->getOperand(0)),
++ ToRegister(lir->getDef(0)), lir->mir());
++}
++
++// ===============================================================
++// Visitors: Wasm load/store
++
++template <typename T>
++void CodeGeneratorPPC64::emitWasmLoad(T* lir) {
++ const MWasmLoad* mir = lir->mir();
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptr = ToRegister(lir->ptr());
++ Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++ if (mir->base()->type() == MIRType::Int32) {
++ masm.move32To64ZeroExtend(ptr, Register64(scratch));
++ ptr = scratch;
++ ptrScratch = ptrScratch != InvalidReg ? scratch : InvalidReg;
++ }
++
++ masm.wasmLoad(mir->access(), memoryBase, ptr, ptrScratch,
++ ToAnyRegister(lir->output()));
++}
++
++template <typename T>
++void CodeGeneratorPPC64::emitWasmStore(T* lir) {
++ const MWasmStore* mir = lir->mir();
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptr = ToRegister(lir->ptr());
++ Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++ if (mir->base()->type() == MIRType::Int32) {
++ masm.move32To64ZeroExtend(ptr, Register64(scratch));
++ ptr = scratch;
++ ptrScratch = ptrScratch != InvalidReg ? scratch : InvalidReg;
++ }
++
++ masm.wasmStore(mir->access(), ToAnyRegister(lir->value()), memoryBase, ptr,
++ ptrScratch);
++}
++
++void CodeGenerator::visitWasmLoad(LWasmLoad* lir) { emitWasmLoad(lir); }
++
++void CodeGenerator::visitWasmStore(LWasmStore* lir) { emitWasmStore(lir); }
++
++void CodeGenerator::visitWasmLoadI64(LWasmLoadI64* lir) {
++ const MWasmLoad* mir = lir->mir();
++
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++ Register ptrReg = ToRegister(lir->ptr());
++ if (mir->base()->type() == MIRType::Int32) {
++ masm.move32ZeroExtendToPtr(ptrReg, ptrReg);
++ }
++
++ masm.wasmLoadI64(mir->access(), memoryBase, ptrReg, ptrScratch,
++ ToOutRegister64(lir));
++}
++
++void CodeGenerator::visitWasmStoreI64(LWasmStoreI64* lir) {
++ const MWasmStore* mir = lir->mir();
++
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
++
++ Register ptrReg = ToRegister(lir->ptr());
++ if (mir->base()->type() == MIRType::Int32) {
++ masm.move32ZeroExtendToPtr(ptrReg, ptrReg);
++ }
++
++ masm.wasmStoreI64(mir->access(), ToRegister64(lir->value()), memoryBase,
++ ptrReg, ptrScratch);
++}
++
++void CodeGenerator::visitAsmJSLoadHeap(LAsmJSLoadHeap* ins) {
++ const MAsmJSLoadHeap* mir = ins->mir();
++ MOZ_ASSERT(!mir->hasMemoryBase());
++
++ const LAllocation* ptr = ins->ptr();
++ const LDefinition* output = ins->output();
++ const LAllocation* boundsCheckLimit = ins->boundsCheckLimit();
++
++ Register ptrReg = ToRegister(ptr);
++ Scalar::Type accessType = mir->accessType();
++ bool isFloat = accessType == Scalar::Float32 || accessType == Scalar::Float64;
++ Label done;
++
++ if (mir->needsBoundsCheck()) {
++ Label boundsCheckPassed;
++ Register boundsCheckLimitReg = ToRegister(boundsCheckLimit);
++ masm.wasmBoundsCheck32(Assembler::Below, ptrReg, boundsCheckLimitReg,
++ &boundsCheckPassed);
++ if (isFloat) {
++ if (accessType == Scalar::Float32) {
++ masm.loadConstantFloat32(GenericNaN(), ToFloatRegister(output));
++ } else {
++ masm.loadConstantDouble(GenericNaN(), ToFloatRegister(output));
++ }
++ } else {
++ masm.movePtr(ImmWord(0), ToRegister(output));
++ }
++ masm.jump(&done);
++ masm.bind(&boundsCheckPassed);
++ }
++
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.move32To64ZeroExtend(ptrReg, Register64(scratch));
++
++ switch (accessType) {
++ case Scalar::Int8:
++ masm.as_lbzx(ToRegister(output), HeapReg, scratch);
++ masm.as_extsb(ToRegister(output), ToRegister(output));
++ break;
++ case Scalar::Uint8:
++ masm.as_lbzx(ToRegister(output), HeapReg, scratch);
++ break;
++ case Scalar::Int16:
++ masm.as_lhax(ToRegister(output), HeapReg, scratch);
++ break;
++ case Scalar::Uint16:
++ masm.as_lhzx(ToRegister(output), HeapReg, scratch);
++ break;
++ case Scalar::Int32:
++ masm.as_lwzx(ToRegister(output), HeapReg, scratch);
++ masm.as_extsw(ToRegister(output), ToRegister(output));
++ break;
++ case Scalar::Uint32:
++ masm.as_lwzx(ToRegister(output), HeapReg, scratch);
++ break;
++ case Scalar::Float64:
++ masm.as_lfdx(ToFloatRegister(output), HeapReg, scratch);
++ break;
++ case Scalar::Float32:
++ masm.as_lfsx(ToFloatRegister(output), HeapReg, scratch);
++ break;
++ default:
++ MOZ_CRASH("unexpected array type");
++ }
++
++ if (done.used()) {
++ masm.bind(&done);
++ }
++}
++
++void CodeGenerator::visitAsmJSStoreHeap(LAsmJSStoreHeap* ins) {
++ const MAsmJSStoreHeap* mir = ins->mir();
++ MOZ_ASSERT(!mir->hasMemoryBase());
++
++ const LAllocation* value = ins->value();
++ const LAllocation* ptr = ins->ptr();
++ const LAllocation* boundsCheckLimit = ins->boundsCheckLimit();
++
++ Register ptrReg = ToRegister(ptr);
++
++ Label done;
++ if (mir->needsBoundsCheck()) {
++ Register boundsCheckLimitReg = ToRegister(boundsCheckLimit);
++ masm.wasmBoundsCheck32(Assembler::AboveOrEqual, ptrReg, boundsCheckLimitReg,
++ &done);
++ }
++
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.move32To64ZeroExtend(ptrReg, Register64(scratch));
++
++ switch (mir->accessType()) {
++ case Scalar::Int8:
++ case Scalar::Uint8:
++ masm.as_stbx(ToRegister(value), HeapReg, scratch);
++ break;
++ case Scalar::Int16:
++ case Scalar::Uint16:
++ masm.as_sthx(ToRegister(value), HeapReg, scratch);
++ break;
++ case Scalar::Int32:
++ case Scalar::Uint32:
++ masm.as_stwx(ToRegister(value), HeapReg, scratch);
++ break;
++ case Scalar::Float64:
++ masm.as_stfdx(ToFloatRegister(value), HeapReg, scratch);
++ break;
++ case Scalar::Float32:
++ masm.as_stfsx(ToFloatRegister(value), HeapReg, scratch);
++ break;
++ default:
++ MOZ_CRASH("unexpected array type");
++ }
++
++ if (done.used()) {
++ masm.bind(&done);
++ }
++}
++
++void CodeGenerator::visitWasmStackArg(LWasmStackArg* ins) {
++ const MWasmStackArg* mir = ins->mir();
++ if (ins->arg()->isConstant()) {
++ masm.storePtr(ImmWord(ToInt32(ins->arg())),
++ Address(StackPointer, mir->spOffset()));
++ } else {
++ if (ins->arg()->isGeneralReg()) {
++ masm.storePtr(ToRegister(ins->arg()),
++ Address(StackPointer, mir->spOffset()));
++ } else if (mir->input()->type() == MIRType::Double) {
++ masm.storeDouble(ToFloatRegister(ins->arg()),
++ Address(StackPointer, mir->spOffset()));
++#ifdef ENABLE_WASM_SIMD
++ } else if (mir->input()->type() == MIRType::Simd128) {
++ masm.storeUnalignedSimd128(ToFloatRegister(ins->arg()),
++ Address(StackPointer, mir->spOffset()));
++#endif
++ } else {
++ masm.storeFloat32(ToFloatRegister(ins->arg()),
++ Address(StackPointer, mir->spOffset()));
++ }
++ }
++}
++
++void CodeGenerator::visitWasmStackArgI64(LWasmStackArgI64* ins) {
++ const MWasmStackArg* mir = ins->mir();
++ Address dst(StackPointer, mir->spOffset());
++ if (IsConstant(ins->arg())) {
++ masm.store64(Imm64(ToInt64(ins->arg())), dst);
++ } else {
++ masm.store64(ToRegister64(ins->arg()), dst);
++ }
++}
++
++void CodeGenerator::visitWasmSelect(LWasmSelect* ins) {
++ MIRType mirType = ins->mir()->type();
++
++ Register cond = ToRegister(ins->condExpr());
++ const LAllocation* falseExpr = ins->falseExpr();
++
++ if (mirType == MIRType::Int32 || mirType == MIRType::WasmAnyRef) {
++ Register out = ToRegister(ins->output());
++ MOZ_ASSERT(ToRegister(ins->trueExpr()) == out,
++ "true expr input is reused for output");
++ if (falseExpr->isGeneralReg()) {
++ masm.moveIfZero(out, ToRegister(falseExpr), cond);
++ } else {
++ masm.cmp32Load32(Assembler::Zero, cond, cond, ToAddress(falseExpr), out);
++ }
++ return;
++ }
++
++ FloatRegister out = ToFloatRegister(ins->output());
++ MOZ_ASSERT(ToFloatRegister(ins->trueExpr()) == out,
++ "true expr input is reused for output");
++
++ if (falseExpr->isFloatReg()) {
++ Label done;
++ // The select condition is a 32-bit value; test 32 bits so high-bit garbage
++ // does not make a zero condition read as non-zero.
++ masm.branchTest32(Assembler::NonZero, cond, cond, &done);
++ if (mirType == MIRType::Float32) {
++ masm.moveFloat32(ToFloatRegister(falseExpr), out);
++ } else if (mirType == MIRType::Double) {
++ masm.moveDouble(ToFloatRegister(falseExpr), out);
++ } else if (mirType == MIRType::Simd128) {
++ masm.moveSimd128(ToFloatRegister(falseExpr), out);
++ } else {
++ MOZ_CRASH("unhandled type in visitWasmSelect!");
++ }
++ masm.bind(&done);
++ } else {
++ Label done;
++ // The select condition is a 32-bit value; test 32 bits so high-bit garbage
++ // does not make a zero condition read as non-zero.
++ masm.branchTest32(Assembler::NonZero, cond, cond, &done);
++
++ if (mirType == MIRType::Float32) {
++ masm.loadFloat32(ToAddress(falseExpr), out);
++ } else if (mirType == MIRType::Double) {
++ masm.loadDouble(ToAddress(falseExpr), out);
++ } else if (mirType == MIRType::Simd128) {
++ masm.loadUnalignedSimd128(ToAddress(falseExpr), out);
++ } else {
++ MOZ_CRASH("unhandled type in visitWasmSelect!");
++ }
++
++ masm.bind(&done);
++ }
++}
++
++void CodeGenerator::visitWasmSelectI64(LWasmSelectI64* lir) {
++ MOZ_ASSERT(lir->mir()->type() == MIRType::Int64);
++
++ Register cond = ToRegister(lir->condExpr());
++ LInt64Allocation falseExpr = lir->falseExpr();
++
++ Register64 out = ToOutRegister64(lir);
++ MOZ_ASSERT(ToRegister64(lir->trueExpr()) == out,
++ "true expr is reused for input");
++
++ if (falseExpr.value().isGeneralReg()) {
++ masm.moveIfZero(out.reg, ToRegister(falseExpr.value()), cond);
++ } else {
++ Label done;
++ // The select condition is a 32-bit value; test 32 bits so high-bit garbage
++ // does not make a zero condition read as non-zero.
++ masm.branchTest32(Assembler::NonZero, cond, cond, &done);
++ masm.loadPtr(ToAddress(falseExpr.value()), out.reg);
++ masm.bind(&done);
++ }
++}
++
++void CodeGenerator::visitWasmCompareAndSelect(LWasmCompareAndSelect* ins) {
++ MCompare::CompareType compTy = ins->compareType();
++ MIRType insTy = ins->mir()->type();
++ const bool cmpIs32 = compTy == MCompare::Compare_Int32 ||
++ compTy == MCompare::Compare_UInt32;
++ const bool cmpIs64 = compTy == MCompare::Compare_Int64 ||
++ compTy == MCompare::Compare_UInt64;
++ const bool selIsInt = insTy == MIRType::Int32 || insTy == MIRType::Int64;
++
++ MOZ_RELEASE_ASSERT(
++ (cmpIs32 || cmpIs64) && selIsInt,
++ "CodeGenerator::visitWasmCompareAndSelect: unexpected types");
++
++ Register trueExprAndDest = ToRegister(ins->output());
++ MOZ_ASSERT(ToRegister(ins->ifTrueExpr()) == trueExprAndDest,
++ "true expr input is reused for output");
++
++ Assembler::Condition cond =
++ Assembler::InvertCondition(JSOpToCondition(compTy, ins->jsop()));
++ Register lhs = ToRegister(ins->leftExpr());
++ Register rhs = ToRegister(ins->rightExpr());
++ Register falseExpr = ToRegister(ins->ifFalseExpr());
++
++ // isel operates on the whole 64-bit GPR regardless of compare width; only
++ // the compare instruction differs (cmpw/cmplw vs cmpd/cmpld).
++ if (cmpIs32) {
++ masm.cmp32Move32(cond, lhs, rhs, falseExpr, trueExprAndDest);
++ } else {
++ masm.cmpPtrMovePtr(cond, lhs, rhs, falseExpr, trueExprAndDest);
++ }
++}
++
++void CodeGenerator::visitWasmAddOffset(LWasmAddOffset* lir) {
++ MWasmAddOffset* mir = lir->mir();
++ Register base = ToRegister(lir->base());
++ Register out = ToRegister(lir->output());
++
++ Label ok;
++ masm.ma_add32TestCarry(Assembler::CarryClear, out, base, Imm32(mir->offset()),
++ &ok);
++ masm.wasmTrap(wasm::Trap::OutOfBounds, mir->trapSiteDesc());
++ masm.bind(&ok);
++}
++
++void CodeGenerator::visitWasmAddOffset64(LWasmAddOffset64* lir) {
++ MWasmAddOffset* mir = lir->mir();
++ Register64 base = ToRegister64(lir->base());
++ Register64 out = ToOutRegister64(lir);
++
++ Label ok;
++ masm.ma_addPtrTestCarry(Assembler::CarryClear, out.reg, base.reg,
++ ImmWord(mir->offset()), &ok);
++ masm.wasmTrap(wasm::Trap::OutOfBounds, mir->trapSiteDesc());
++ masm.bind(&ok);
++}
++
++// ===============================================================
++// Visitors: Effective Address
++
++void CodeGenerator::visitEffectiveAddress2(LEffectiveAddress2* ins) {
++ const MEffectiveAddress2* mir = ins->mir();
++ Register output = ToRegister(ins->output());
++
++ // EA = index * scale + displacement (no base register)
++ masm.movePtr(ImmWord(0), output);
++ BaseIndex addr(output, ToRegister(ins->index()), mir->scale(),
++ mir->displacement());
++ masm.computeEffectiveAddress(addr, output);
++ // Sign-extend to 32-bit
++ masm.as_extsw(output, output);
++}
++
++void CodeGenerator::visitEffectiveAddress3(LEffectiveAddress3* ins) {
++ const MEffectiveAddress3* mir = ins->mir();
++ Register output = ToRegister(ins->output());
++
++ BaseIndex addr(ToRegister(ins->base()), ToRegister(ins->index()),
++ mir->scale(), mir->displacement());
++ masm.computeEffectiveAddress(addr, output);
++ // Sign-extend to 32-bit
++ masm.as_extsw(output, output);
++}
++
++void CodeGenerator::visitWasmMulI64WideHI64(LWasmMulI64WideHI64* ins) {
++ Register lhs = ToRegister(ins->lhs());
++ Register rhs = ToRegister(ins->rhs());
++ Register output = ToRegister(ins->output());
++
++ if (ins->isSigned()) {
++ masm.as_mulhd(output, lhs, rhs);
++ } else {
++ masm.as_mulhdu(output, lhs, rhs);
++ }
++}
++
++// ===============================================================
++// Visitors: Typed Array Atomics
++
++void CodeGenerator::visitCompareExchangeTypedArrayElement(
++ LCompareExchangeTypedArrayElement* lir) {
++ Register elements = ToRegister(lir->elements());
++ AnyRegister output = ToAnyRegister(lir->output());
++ Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
++
++ Register oldval = ToRegister(lir->oldval());
++ Register newval = ToRegister(lir->newval());
++ Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
++ Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
++ Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
++ Scalar::Type arrayType = lir->mir()->arrayType();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ dest.match([&](const auto& dest) {
++ masm.compareExchangeJS(arrayType, Synchronization::Full(), dest, oldval,
++ newval, valueTemp, offsetTemp, maskTemp, outTemp,
++ output);
++ });
++}
++
++void CodeGenerator::visitAtomicExchangeTypedArrayElement(
++ LAtomicExchangeTypedArrayElement* lir) {
++ Register elements = ToRegister(lir->elements());
++ AnyRegister output = ToAnyRegister(lir->output());
++ Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
++
++ Register value = ToRegister(lir->value());
++ Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
++ Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
++ Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
++ Scalar::Type arrayType = lir->mir()->arrayType();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ dest.match([&](const auto& dest) {
++ masm.atomicExchangeJS(arrayType, Synchronization::Full(), dest, value,
++ valueTemp, offsetTemp, maskTemp, outTemp, output);
++ });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinop(
++ LAtomicTypedArrayElementBinop* lir) {
++ MOZ_ASSERT(!lir->mir()->isForEffect());
++
++ AnyRegister output = ToAnyRegister(lir->output());
++ Register elements = ToRegister(lir->elements());
++ Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
++ Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
++ Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
++ Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
++ Register value = ToRegister(lir->value());
++ Scalar::Type arrayType = lir->mir()->arrayType();
++
++ auto mem = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ mem.match([&](const auto& mem) {
++ masm.atomicFetchOpJS(arrayType, Synchronization::Full(),
++ lir->mir()->operation(), value, mem, valueTemp,
++ offsetTemp, maskTemp, outTemp, output);
++ });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinopForEffect(
++ LAtomicTypedArrayElementBinopForEffect* lir) {
++ MOZ_ASSERT(lir->mir()->isForEffect());
++
++ Register elements = ToRegister(lir->elements());
++ Register valueTemp = ToTempRegisterOrInvalid(lir->temp0());
++ Register offsetTemp = ToTempRegisterOrInvalid(lir->temp1());
++ Register maskTemp = ToTempRegisterOrInvalid(lir->temp2());
++ Register value = ToRegister(lir->value());
++ Scalar::Type arrayType = lir->mir()->arrayType();
++
++ auto mem = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ mem.match([&](const auto& mem) {
++ masm.atomicEffectOpJS(arrayType, Synchronization::Full(),
++ lir->mir()->operation(), value, mem, valueTemp,
++ offsetTemp, maskTemp);
++ });
++}
++
++void CodeGenerator::visitCompareExchangeTypedArrayElement64(
++ LCompareExchangeTypedArrayElement64* lir) {
++ Register elements = ToRegister(lir->elements());
++ Register64 oldval = ToRegister64(lir->oldval());
++ Register64 newval = ToRegister64(lir->newval());
++ Register64 out = ToOutRegister64(lir);
++ Scalar::Type arrayType = lir->mir()->arrayType();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ dest.match([&](const auto& dest) {
++ masm.compareExchange64(Synchronization::Full(), dest, oldval, newval, out);
++ });
++}
++
++void CodeGenerator::visitAtomicExchangeTypedArrayElement64(
++ LAtomicExchangeTypedArrayElement64* lir) {
++ Register elements = ToRegister(lir->elements());
++ Register64 value = ToRegister64(lir->value());
++ Register64 out = ToOutRegister64(lir);
++ Scalar::Type arrayType = lir->mir()->arrayType();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ dest.match([&](const auto& dest) {
++ masm.atomicExchange64(Synchronization::Full(), dest, value, out);
++ });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinop64(
++ LAtomicTypedArrayElementBinop64* lir) {
++ MOZ_ASSERT(lir->mir()->hasUses());
++
++ Register elements = ToRegister(lir->elements());
++ Register64 value = ToRegister64(lir->value());
++ Register64 temp = ToRegister64(lir->temp0());
++ Register64 out = ToOutRegister64(lir);
++
++ Scalar::Type arrayType = lir->mir()->arrayType();
++ AtomicOp atomicOp = lir->mir()->operation();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ dest.match([&](const auto& dest) {
++ masm.atomicFetchOp64(Synchronization::Full(), atomicOp, value, dest, temp,
++ out);
++ });
++}
++
++void CodeGenerator::visitAtomicTypedArrayElementBinopForEffect64(
++ LAtomicTypedArrayElementBinopForEffect64* lir) {
++ MOZ_ASSERT(!lir->mir()->hasUses());
++
++ Register elements = ToRegister(lir->elements());
++ Register64 value = ToRegister64(lir->value());
++ Register64 temp = ToRegister64(lir->temp0());
++
++ Scalar::Type arrayType = lir->mir()->arrayType();
++ AtomicOp atomicOp = lir->mir()->operation();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
++
++ dest.match([&](const auto& dest) {
++ masm.atomicEffectOp64(Synchronization::Full(), atomicOp, value, dest, temp);
++ });
++}
++
++void CodeGenerator::visitAtomicLoad64(LAtomicLoad64* lir) {
++ Register elements = ToRegister(lir->elements());
++ Register64 out = ToOutRegister64(lir);
++ Scalar::Type storageType = lir->mir()->storageType();
++
++ auto source = ToAddressOrBaseIndex(elements, lir->index(), storageType);
++
++ auto sync = Synchronization::Load();
++ masm.memoryBarrierBefore(sync);
++ source.match([&](const auto& source) { masm.load64(source, out); });
++ masm.memoryBarrierAfter(sync);
++}
++
++void CodeGenerator::visitAtomicStore64(LAtomicStore64* lir) {
++ Register elements = ToRegister(lir->elements());
++ Register64 value = ToRegister64(lir->value());
++ Scalar::Type writeType = lir->mir()->writeType();
++
++ auto dest = ToAddressOrBaseIndex(elements, lir->index(), writeType);
++
++ auto sync = Synchronization::Store();
++ masm.memoryBarrierBefore(sync);
++ dest.match([&](const auto& dest) { masm.store64(value, dest); });
++ masm.memoryBarrierAfter(sync);
++}
++
++// Wasm Atomics
++void CodeGenerator::visitWasmCompareExchangeHeap(
++ LWasmCompareExchangeHeap* ins) {
++ MWasmCompareExchangeHeap* mir = ins->mir();
++ Register memoryBase = ToRegister(ins->memoryBase());
++ Register ptrReg = ToRegister(ins->ptr());
++ BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++
++ Register oldval = ToRegister(ins->oldValue());
++ Register newval = ToRegister(ins->newValue());
++ Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++ Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++ Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++ masm.wasmCompareExchange(mir->access(), srcAddr, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, ToRegister(ins->output()));
++}
++
++void CodeGenerator::visitWasmAtomicExchangeHeap(LWasmAtomicExchangeHeap* ins) {
++ MWasmAtomicExchangeHeap* mir = ins->mir();
++ Register memoryBase = ToRegister(ins->memoryBase());
++ Register ptrReg = ToRegister(ins->ptr());
++ Register value = ToRegister(ins->value());
++ BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++
++ Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++ Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++ Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++ masm.wasmAtomicExchange(mir->access(), srcAddr, value, valueTemp, offsetTemp,
++ maskTemp, ToRegister(ins->output()));
++}
++
++void CodeGenerator::visitWasmAtomicBinopHeap(LWasmAtomicBinopHeap* ins) {
++ MOZ_ASSERT(ins->mir()->hasUses());
++
++ MWasmAtomicBinopHeap* mir = ins->mir();
++ Register memoryBase = ToRegister(ins->memoryBase());
++ Register ptrReg = ToRegister(ins->ptr());
++ Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++ Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++ Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++ BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++
++ masm.wasmAtomicFetchOp(mir->access(), mir->operation(),
++ ToRegister(ins->value()), srcAddr, valueTemp,
++ offsetTemp, maskTemp, ToRegister(ins->output()));
++}
++
++void CodeGenerator::visitWasmAtomicBinopHeapForEffect(
++ LWasmAtomicBinopHeapForEffect* ins) {
++ MOZ_ASSERT(!ins->mir()->hasUses());
++
++ MWasmAtomicBinopHeap* mir = ins->mir();
++ Register memoryBase = ToRegister(ins->memoryBase());
++ Register ptrReg = ToRegister(ins->ptr());
++ Register valueTemp = ToTempRegisterOrInvalid(ins->temp0());
++ Register offsetTemp = ToTempRegisterOrInvalid(ins->temp1());
++ Register maskTemp = ToTempRegisterOrInvalid(ins->temp2());
++
++ BaseIndex srcAddr(memoryBase, ptrReg, TimesOne, mir->access().offset32());
++ masm.wasmAtomicEffectOp(mir->access(), mir->operation(),
++ ToRegister(ins->value()), srcAddr, valueTemp,
++ offsetTemp, maskTemp);
++}
++
++void CodeGenerator::visitWasmCompareExchangeI64(LWasmCompareExchangeI64* lir) {
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptr = ToRegister(lir->ptr());
++ Register64 oldValue = ToRegister64(lir->oldValue());
++ Register64 newValue = ToRegister64(lir->newValue());
++ Register64 output = ToOutRegister64(lir);
++ uint32_t offset = lir->mir()->access().offset32();
++
++ BaseIndex addr(memoryBase, ptr, TimesOne, offset);
++ masm.wasmCompareExchange64(lir->mir()->access(), addr, oldValue, newValue,
++ output);
++}
++
++void CodeGenerator::visitWasmAtomicExchangeI64(LWasmAtomicExchangeI64* lir) {
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptr = ToRegister(lir->ptr());
++ Register64 value = ToRegister64(lir->value());
++ Register64 output = ToOutRegister64(lir);
++ uint32_t offset = lir->mir()->access().offset32();
++
++ BaseIndex addr(memoryBase, ptr, TimesOne, offset);
++ masm.wasmAtomicExchange64(lir->mir()->access(), addr, value, output);
++}
++
++void CodeGenerator::visitWasmAtomicBinopI64(LWasmAtomicBinopI64* lir) {
++ Register memoryBase = ToRegister(lir->memoryBase());
++ Register ptr = ToRegister(lir->ptr());
++ Register64 value = ToRegister64(lir->value());
++ Register64 output = ToOutRegister64(lir);
++ Register64 temp = ToRegister64(lir->temp0());
++ uint32_t offset = lir->mir()->access().offset32();
++
++ BaseIndex addr(memoryBase, ptr, TimesOne, offset);
++
++ masm.wasmAtomicFetchOp64(lir->mir()->access(), lir->mir()->operation(), value,
++ addr, temp, output);
++}
++
++// SIMD code generators.
++void CodeGenerator::visitSimd128(LSimd128* ins) {
++ FloatRegister dest = ToFloatRegister(ins->output());
++ masm.loadConstantSimd128(ins->simd128(), dest);
++}
++void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
++ FloatRegister v0 = ToFloatRegister(ins->v0());
++ FloatRegister v1 = ToFloatRegister(ins->v1());
++ FloatRegister v2 = ToFloatRegister(ins->v2());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ switch (ins->simdOp()) {
++ case wasm::SimdOp::V128Bitselect:
++ // bitselect(v0, v1, v2): result = (v0 & v2) | (v1 & ~v2)
++ // xxsel: XC=0→XA, XC=1→XB → (XA & ~XC) | (XB & XC)
++ // Need XA=v1, XB=v0, XC=v2.
++ masm.as_xxsel(dest, v1, v0, v2);
++ break;
++ case wasm::SimdOp::I8x16RelaxedLaneSelect:
++ case wasm::SimdOp::I16x8RelaxedLaneSelect:
++ case wasm::SimdOp::I32x4RelaxedLaneSelect:
++ case wasm::SimdOp::I64x2RelaxedLaneSelect:
++ // relaxed laneSelect(v0, v1, mask=v2): same as bitselect
++ masm.as_xxsel(dest, v1, v0, v2);
++ break;
++ // Lowering uses defineReuseInput on V2Index for ternary ops — the
++ // allocator is required to place `dest` in v2's slot. Assert that
++ // here; the FMA/dot helpers write their result through v2 in-place,
++ // so dest == v2 makes the trailing moveSimd128 unnecessary.
++ case wasm::SimdOp::I32x4RelaxedDotI8x16I7x16AddS:
++ MOZ_ASSERT(dest == v2);
++ masm.dotInt8x16Int7x16ThenAdd(v0, v1, v2,
++ ToFloatRegister(ins->temp0()));
++ break;
++ case wasm::SimdOp::F32x4RelaxedMadd:
++ MOZ_ASSERT(dest == v2);
++ masm.fmaFloat32x4(v0, v1, v2);
++ break;
++ case wasm::SimdOp::F64x2RelaxedMadd:
++ MOZ_ASSERT(dest == v2);
++ masm.fmaFloat64x2(v0, v1, v2);
++ break;
++ case wasm::SimdOp::F32x4RelaxedNmadd:
++ MOZ_ASSERT(dest == v2);
++ masm.fnmaFloat32x4(v0, v1, v2);
++ break;
++ case wasm::SimdOp::F64x2RelaxedNmadd:
++ MOZ_ASSERT(dest == v2);
++ masm.fnmaFloat64x2(v0, v1, v2);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD ternary op");
++ }
++}
++void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
++ FloatRegister lhs = ToFloatRegister(ins->lhs());
++ FloatRegister rhs = ToFloatRegister(ins->rhs());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ switch (ins->simdOp()) {
++ // Bitwise
++ case wasm::SimdOp::V128And:
++ masm.bitwiseAndSimd128(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::V128Or:
++ masm.bitwiseOrSimd128(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::V128Xor:
++ masm.bitwiseXorSimd128(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::V128AndNot:
++ masm.bitwiseAndNotSimd128(lhs, rhs, dest);
++ break;
++ // Integer add
++ case wasm::SimdOp::I8x16Add:
++ masm.addInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8Add:
++ masm.addInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4Add:
++ masm.addInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2Add:
++ masm.addInt64x2(lhs, rhs, dest);
++ break;
++ // Integer sub
++ case wasm::SimdOp::I8x16Sub:
++ masm.subInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8Sub:
++ masm.subInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4Sub:
++ masm.subInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2Sub:
++ masm.subInt64x2(lhs, rhs, dest);
++ break;
++ // Saturating add
++ case wasm::SimdOp::I8x16AddSatS:
++ masm.addSatInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16AddSatU:
++ masm.unsignedAddSatInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8AddSatS:
++ masm.addSatInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8AddSatU:
++ masm.unsignedAddSatInt16x8(lhs, rhs, dest);
++ break;
++ // Saturating sub
++ case wasm::SimdOp::I8x16SubSatS:
++ masm.subSatInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16SubSatU:
++ masm.unsignedSubSatInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8SubSatS:
++ masm.subSatInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8SubSatU:
++ masm.unsignedSubSatInt16x8(lhs, rhs, dest);
++ break;
++ // Integer multiply
++ case wasm::SimdOp::I16x8Mul:
++ masm.mulInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4Mul:
++ masm.mulInt32x4(lhs, rhs, dest);
++ break;
++ // Integer min/max signed
++ case wasm::SimdOp::I8x16MinS:
++ masm.minInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16MaxS:
++ masm.maxInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8MinS:
++ masm.minInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8MaxS:
++ masm.maxInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4MinS:
++ masm.minInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4MaxS:
++ masm.maxInt32x4(lhs, rhs, dest);
++ break;
++ // Integer min/max unsigned
++ case wasm::SimdOp::I8x16MinU:
++ masm.unsignedMinInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16MaxU:
++ masm.unsignedMaxInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8MinU:
++ masm.unsignedMinInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8MaxU:
++ masm.unsignedMaxInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4MinU:
++ masm.unsignedMinInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4MaxU:
++ masm.unsignedMaxInt32x4(lhs, rhs, dest);
++ break;
++ // Average unsigned
++ case wasm::SimdOp::I8x16AvgrU:
++ masm.unsignedAverageInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8AvgrU:
++ masm.unsignedAverageInt16x8(lhs, rhs, dest);
++ break;
++ // Q15 multiply
++ case wasm::SimdOp::I16x8Q15MulrSatS:
++ masm.q15MulrSatInt16x8(lhs, rhs, dest);
++ break;
++ // Integer compare
++ case wasm::SimdOp::I8x16Eq:
++ masm.compareInt8x16(Assembler::Equal, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16Ne:
++ masm.compareInt8x16(Assembler::NotEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16LtS:
++ masm.compareInt8x16(Assembler::LessThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16GtS:
++ masm.compareInt8x16(Assembler::GreaterThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16LeS:
++ masm.compareInt8x16(Assembler::LessThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16GeS:
++ masm.compareInt8x16(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16LtU:
++ masm.compareInt8x16(Assembler::Below, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16GtU:
++ masm.compareInt8x16(Assembler::Above, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16LeU:
++ masm.compareInt8x16(Assembler::BelowOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16GeU:
++ masm.compareInt8x16(Assembler::AboveOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8Eq:
++ masm.compareInt16x8(Assembler::Equal, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8Ne:
++ masm.compareInt16x8(Assembler::NotEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8LtS:
++ masm.compareInt16x8(Assembler::LessThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8GtS:
++ masm.compareInt16x8(Assembler::GreaterThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8LeS:
++ masm.compareInt16x8(Assembler::LessThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8GeS:
++ masm.compareInt16x8(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8LtU:
++ masm.compareInt16x8(Assembler::Below, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8GtU:
++ masm.compareInt16x8(Assembler::Above, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8LeU:
++ masm.compareInt16x8(Assembler::BelowOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8GeU:
++ masm.compareInt16x8(Assembler::AboveOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4Eq:
++ masm.compareInt32x4(Assembler::Equal, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4Ne:
++ masm.compareInt32x4(Assembler::NotEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4LtS:
++ masm.compareInt32x4(Assembler::LessThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4GtS:
++ masm.compareInt32x4(Assembler::GreaterThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4LeS:
++ masm.compareInt32x4(Assembler::LessThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4GeS:
++ masm.compareInt32x4(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4LtU:
++ masm.compareInt32x4(Assembler::Below, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4GtU:
++ masm.compareInt32x4(Assembler::Above, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4LeU:
++ masm.compareInt32x4(Assembler::BelowOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4GeU:
++ masm.compareInt32x4(Assembler::AboveOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2Eq:
++ masm.compareInt64x2(Assembler::Equal, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2Ne:
++ masm.compareInt64x2(Assembler::NotEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2LtS:
++ masm.compareInt64x2(Assembler::LessThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2GtS:
++ masm.compareInt64x2(Assembler::GreaterThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2LeS:
++ masm.compareInt64x2(Assembler::LessThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2GeS:
++ masm.compareInt64x2(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++ break;
++ // Float compare
++ case wasm::SimdOp::F32x4Eq:
++ masm.compareFloat32x4(Assembler::Equal, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Ne:
++ masm.compareFloat32x4(Assembler::NotEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Lt:
++ masm.compareFloat32x4(Assembler::LessThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Gt:
++ masm.compareFloat32x4(Assembler::GreaterThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Le:
++ masm.compareFloat32x4(Assembler::LessThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Ge:
++ masm.compareFloat32x4(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Eq:
++ masm.compareFloat64x2(Assembler::Equal, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Ne:
++ masm.compareFloat64x2(Assembler::NotEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Lt:
++ masm.compareFloat64x2(Assembler::LessThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Gt:
++ masm.compareFloat64x2(Assembler::GreaterThan, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Le:
++ masm.compareFloat64x2(Assembler::LessThanOrEqual, lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Ge:
++ masm.compareFloat64x2(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
++ break;
++ // Float arithmetic
++ case wasm::SimdOp::F32x4Add:
++ masm.addFloat32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Sub:
++ masm.subFloat32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Mul:
++ masm.mulFloat32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Div:
++ masm.divFloat32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4Min:
++ masm.minFloat32x4(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++ ToFloatRegister(ins->getTemp(1)));
++ break;
++ case wasm::SimdOp::F32x4Max:
++ masm.maxFloat32x4(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++ ToFloatRegister(ins->getTemp(1)));
++ break;
++ case wasm::SimdOp::F32x4PMin:
++ masm.pseudoMinFloat32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F32x4PMax:
++ masm.pseudoMaxFloat32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Add:
++ masm.addFloat64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Sub:
++ masm.subFloat64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Mul:
++ masm.mulFloat64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Div:
++ masm.divFloat64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2Min:
++ masm.minFloat64x2(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++ ToFloatRegister(ins->getTemp(1)));
++ break;
++ case wasm::SimdOp::F64x2Max:
++ masm.maxFloat64x2(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
++ ToFloatRegister(ins->getTemp(1)));
++ break;
++ case wasm::SimdOp::F64x2PMin:
++ masm.pseudoMinFloat64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::F64x2PMax:
++ masm.pseudoMaxFloat64x2(lhs, rhs, dest);
++ break;
++ // Narrow
++ case wasm::SimdOp::I8x16NarrowI16x8S:
++ masm.narrowInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16NarrowI16x8U:
++ masm.unsignedNarrowInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8NarrowI32x4S:
++ masm.narrowInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8NarrowI32x4U:
++ masm.unsignedNarrowInt32x4(lhs, rhs, dest);
++ break;
++ // i64 multiply
++ case wasm::SimdOp::I64x2Mul: {
++ FloatRegister temp0 = ToTempFloatRegisterOrInvalid(ins->temp0());
++ FloatRegister temp1f = ToTempFloatRegisterOrInvalid(ins->temp1());
++ masm.mulInt64x2(lhs, rhs, dest, temp0, temp1f);
++ break;
++ }
++ // Extended multiply
++ case wasm::SimdOp::I16x8ExtmulLowI8x16S:
++ masm.extMulLowInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtmulHighI8x16S:
++ masm.extMulHighInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtmulLowI8x16U:
++ masm.unsignedExtMulLowInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtmulHighI8x16U:
++ masm.unsignedExtMulHighInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtmulLowI16x8S:
++ masm.extMulLowInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtmulHighI16x8S:
++ masm.extMulHighInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtmulLowI16x8U:
++ masm.unsignedExtMulLowInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtmulHighI16x8U:
++ masm.unsignedExtMulHighInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtmulLowI32x4S:
++ masm.extMulLowInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtmulHighI32x4S:
++ masm.extMulHighInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtmulLowI32x4U:
++ masm.unsignedExtMulLowInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtmulHighI32x4U:
++ masm.unsignedExtMulHighInt32x4(lhs, rhs, dest);
++ break;
++ // Dot product
++ case wasm::SimdOp::I32x4DotI16x8S:
++ masm.widenDotInt16x8(lhs, rhs, dest);
++ break;
++ // Relaxed binary ops
++ case wasm::SimdOp::F32x4RelaxedMin:
++ masm.minFloat32x4Relaxed(rhs, lhs);
++ if (dest != lhs) masm.moveSimd128(lhs, dest);
++ break;
++ case wasm::SimdOp::F32x4RelaxedMax:
++ masm.maxFloat32x4Relaxed(rhs, lhs);
++ if (dest != lhs) masm.moveSimd128(lhs, dest);
++ break;
++ case wasm::SimdOp::F64x2RelaxedMin:
++ masm.minFloat64x2Relaxed(rhs, lhs);
++ if (dest != lhs) masm.moveSimd128(lhs, dest);
++ break;
++ case wasm::SimdOp::F64x2RelaxedMax:
++ masm.maxFloat64x2Relaxed(rhs, lhs);
++ if (dest != lhs) masm.moveSimd128(lhs, dest);
++ break;
++ case wasm::SimdOp::I8x16RelaxedSwizzle:
++ masm.swizzleInt8x16Relaxed(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8RelaxedQ15MulrS:
++ masm.q15MulrInt16x8Relaxed(lhs, rhs, dest);
++ break;
++ // Swizzle
++ case wasm::SimdOp::I8x16Swizzle:
++ masm.swizzleInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8RelaxedDotI8x16I7x16S:
++ masm.dotInt8x16Int7x16(lhs, rhs, dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD binary op");
++ }
++}
++void CodeGenerator::visitWasmBinarySimd128WithConstant(
++ LWasmBinarySimd128WithConstant* ins) {
++ FloatRegister lhs = ToFloatRegister(ins->lhs());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ SimdConstant rhs = ins->rhs();
++ // Load the constant into scratch, then use the binary op.
++ ScratchSimd128Scope scratch(masm);
++ masm.loadConstantSimd128(rhs, scratch);
++ switch (ins->mir()->simdOp()) {
++ // Bitwise
++ case wasm::SimdOp::V128And:
++ masm.bitwiseAndSimd128(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::V128Or:
++ masm.bitwiseOrSimd128(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::V128Xor:
++ masm.bitwiseXorSimd128(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::V128AndNot:
++ masm.bitwiseAndNotSimd128(lhs, scratch, dest);
++ break;
++ // Integer add
++ case wasm::SimdOp::I8x16Add:
++ masm.addInt8x16(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I16x8Add:
++ masm.addInt16x8(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I32x4Add:
++ masm.addInt32x4(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I64x2Add:
++ masm.addInt64x2(lhs, scratch, dest);
++ break;
++ // Integer sub
++ case wasm::SimdOp::I8x16Sub:
++ masm.subInt8x16(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I16x8Sub:
++ masm.subInt16x8(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I32x4Sub:
++ masm.subInt32x4(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I64x2Sub:
++ masm.subInt64x2(lhs, scratch, dest);
++ break;
++ // Integer multiply (16-/32-bit lanes; I64x2 unreachable, see below)
++ case wasm::SimdOp::I16x8Mul:
++ masm.mulInt16x8(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I32x4Mul:
++ masm.mulInt32x4(lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I64x2Mul:
++ // Unreachable on PPC64: MWasmBinarySimd128::specializeForConstantRhs
++ // returns false in Lowering-ppc64.cpp, so MIR with a constant rhs
++ // to I64x2Mul is never created on this backend.
++ //
++ // The previous in-place implementation was broken in three ways:
++ // hard-coded VR0/VR1 staging assumed an ordering that didn't match
++ // the surrounding code; a dead `mfvsrd(a, f0)` clobbered `a`
++ // immediately before the next mfvsrd; and the trailing
++ // `xxpermdi(dest, scratch, dest, 0)` with DM=0 placed lane-0 in the
++ // wrong half. Rather than ship dead-but-broken code, crash loudly
++ // if reachability ever changes — the future enabler must write a
++ // correct lowering (e.g. via masm.mulInt64x2 with explicit temps).
++ MOZ_CRASH("PPC64: I64x2Mul with constant rhs unimplemented "
++ "(specializeForConstantRhs returns false)");
++ // Compare
++ case wasm::SimdOp::I8x16Eq:
++ masm.compareInt8x16(Assembler::Equal, lhs, scratch, dest);
++ break;
++ case wasm::SimdOp::I8x16Ne:
++ masm.compareInt8x16(Assembler::NotEqual, lhs, scratch, dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD binary-with-constant op");
++ }
++}
++void CodeGenerator::visitWasmVariableShiftSimd128(
++ LWasmVariableShiftSimd128* ins) {
++ FloatRegister lhs = ToFloatRegister(ins->lhs());
++ Register rhs = ToRegister(ins->rhs());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16Shl:
++ masm.leftShiftInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16ShrS:
++ masm.rightShiftInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I8x16ShrU:
++ masm.unsignedRightShiftInt8x16(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8Shl:
++ masm.leftShiftInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8ShrS:
++ masm.rightShiftInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I16x8ShrU:
++ masm.unsignedRightShiftInt16x8(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4Shl:
++ masm.leftShiftInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4ShrS:
++ masm.rightShiftInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I32x4ShrU:
++ masm.unsignedRightShiftInt32x4(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2Shl:
++ masm.leftShiftInt64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2ShrS:
++ masm.rightShiftInt64x2(lhs, rhs, dest);
++ break;
++ case wasm::SimdOp::I64x2ShrU:
++ masm.unsignedRightShiftInt64x2(lhs, rhs, dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD variable shift op");
++ }
++}
++void CodeGenerator::visitWasmConstantShiftSimd128(
++ LWasmConstantShiftSimd128* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ int32_t shift = ins->shift();
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16Shl:
++ masm.leftShiftInt8x16(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I8x16ShrS:
++ masm.rightShiftInt8x16(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I8x16ShrU:
++ masm.unsignedRightShiftInt8x16(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I16x8Shl:
++ masm.leftShiftInt16x8(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I16x8ShrS:
++ masm.rightShiftInt16x8(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I16x8ShrU:
++ masm.unsignedRightShiftInt16x8(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I32x4Shl:
++ masm.leftShiftInt32x4(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I32x4ShrS:
++ masm.rightShiftInt32x4(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I32x4ShrU:
++ masm.unsignedRightShiftInt32x4(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I64x2Shl:
++ masm.leftShiftInt64x2(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I64x2ShrS:
++ masm.rightShiftInt64x2(Imm32(shift), src, dest);
++ break;
++ case wasm::SimdOp::I64x2ShrU:
++ masm.unsignedRightShiftInt64x2(Imm32(shift), src, dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD constant shift op");
++ }
++}
++void CodeGenerator::visitWasmSignReplicationSimd128(
++ LWasmSignReplicationSimd128* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ // Sign replication = arithmetic right shift by max amount (all sign bits).
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16ShrS:
++ masm.rightShiftInt8x16(Imm32(7), src, dest);
++ break;
++ case wasm::SimdOp::I16x8ShrS:
++ masm.rightShiftInt16x8(Imm32(15), src, dest);
++ break;
++ case wasm::SimdOp::I32x4ShrS:
++ masm.rightShiftInt32x4(Imm32(31), src, dest);
++ break;
++ case wasm::SimdOp::I64x2ShrS:
++ masm.rightShiftInt64x2(Imm32(63), src, dest);
++ break;
++ default:
++ MOZ_CRASH("Unexpected sign replication op");
++ }
++}
++void CodeGenerator::visitWasmShuffleSimd128(LWasmShuffleSimd128* ins) {
++ FloatRegister lhs = ToFloatRegister(ins->lhs());
++ FloatRegister rhs = ToFloatRegister(ins->rhs());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ SimdConstant ctrl = ins->control();
++ const uint8_t* lanes = reinterpret_cast<const uint8_t*>(ctrl.bytes());
++ masm.shuffleInt8x16(lanes, lhs, rhs, dest);
++}
++void CodeGenerator::visitWasmPermuteSimd128(LWasmPermuteSimd128* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ // PPC64: the shuffle analysis transforms control bytes into specialized
++ // formats. Reconstruct raw Wasm byte indices for our vperm implementation.
++ SimdConstant ctrl = ins->control();
++ uint8_t rawLanes[16];
++ switch (ins->op()) {
++ case SimdPermuteOp::MOVE:
++ masm.moveSimd128(src, dest);
++ return;
++ case SimdPermuteOp::PERMUTE_32x4: {
++ const int32_t* words = reinterpret_cast<const int32_t*>(ctrl.bytes());
++ for (int i = 0; i < 4; i++)
++ for (int j = 0; j < 4; j++)
++ rawLanes[i * 4 + j] = words[i] * 4 + j;
++ break;
++ }
++ case SimdPermuteOp::PERMUTE_16x8: {
++ // control has int16 halfword indices. High byte of halfs[0] may have
++ // platform-specific flags (Perm16x8Action). Mask to get the index only.
++ const int16_t* halfs = reinterpret_cast<const int16_t*>(ctrl.bytes());
++ for (int i = 0; i < 8; i++) {
++ int hwIdx = halfs[i] & 0x7;
++ rawLanes[i * 2] = hwIdx * 2;
++ rawLanes[i * 2 + 1] = hwIdx * 2 + 1;
++ }
++ break;
++ }
++ case SimdPermuteOp::BROADCAST_8x16: {
++ uint8_t lane = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++ for (int i = 0; i < 16; i++) rawLanes[i] = lane;
++ break;
++ }
++ case SimdPermuteOp::BROADCAST_16x8: {
++ uint8_t lane = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++ for (int i = 0; i < 8; i++) {
++ rawLanes[i * 2] = lane * 2;
++ rawLanes[i * 2 + 1] = lane * 2 + 1;
++ }
++ break;
++ }
++ case SimdPermuteOp::ROTATE_RIGHT_8x16: {
++ uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++ for (int i = 0; i < 16; i++) rawLanes[i] = (i + shift) % 16;
++ break;
++ }
++ case SimdPermuteOp::SHIFT_LEFT_8x16: {
++ // Shifted-out positions must be zero. Use index 16+ to pick from zero.
++ uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++ for (int i = 0; i < 16; i++)
++ rawLanes[i] = (i >= shift) ? (i - shift) : (16 + i);
++ goto needsZeroRhs;
++ }
++ case SimdPermuteOp::SHIFT_RIGHT_8x16: {
++ uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
++ for (int i = 0; i < 16; i++)
++ rawLanes[i] = (i + shift < 16) ? (i + shift) : (16 + i);
++ goto needsZeroRhs;
++ }
++ case SimdPermuteOp::REVERSE_16x8: {
++ // Reverse bytes within each 16-bit lane: [1,0,3,2,5,4,...]
++ for (int i = 0; i < 8; i++) {
++ rawLanes[i * 2] = i * 2 + 1;
++ rawLanes[i * 2 + 1] = i * 2;
++ }
++ break;
++ }
++ case SimdPermuteOp::REVERSE_32x4: {
++ // Reverse bytes within each 32-bit lane: [3,2,1,0,7,6,5,4,...]
++ for (int i = 0; i < 4; i++)
++ for (int j = 0; j < 4; j++)
++ rawLanes[i * 4 + j] = i * 4 + (3 - j);
++ break;
++ }
++ case SimdPermuteOp::REVERSE_64x2: {
++ // Reverse bytes within each 64-bit lane: [7,6,5,4,3,2,1,0,15,...]
++ for (int i = 0; i < 2; i++)
++ for (int j = 0; j < 8; j++)
++ rawLanes[i * 8 + j] = i * 8 + (7 - j);
++ break;
++ }
++ case SimdPermuteOp::ZERO_EXTEND_8x16_TO_16x8:
++ case SimdPermuteOp::ZERO_EXTEND_8x16_TO_32x4:
++ case SimdPermuteOp::ZERO_EXTEND_8x16_TO_64x2:
++ case SimdPermuteOp::ZERO_EXTEND_16x8_TO_32x4:
++ case SimdPermuteOp::ZERO_EXTEND_16x8_TO_64x2:
++ case SimdPermuteOp::ZERO_EXTEND_32x4_TO_64x2: {
++ const int8_t* bytes = reinterpret_cast<const int8_t*>(ctrl.bytes());
++ for (int i = 0; i < 16; i++) rawLanes[i] = bytes[i];
++ goto needsZeroRhs;
++ }
++ default: {
++ // PERMUTE_8x16 and others: control has raw byte indices.
++ const int8_t* bytes = reinterpret_cast<const int8_t*>(ctrl.bytes());
++ for (int i = 0; i < 16; i++) rawLanes[i] = bytes[i];
++ break;
++ }
++ }
++ masm.shuffleInt8x16(rawLanes, src, src, dest);
++ return;
++
++ needsZeroRhs: {
++ // Wasm convention: rawLanes[i] in 0..15 selects src.LE_byte[idx], and
++ // rawLanes[i] >= 16 means "zero". Without spilling, we can't satisfy
++ // vperm's three-input constraint AND keep src alive when dest == src.
++ // Strategy: vperm src with itself (any valid byte for the "zero"
++ // positions, bytes get masked out below), then AND with a mask that
++ // zeros those positions.
++ int8_t ctrl[16], mask[16];
++ for (unsigned i = 0; i < 16; i++) {
++ uint8_t idx = rawLanes[i];
++ if (idx < 16) {
++ ctrl[i] = 15 - idx;
++ mask[i] = -1;
++ } else {
++ ctrl[i] = 0;
++ mask[i] = 0;
++ }
++ }
++ ScratchSimd128Scope scratch(masm);
++ masm.loadConstantSimd128(SimdConstant::CreateX16(ctrl), scratch);
++ masm.as_vperm(dest.encoding() & 31,
++ src.encoding() & 31,
++ src.encoding() & 31,
++ scratch.encoding() & 31);
++ masm.loadConstantSimd128(SimdConstant::CreateX16(mask), scratch);
++ masm.as_xxland(dest, dest, scratch);
++ return;
++ }
++}
++void CodeGenerator::visitWasmReplaceLaneSimd128(LWasmReplaceLaneSimd128* ins) {
++ FloatRegister lhsDest = ToFloatRegister(ins->output());
++ MOZ_ASSERT(ToFloatRegister(ins->lhs()) == lhsDest);
++ uint32_t lane = ins->mir()->laneIndex();
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16ReplaceLane:
++ masm.replaceLaneInt8x16(lane, ToRegister(ins->rhs()), lhsDest);
++ break;
++ case wasm::SimdOp::I16x8ReplaceLane:
++ masm.replaceLaneInt16x8(lane, ToRegister(ins->rhs()), lhsDest);
++ break;
++ case wasm::SimdOp::I32x4ReplaceLane:
++ masm.replaceLaneInt32x4(lane, ToRegister(ins->rhs()), lhsDest);
++ break;
++ case wasm::SimdOp::F32x4ReplaceLane:
++ masm.replaceLaneFloat32x4(lane, ToFloatRegister(ins->rhs()), lhsDest);
++ break;
++ case wasm::SimdOp::F64x2ReplaceLane:
++ masm.replaceLaneFloat64x2(lane, ToFloatRegister(ins->rhs()), lhsDest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD replace lane op");
++ }
++}
++void CodeGenerator::visitWasmReplaceInt64LaneSimd128(
++ LWasmReplaceInt64LaneSimd128* ins) {
++ MOZ_ASSERT(ins->mir()->simdOp() == wasm::SimdOp::I64x2ReplaceLane);
++ FloatRegister lhsDest = ToFloatRegister(ins->output());
++ MOZ_ASSERT(ToFloatRegister(ins->lhs()) == lhsDest);
++ masm.replaceLaneInt64x2(ins->mir()->laneIndex(),
++ ToRegister64(ins->rhs()), lhsDest);
++}
++void CodeGenerator::visitWasmScalarToSimd128(LWasmScalarToSimd128* ins) {
++ FloatRegister dest = ToFloatRegister(ins->output());
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16Splat:
++ masm.splatX16(ToRegister(ins->src()), dest);
++ break;
++ case wasm::SimdOp::I16x8Splat:
++ masm.splatX8(ToRegister(ins->src()), dest);
++ break;
++ case wasm::SimdOp::I32x4Splat:
++ masm.splatX4(ToRegister(ins->src()), dest);
++ break;
++ case wasm::SimdOp::F32x4Splat:
++ masm.splatX4(ToFloatRegister(ins->src()), dest);
++ break;
++ case wasm::SimdOp::F64x2Splat:
++ masm.splatX2(ToFloatRegister(ins->src()), dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD scalar-to-simd op");
++ }
++}
++void CodeGenerator::visitWasmInt64ToSimd128(LWasmInt64ToSimd128* ins) {
++ FloatRegister dest = ToFloatRegister(ins->output());
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I64x2Splat:
++ masm.splatX2(ToRegister64(ins->src()), dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD int64-to-simd op");
++ }
++}
++void CodeGenerator::visitWasmUnarySimd128(LWasmUnarySimd128* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16Neg:
++ masm.negInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I16x8Neg:
++ masm.negInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I32x4Neg:
++ masm.negInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::I64x2Neg:
++ masm.negInt64x2(src, dest);
++ break;
++ case wasm::SimdOp::I8x16Abs:
++ masm.absInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I16x8Abs:
++ masm.absInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I32x4Abs:
++ masm.absInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::I64x2Abs:
++ masm.absInt64x2(src, dest);
++ break;
++ case wasm::SimdOp::V128Not:
++ masm.bitwiseNotSimd128(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Neg:
++ masm.negFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Neg:
++ masm.negFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Abs:
++ masm.absFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Abs:
++ masm.absFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Sqrt:
++ masm.sqrtFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Sqrt:
++ masm.sqrtFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Ceil:
++ masm.ceilFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Ceil:
++ masm.ceilFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Floor:
++ masm.floorFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Floor:
++ masm.floorFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Trunc:
++ masm.truncFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Trunc:
++ masm.truncFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4Nearest:
++ masm.nearestFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2Nearest:
++ masm.nearestFloat64x2(src, dest);
++ break;
++ // Conversions
++ case wasm::SimdOp::F32x4ConvertI32x4S:
++ masm.convertInt32x4ToFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F32x4ConvertI32x4U:
++ masm.unsignedConvertInt32x4ToFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::I32x4TruncSatF32x4S:
++ masm.truncSatFloat32x4ToInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::I32x4TruncSatF32x4U:
++ masm.unsignedTruncSatFloat32x4ToInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2ConvertLowI32x4S:
++ masm.convertInt32x4ToFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F64x2ConvertLowI32x4U:
++ masm.unsignedConvertInt32x4ToFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::F32x4DemoteF64x2Zero:
++ masm.convertFloat64x2ToFloat32x4(src, dest);
++ break;
++ case wasm::SimdOp::F64x2PromoteLowF32x4:
++ masm.convertFloat32x4ToFloat64x2(src, dest);
++ break;
++ case wasm::SimdOp::I32x4TruncSatF64x2SZero:
++ masm.truncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I32x4TruncSatF64x2UZero:
++ masm.unsignedTruncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++ break;
++ // Widen
++ case wasm::SimdOp::I16x8ExtendLowI8x16S:
++ masm.widenLowInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtendHighI8x16S:
++ masm.widenHighInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtendLowI8x16U:
++ masm.unsignedWidenLowInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtendHighI8x16U:
++ masm.unsignedWidenHighInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtendLowI16x8S:
++ masm.widenLowInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtendHighI16x8S:
++ masm.widenHighInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtendLowI16x8U:
++ masm.unsignedWidenLowInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtendHighI16x8U:
++ masm.unsignedWidenHighInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtendLowI32x4S:
++ masm.widenLowInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtendHighI32x4S:
++ masm.widenHighInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtendLowI32x4U:
++ masm.unsignedWidenLowInt32x4(src, dest);
++ break;
++ case wasm::SimdOp::I64x2ExtendHighI32x4U:
++ masm.unsignedWidenHighInt32x4(src, dest);
++ break;
++ // Extended add pairwise
++ case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
++ masm.extAddPairwiseInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U:
++ masm.unsignedExtAddPairwiseInt8x16(src, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S:
++ masm.extAddPairwiseInt16x8(src, dest);
++ break;
++ case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U:
++ masm.unsignedExtAddPairwiseInt16x8(src, dest);
++ break;
++ // Relaxed truncation
++ case wasm::SimdOp::I32x4RelaxedTruncF32x4S:
++ masm.truncFloat32x4ToInt32x4Relaxed(src, dest);
++ break;
++ case wasm::SimdOp::I32x4RelaxedTruncF32x4U:
++ masm.unsignedTruncFloat32x4ToInt32x4Relaxed(src, dest);
++ break;
++ case wasm::SimdOp::I32x4RelaxedTruncF64x2SZero:
++ masm.truncFloat64x2ToInt32x4Relaxed(src, dest);
++ break;
++ case wasm::SimdOp::I32x4RelaxedTruncF64x2UZero:
++ masm.unsignedTruncFloat64x2ToInt32x4Relaxed(src, dest);
++ break;
++ // Popcnt
++ case wasm::SimdOp::I8x16Popcnt:
++ masm.popcntInt8x16(src, dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD unary op");
++ }
++}
++void CodeGenerator::visitWasmReduceSimd128(LWasmReduceSimd128* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ uint32_t imm = ins->mir()->imm();
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I8x16ExtractLaneS:
++ masm.extractLaneInt8x16(imm, src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I8x16ExtractLaneU:
++ masm.unsignedExtractLaneInt8x16(imm, src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I16x8ExtractLaneS:
++ masm.extractLaneInt16x8(imm, src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I16x8ExtractLaneU:
++ masm.unsignedExtractLaneInt16x8(imm, src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I32x4ExtractLane:
++ masm.extractLaneInt32x4(imm, src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::F32x4ExtractLane:
++ masm.extractLaneFloat32x4(imm, src, ToFloatRegister(ins->output()));
++ break;
++ case wasm::SimdOp::F64x2ExtractLane:
++ masm.extractLaneFloat64x2(imm, src, ToFloatRegister(ins->output()));
++ break;
++ case wasm::SimdOp::V128AnyTrue:
++ masm.anyTrueSimd128(src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I8x16AllTrue:
++ masm.allTrueInt8x16(src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I16x8AllTrue:
++ masm.allTrueInt16x8(src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I32x4AllTrue:
++ masm.allTrueInt32x4(src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I64x2AllTrue:
++ masm.allTrueInt64x2(src, ToRegister(ins->output()));
++ break;
++ case wasm::SimdOp::I8x16Bitmask:
++ masm.bitmaskInt8x16(src, ToRegister(ins->output()), ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I16x8Bitmask:
++ masm.bitmaskInt16x8(src, ToRegister(ins->output()), ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I32x4Bitmask:
++ masm.bitmaskInt32x4(src, ToRegister(ins->output()), ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I64x2Bitmask:
++ masm.bitmaskInt64x2(src, ToRegister(ins->output()), ScratchSimd128Reg);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD reduce op");
++ }
++}
++void CodeGenerator::visitWasmReduceAndBranchSimd128(
++ LWasmReduceAndBranchSimd128* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ UseScratchRegisterScope temps(masm);
++ Register tmp = temps.Acquire();
++ switch (ins->simdOp()) {
++ case wasm::SimdOp::V128AnyTrue:
++ masm.anyTrueSimd128(src, tmp);
++ break;
++ case wasm::SimdOp::I8x16AllTrue:
++ masm.allTrueInt8x16(src, tmp);
++ break;
++ case wasm::SimdOp::I16x8AllTrue:
++ masm.allTrueInt16x8(src, tmp);
++ break;
++ case wasm::SimdOp::I32x4AllTrue:
++ masm.allTrueInt32x4(src, tmp);
++ break;
++ case wasm::SimdOp::I64x2AllTrue:
++ masm.allTrueInt64x2(src, tmp);
++ break;
++ case wasm::SimdOp::I8x16Bitmask:
++ masm.bitmaskInt8x16(src, tmp, ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I16x8Bitmask:
++ masm.bitmaskInt16x8(src, tmp, ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I32x4Bitmask:
++ masm.bitmaskInt32x4(src, tmp, ScratchSimd128Reg);
++ break;
++ case wasm::SimdOp::I64x2Bitmask:
++ masm.bitmaskInt64x2(src, tmp, ScratchSimd128Reg);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD reduce-and-branch op");
++ }
++ masm.as_cmpdi(tmp, 0);
++ // Branch to ifTrue if nonzero, fall through to ifFalse.
++ Label* ifTrue = skipTrivialBlocks(ins->ifTrue())->lir()->label();
++ Label* ifFalse = skipTrivialBlocks(ins->ifFalse())->lir()->label();
++ masm.ma_b(Assembler::NotEqual, ifTrue);
++ masm.jump(ifFalse);
++}
++void CodeGenerator::visitWasmReduceSimd128ToInt64(
++ LWasmReduceSimd128ToInt64* ins) {
++ FloatRegister src = ToFloatRegister(ins->src());
++ Register64 dest = ToOutRegister64(ins);
++ switch (ins->mir()->simdOp()) {
++ case wasm::SimdOp::I64x2ExtractLane:
++ masm.extractLaneInt64x2(ins->mir()->imm(), src, dest);
++ break;
++ default:
++ MOZ_CRASH("PPC64: NYI SIMD reduce-to-int64 op");
++ }
++}
++static inline wasm::MemoryAccessDesc DeriveMemoryAccessDesc(
++ const wasm::MemoryAccessDesc& access, Scalar::Type type) {
++ return wasm::MemoryAccessDesc(access.memoryIndex(), type, access.align(),
++ access.offset32(), access.trapDesc(),
++ access.isHugeMemory());
++}
++
++void CodeGenerator::visitWasmLoadLaneSimd128(LWasmLoadLaneSimd128* ins) {
++ const MWasmLoadLaneSimd128* mir = ins->mir();
++ Register memoryBase = ToRegister(ins->memoryBase());
++ Register ptr = ToRegister(ins->ptr());
++ FloatRegister src = ToFloatRegister(ins->src());
++ FloatRegister dest = ToFloatRegister(ins->output());
++ UseScratchRegisterScope temps(masm);
++ Register tmp = temps.Acquire();
++ masm.moveSimd128(src, dest);
++ switch (mir->laneSize()) {
++ case 1:
++ masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int8),
++ memoryBase, ptr, ptr, AnyRegister(tmp));
++ masm.replaceLaneInt8x16(mir->laneIndex(), tmp, dest);
++ break;
++ case 2:
++ masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int16),
++ memoryBase, ptr, ptr, AnyRegister(tmp));
++ masm.replaceLaneInt16x8(mir->laneIndex(), tmp, dest);
++ break;
++ case 4:
++ masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int32),
++ memoryBase, ptr, ptr, AnyRegister(tmp));
++ masm.replaceLaneInt32x4(mir->laneIndex(), tmp, dest);
++ break;
++ case 8: {
++ masm.wasmLoadI64(DeriveMemoryAccessDesc(mir->access(), Scalar::Int64),
++ memoryBase, ptr, ptr,
++ Register64(tmp));
++ masm.replaceLaneInt64x2(mir->laneIndex(), Register64(tmp), dest);
++ break;
++ }
++ default:
++ MOZ_CRASH("Unexpected lane size");
++ }
++}
++void CodeGenerator::visitWasmStoreLaneSimd128(LWasmStoreLaneSimd128* ins) {
++ const MWasmStoreLaneSimd128* mir = ins->mir();
++ Register memoryBase = ToRegister(ins->memoryBase());
++ Register ptr = ToRegister(ins->ptr());
++ FloatRegister src = ToFloatRegister(ins->src());
++ UseScratchRegisterScope temps(masm);
++ Register tmp = temps.Acquire();
++ switch (mir->laneSize()) {
++ case 1:
++ masm.unsignedExtractLaneInt8x16(mir->laneIndex(), src, tmp);
++ masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int8),
++ AnyRegister(tmp), memoryBase, ptr, ptr);
++ break;
++ case 2:
++ masm.unsignedExtractLaneInt16x8(mir->laneIndex(), src, tmp);
++ masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int16),
++ AnyRegister(tmp), memoryBase, ptr, ptr);
++ break;
++ case 4:
++ masm.extractLaneInt32x4(mir->laneIndex(), src, tmp);
++ masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int32),
++ AnyRegister(tmp), memoryBase, ptr, ptr);
++ break;
++ case 8:
++ masm.extractLaneInt64x2(mir->laneIndex(), src, Register64(tmp));
++ masm.wasmStoreI64(DeriveMemoryAccessDesc(mir->access(), Scalar::Int64),
++ Register64(tmp), memoryBase, ptr, ptr);
++ break;
++ default:
++ MOZ_CRASH("Unexpected lane size");
++ }
++}
++
++} // namespace jit
++} // namespace js
+diff --git a/js/src/jit/ppc64/CodeGenerator-ppc64.h b/js/src/jit/ppc64/CodeGenerator-ppc64.h
+new file mode 100644
+index 000000000000..3414eceb5ac4
+--- /dev/null
++++ b/js/src/jit/ppc64/CodeGenerator-ppc64.h
+@@ -0,0 +1,101 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_CodeGenerator_ppc64_h
++#define jit_ppc64_CodeGenerator_ppc64_h
++
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "jit/shared/CodeGenerator-shared.h"
++
++namespace js {
++namespace jit {
++
++class CodeGeneratorPPC64;
++class OutOfLineTableSwitch;
++
++using OutOfLineWasmTruncateCheck =
++ OutOfLineWasmTruncateCheckBase<CodeGeneratorPPC64>;
++
++class CodeGeneratorPPC64 : public CodeGeneratorShared {
++ friend class MoveResolverPPC64;
++
++ protected:
++ CodeGeneratorPPC64(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm,
++ const wasm::CodeMetadata* codeMeta);
++
++ NonAssertingLabel deoptLabel_;
++
++ Operand ToOperand(const LAllocation& a);
++ Operand ToOperand(const LAllocation* a);
++ MoveOperand toMoveOperand(LAllocation a) const;
++
++ template <typename T1, typename T2>
++ void bailoutCmp32(Assembler::Condition c, T1 lhs, T2 rhs,
++ LSnapshot* snapshot) {
++ Label bail;
++ masm.branch32(c, lhs, rhs, &bail);
++ bailoutFrom(&bail, snapshot);
++ }
++ template <typename T1, typename T2>
++ void bailoutCmpPtr(Assembler::Condition c, T1 lhs, T2 rhs,
++ LSnapshot* snapshot) {
++ Label bail;
++ masm.branchPtr(c, lhs, rhs, &bail);
++ bailoutFrom(&bail, snapshot);
++ }
++ template <typename T1, typename T2>
++ void bailoutTest32(Assembler::Condition c, T1 lhs, T2 rhs,
++ LSnapshot* snapshot) {
++ Label bail;
++ masm.branchTest32(c, lhs, rhs, &bail);
++ bailoutFrom(&bail, snapshot);
++ }
++ void bailoutIfFalseBool(Register lhs, LSnapshot* snapshot);
++ void bailoutFrom(Label* label, LSnapshot* snapshot);
++ void bailout(LSnapshot* snapshot);
++
++ protected:
++ bool generateOutOfLineCode();
++ void branchToBlock(MBasicBlock* block);
++
++ template <typename T>
++ void branchToBlock(Assembler::Condition cond, Register lhs, T rhs,
++ MBasicBlock* mir) {
++ Label* label = skipTrivialBlocks(mir)->lir()->label();
++ masm.branch32(cond, lhs, rhs, label);
++ }
++ void branchToBlock(Assembler::DoubleCondition cond, FloatRegister lhs,
++ FloatRegister rhs, MBasicBlock* mir);
++ void branchToBlock(Assembler::FloatFormat fmt,
++ Assembler::DoubleCondition cond, FloatRegister lhs,
++ FloatRegister rhs, MBasicBlock* mir);
++
++ void emitTableSwitchDispatch(MTableSwitch* mir, Register index,
++ Register base);
++
++ void emitBigIntPtrDiv(LBigIntPtrDiv* ins, Register dividend, Register divisor,
++ Register output);
++ void emitBigIntPtrMod(LBigIntPtrMod* ins, Register dividend, Register divisor,
++ Register output);
++
++ void generateInvalidateEpilogue();
++
++ template <typename T>
++ void emitWasmLoad(T* lir);
++ template <typename T>
++ void emitWasmStore(T* lir);
++
++ public:
++ void visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool);
++ void visitOutOfLineWasmTruncateCheck(OutOfLineWasmTruncateCheck* ool);
++};
++
++typedef CodeGeneratorPPC64 CodeGeneratorSpecific;
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_CodeGenerator_ppc64_h */
+diff --git a/js/src/jit/ppc64/LIR-ppc64.h b/js/src/jit/ppc64/LIR-ppc64.h
+new file mode 100644
+index 000000000000..686875056127
+--- /dev/null
++++ b/js/src/jit/ppc64/LIR-ppc64.h
+@@ -0,0 +1,135 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_LIR_ppc64_h
++#define jit_ppc64_LIR_ppc64_h
++
++namespace js {
++namespace jit {
++
++class LUnbox : public LInstructionHelper<1, BOX_PIECES, 0> {
++ public:
++ LIR_HEADER(Unbox);
++
++ explicit LUnbox(const LAllocation& input) : LInstructionHelper(classOpcode) {
++ setOperand(0, input);
++ }
++
++ static const size_t Input = 0;
++
++ LBoxAllocation input() const { return getBoxOperand(Input); }
++
++ MUnbox* mir() const { return mir_->toUnbox(); }
++ const char* extraName() const { return StringFromMIRType(mir()->type()); }
++};
++
++class LUDivOrMod : public LBinaryMath<0> {
++ public:
++ LIR_HEADER(UDivOrMod);
++
++ LUDivOrMod() : LBinaryMath(classOpcode) {}
++
++ MBinaryArithInstruction* mir() const {
++ MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++ return static_cast<MBinaryArithInstruction*>(mir_);
++ }
++
++ bool canBeDivideByZero() const {
++ if (mir_->isMod()) {
++ return mir_->toMod()->canBeDivideByZero();
++ }
++ return mir_->toDiv()->canBeDivideByZero();
++ }
++
++ bool trapOnError() const {
++ if (mir_->isMod()) {
++ return mir_->toMod()->trapOnError();
++ }
++ return mir_->toDiv()->trapOnError();
++ }
++
++ wasm::TrapSiteDesc trapSiteDesc() const {
++ MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++ if (mir_->isMod()) {
++ return mir_->toMod()->trapSiteDesc();
++ }
++ return mir_->toDiv()->trapSiteDesc();
++ }
++};
++
++class LDivOrModI64 : public LBinaryMath<0> {
++ public:
++ LIR_HEADER(DivOrModI64);
++
++ LDivOrModI64(const LAllocation& lhs, const LAllocation& rhs)
++ : LBinaryMath(classOpcode) {
++ setOperand(0, lhs);
++ setOperand(1, rhs);
++ }
++
++ MBinaryArithInstruction* mir() const {
++ MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++ return static_cast<MBinaryArithInstruction*>(mir_);
++ }
++
++ bool canBeDivideByZero() const {
++ if (mir_->isMod()) {
++ return mir_->toMod()->canBeDivideByZero();
++ }
++ return mir_->toDiv()->canBeDivideByZero();
++ }
++ bool canBeNegativeOverflow() const {
++ if (mir_->isMod()) {
++ return mir_->toMod()->canBeNegativeDividend();
++ }
++ return mir_->toDiv()->canBeNegativeOverflow();
++ }
++ wasm::TrapSiteDesc trapSiteDesc() const {
++ MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++ if (mir_->isMod()) {
++ return mir_->toMod()->trapSiteDesc();
++ }
++ return mir_->toDiv()->trapSiteDesc();
++ }
++};
++
++class LUDivOrModI64 : public LBinaryMath<0> {
++ public:
++ LIR_HEADER(UDivOrModI64);
++
++ LUDivOrModI64(const LAllocation& lhs, const LAllocation& rhs)
++ : LBinaryMath(classOpcode) {
++ setOperand(0, lhs);
++ setOperand(1, rhs);
++ }
++
++ const char* extraName() const {
++ return mir()->isTruncated() ? "Truncated" : nullptr;
++ }
++
++ MBinaryArithInstruction* mir() const {
++ MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++ return static_cast<MBinaryArithInstruction*>(mir_);
++ }
++ bool canBeDivideByZero() const {
++ if (mir_->isMod()) {
++ return mir_->toMod()->canBeDivideByZero();
++ }
++ return mir_->toDiv()->canBeDivideByZero();
++ }
++ wasm::TrapSiteDesc trapSiteDesc() const {
++ MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
++ if (mir_->isMod()) {
++ return mir_->toMod()->trapSiteDesc();
++ }
++ return mir_->toDiv()->trapSiteDesc();
++ }
++};
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_LIR_ppc64_h */
+diff --git a/js/src/jit/ppc64/Lowering-ppc64.cpp b/js/src/jit/ppc64/Lowering-ppc64.cpp
+new file mode 100644
+index 000000000000..be0ead19d273
+--- /dev/null
++++ b/js/src/jit/ppc64/Lowering-ppc64.cpp
+@@ -0,0 +1,1324 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Lowering-ppc64.h"
++
++#include "mozilla/MathAlgorithms.h"
++
++#include "jit/Lowering.h"
++#include "jit/MIR-wasm.h"
++#include "jit/MIR.h"
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "wasm/WasmFeatures.h" // for wasm::ReportSimdAnalysis
++
++#include "jit/shared/Lowering-shared-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++using mozilla::FloorLog2;
++
++namespace js {
++namespace jit {
++
++LTableSwitch* LIRGeneratorPPC64::newLTableSwitch(const LAllocation& in,
++ const LDefinition& inputCopy) {
++ return new (alloc()) LTableSwitch(in, inputCopy, temp());
++}
++
++LTableSwitchV* LIRGeneratorPPC64::newLTableSwitchV(const LBoxAllocation& in) {
++ return new (alloc()) LTableSwitchV(in, temp(), tempDouble(), temp());
++}
++
++void LIRGeneratorPPC64::lowerForShift(LInstructionHelper<1, 2, 0>* ins,
++ MDefinition* mir, MDefinition* lhs,
++ MDefinition* rhs) {
++ lowerForALU(ins, mir, lhs, rhs);
++}
++
++template <class LInstr>
++void LIRGeneratorPPC64::lowerForShiftInt64(LInstr* ins, MDefinition* mir,
++ MDefinition* lhs, MDefinition* rhs) {
++ if constexpr (std::is_same_v<LInstr, LShiftI64>) {
++ ins->setLhs(useInt64RegisterAtStart(lhs));
++ ins->setRhs(useRegisterOrConstantAtStart(rhs));
++ } else {
++ ins->setInput(useInt64RegisterAtStart(lhs));
++ ins->setCount(useRegisterOrConstantAtStart(rhs));
++ }
++ defineInt64(ins, mir);
++}
++
++template void LIRGeneratorPPC64::lowerForShiftInt64(LShiftI64* ins,
++ MDefinition* mir,
++ MDefinition* lhs,
++ MDefinition* rhs);
++template void LIRGeneratorPPC64::lowerForShiftInt64(LRotateI64* ins,
++ MDefinition* mir,
++ MDefinition* lhs,
++ MDefinition* rhs);
++
++void LIRGeneratorPPC64::lowerForALU(LInstructionHelper<1, 1, 0>* ins,
++ MDefinition* mir, MDefinition* input) {
++ ins->setOperand(0, useRegisterAtStart(input));
++ define(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForALU(LInstructionHelper<1, 2, 0>* ins,
++ MDefinition* mir, MDefinition* lhs,
++ MDefinition* rhs) {
++ ins->setOperand(0, useRegisterAtStart(lhs));
++ ins->setOperand(1, useRegisterOrConstantAtStart(rhs));
++ define(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForALUInt64(
++ LInstructionHelper<INT64_PIECES, INT64_PIECES, 0>* ins, MDefinition* mir,
++ MDefinition* input) {
++ ins->setInt64Operand(0, useInt64RegisterAtStart(input));
++ defineInt64(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForALUInt64(
++ LInstructionHelper<INT64_PIECES, 2 * INT64_PIECES, 0>* ins,
++ MDefinition* mir, MDefinition* lhs, MDefinition* rhs) {
++ ins->setInt64Operand(0, useInt64RegisterAtStart(lhs));
++ ins->setInt64Operand(INT64_PIECES, useInt64RegisterOrConstantAtStart(rhs));
++ defineInt64(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForMulInt64(LMulI64* ins, MMul* mir,
++ MDefinition* lhs, MDefinition* rhs) {
++ lowerForALUInt64(ins, mir, lhs, rhs);
++}
++
++void LIRGeneratorPPC64::lowerForFPU(LInstructionHelper<1, 1, 0>* ins,
++ MDefinition* mir, MDefinition* input) {
++ ins->setOperand(0, useRegisterAtStart(input));
++ define(ins, mir);
++}
++
++void LIRGeneratorPPC64::lowerForFPU(LInstructionHelper<1, 2, 0>* ins,
++ MDefinition* mir, MDefinition* lhs,
++ MDefinition* rhs) {
++ ins->setOperand(0, useRegisterAtStart(lhs));
++ ins->setOperand(1, useRegisterAtStart(rhs));
++ define(ins, mir);
++}
++
++LBoxAllocation LIRGeneratorPPC64::useBoxFixed(MDefinition* mir, Register reg1,
++ Register reg2, bool useAtStart) {
++ MOZ_ASSERT(mir->type() == MIRType::Value);
++
++ ensureDefined(mir);
++ return LBoxAllocation(LUse(reg1, mir->virtualRegister(), useAtStart));
++}
++
++LAllocation LIRGeneratorPPC64::useByteOpRegister(MDefinition* mir) {
++ return useRegister(mir);
++}
++
++LAllocation LIRGeneratorPPC64::useByteOpRegisterAtStart(MDefinition* mir) {
++ return useRegisterAtStart(mir);
++}
++
++LAllocation LIRGeneratorPPC64::useByteOpRegisterOrNonDoubleConstant(
++ MDefinition* mir) {
++ return useRegisterOrNonDoubleConstant(mir);
++}
++
++LDefinition LIRGeneratorPPC64::tempByteOpRegister() { return temp(); }
++
++LDefinition LIRGeneratorPPC64::tempToUnbox() { return temp(); }
++
++void LIRGeneratorPPC64::lowerUntypedPhiInput(MPhi* phi, uint32_t inputPosition,
++ LBlock* block, size_t lirIndex) {
++ lowerTypedPhiInput(phi, inputPosition, block, lirIndex);
++}
++
++void LIRGeneratorPPC64::lowerInt64PhiInput(MPhi* phi, uint32_t inputPosition,
++ LBlock* block, size_t lirIndex) {
++ lowerTypedPhiInput(phi, inputPosition, block, lirIndex);
++}
++
++void LIRGeneratorPPC64::defineInt64Phi(MPhi* phi, size_t lirIndex) {
++ defineTypedPhi(phi, lirIndex);
++}
++
++void LIRGeneratorPPC64::lowerMulI(MMul* mul, MDefinition* lhs,
++ MDefinition* rhs) {
++ LMulI* lir = new (alloc()) LMulI;
++ if (mul->fallible()) {
++ assignSnapshot(lir, mul->bailoutKind());
++ }
++ if (mul->canBeNegativeZero() && !rhs->isConstant()) {
++ lir->setOperand(0, useRegister(lhs));
++ lir->setOperand(1, useRegister(rhs));
++ define(lir, mul);
++ return;
++ }
++ lowerForALU(lir, mul, lhs, rhs);
++}
++
++void LIRGeneratorPPC64::lowerDivI(MDiv* div) {
++ if (div->rhs()->isConstant()) {
++ int32_t rhs = div->rhs()->toConstant()->toInt32();
++ int32_t shift = FloorLog2(uint32_t(rhs));
++ if (rhs > 0 && 1 << shift == rhs) {
++ LDivPowTwoI* lir =
++ new (alloc()) LDivPowTwoI(useRegister(div->lhs()), shift);
++ if (div->fallible()) {
++ assignSnapshot(lir, div->bailoutKind());
++ }
++ define(lir, div);
++ return;
++ }
++ }
++ LDivI* lir = new (alloc())
++ LDivI(useRegister(div->lhs()), useRegister(div->rhs()), temp());
++ if (div->fallible()) {
++ assignSnapshot(lir, div->bailoutKind());
++ }
++ define(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerDivI64(MDiv* div) {
++ auto* lir = new (alloc())
++ LDivOrModI64(useRegister(div->lhs()), useRegister(div->rhs()));
++ defineInt64(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerModI(MMod* mod) {
++ if (mod->rhs()->isConstant()) {
++ int32_t rhs = mod->rhs()->toConstant()->toInt32();
++ int32_t shift = FloorLog2(uint32_t(rhs));
++ if (rhs > 0 && 1 << shift == rhs) {
++ LModPowTwoI* lir =
++ new (alloc()) LModPowTwoI(useRegister(mod->lhs()), shift);
++ if (mod->fallible()) {
++ assignSnapshot(lir, mod->bailoutKind());
++ }
++ define(lir, mod);
++ return;
++ } else if (shift < 31 && (1 << (shift + 1)) - 1 == rhs) {
++ LModMaskI* lir = new (alloc())
++ LModMaskI(useRegister(mod->lhs()), temp(), temp(), shift + 1);
++ if (mod->fallible()) {
++ assignSnapshot(lir, mod->bailoutKind());
++ }
++ define(lir, mod);
++ return;
++ }
++ }
++ auto* lir =
++ new (alloc()) LModI(useRegister(mod->lhs()), useRegister(mod->rhs()));
++ if (mod->fallible()) {
++ assignSnapshot(lir, mod->bailoutKind());
++ }
++ define(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerModI64(MMod* mod) {
++ auto* lir = new (alloc())
++ LDivOrModI64(useRegister(mod->lhs()), useRegister(mod->rhs()));
++ defineInt64(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerUDiv(MDiv* div) {
++ MDefinition* lhs = div->getOperand(0);
++ MDefinition* rhs = div->getOperand(1);
++ LUDivOrMod* lir = new (alloc()) LUDivOrMod;
++ // useRegisterAtStart: CodeGenerator-ppc64's visitUDivOrMod zero-extends
++ // lhs/rhs into their own slots in place before the 32-bit divwu, so the
++ // inputs must not be required live after the LIR op begins.
++ lir->setOperand(0, useRegisterAtStart(lhs));
++ lir->setOperand(1, useRegisterAtStart(rhs));
++ if (div->fallible()) {
++ assignSnapshot(lir, div->bailoutKind());
++ }
++ define(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerUDivI64(MDiv* div) {
++ auto* lir = new (alloc())
++ LUDivOrModI64(useRegister(div->lhs()), useRegister(div->rhs()));
++ defineInt64(lir, div);
++}
++
++void LIRGeneratorPPC64::lowerUMod(MMod* mod) {
++ MDefinition* lhs = mod->getOperand(0);
++ MDefinition* rhs = mod->getOperand(1);
++ LUDivOrMod* lir = new (alloc()) LUDivOrMod;
++ // See lowerUDiv above for why useRegisterAtStart is required here.
++ lir->setOperand(0, useRegisterAtStart(lhs));
++ lir->setOperand(1, useRegisterAtStart(rhs));
++ if (mod->fallible()) {
++ assignSnapshot(lir, mod->bailoutKind());
++ }
++ define(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerUModI64(MMod* mod) {
++ auto* lir = new (alloc())
++ LUDivOrModI64(useRegister(mod->lhs()), useRegister(mod->rhs()));
++ defineInt64(lir, mod);
++}
++
++void LIRGeneratorPPC64::lowerUrshD(MUrsh* mir) {
++ MDefinition* lhs = mir->lhs();
++ MDefinition* rhs = mir->rhs();
++ MOZ_ASSERT(lhs->type() == MIRType::Int32);
++ MOZ_ASSERT(rhs->type() == MIRType::Int32);
++ auto* lir = new (alloc()) LUrshD(useRegisterAtStart(lhs),
++ useRegisterOrConstantAtStart(rhs), temp());
++ define(lir, mir);
++}
++
++void LIRGeneratorPPC64::lowerPowOfTwoI(MPow* mir) {
++ int32_t base = mir->input()->toConstant()->toInt32();
++ MDefinition* power = mir->power();
++ auto* lir = new (alloc()) LPowOfTwoI(useRegister(power), base);
++ assignSnapshot(lir, mir->bailoutKind());
++ define(lir, mir);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrDiv(MBigIntPtrDiv* ins) {
++ auto* lir = new (alloc())
++ LBigIntPtrDiv(useRegister(ins->lhs()), useRegister(ins->rhs()),
++ LDefinition::BogusTemp(), LDefinition::BogusTemp());
++ assignSnapshot(lir, ins->bailoutKind());
++ define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrMod(MBigIntPtrMod* ins) {
++ auto* lir = new (alloc())
++ LBigIntPtrMod(useRegister(ins->lhs()), useRegister(ins->rhs()), temp(),
++ LDefinition::BogusTemp());
++ if (ins->canBeDivideByZero()) {
++ assignSnapshot(lir, ins->bailoutKind());
++ }
++ define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrLsh(MBigIntPtrLsh* ins) {
++ auto* lir = new (alloc()) LBigIntPtrLsh(
++ useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), temp());
++ assignSnapshot(lir, ins->bailoutKind());
++ define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerBigIntPtrRsh(MBigIntPtrRsh* ins) {
++ auto* lir = new (alloc()) LBigIntPtrRsh(
++ useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), temp());
++ assignSnapshot(lir, ins->bailoutKind());
++ define(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerTruncateDToInt32(MTruncateToInt32* ins) {
++ MDefinition* opd = ins->input();
++ MOZ_ASSERT(opd->type() == MIRType::Double);
++ define(new (alloc()) LTruncateDToInt32(useRegister(opd), tempDouble()), ins);
++}
++
++void LIRGeneratorPPC64::lowerTruncateFToInt32(MTruncateToInt32* ins) {
++ MDefinition* opd = ins->input();
++ MOZ_ASSERT(opd->type() == MIRType::Float32);
++ define(new (alloc()) LTruncateFToInt32(useRegister(opd), tempFloat32()), ins);
++}
++
++void LIRGeneratorPPC64::lowerBuiltinInt64ToFloatingPoint(
++ MBuiltinInt64ToFloatingPoint* ins) {
++ MOZ_CRASH("We don't use it for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerWasmSelectI(MWasmSelect* select) {
++ auto* lir = new (alloc())
++ LWasmSelect(useRegisterAtStart(select->trueExpr()),
++ useAny(select->falseExpr()), useRegister(select->condExpr()));
++ defineReuseInput(lir, select, LWasmSelect::TrueExprIndex);
++}
++
++void LIRGeneratorPPC64::lowerWasmSelectI64(MWasmSelect* select) {
++ auto* lir = new (alloc()) LWasmSelectI64(
++ useInt64RegisterAtStart(select->trueExpr()),
++ useInt64(select->falseExpr()), useRegister(select->condExpr()));
++ defineInt64ReuseInput(lir, select, LWasmSelectI64::TrueExprIndex);
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinTruncateToInt32(
++ MWasmBuiltinTruncateToInt32* ins) {
++ MDefinition* opd = ins->input();
++ MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
++
++ if (opd->type() == MIRType::Double) {
++ define(new (alloc()) LWasmBuiltinTruncateDToInt32(
++ useRegister(opd), useFixed(ins->instance(), InstanceReg),
++ LDefinition::BogusTemp()),
++ ins);
++ return;
++ }
++
++ define(new (alloc()) LWasmBuiltinTruncateFToInt32(
++ useRegister(opd), useFixed(ins->instance(), InstanceReg),
++ LDefinition::BogusTemp()),
++ ins);
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinTruncateToInt64(
++ MWasmBuiltinTruncateToInt64* ins) {
++ MOZ_CRASH("We don't use it for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinDivI64(MWasmBuiltinDivI64* div) {
++ MOZ_CRASH("We don't use runtime div for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerWasmBuiltinModI64(MWasmBuiltinModI64* mod) {
++ MOZ_CRASH("We don't use runtime mod for this architecture");
++}
++
++void LIRGeneratorPPC64::lowerAtomicLoad64(MLoadUnboxedScalar* ins) {
++ const LUse elements = useRegister(ins->elements());
++ const LAllocation index =
++ useRegisterOrIndexConstant(ins->index(), ins->storageType());
++ auto* lir = new (alloc()) LAtomicLoad64(elements, index);
++ defineInt64(lir, ins);
++}
++
++void LIRGeneratorPPC64::lowerAtomicStore64(MStoreUnboxedScalar* ins) {
++ LUse elements = useRegister(ins->elements());
++ LAllocation index =
++ useRegisterOrIndexConstant(ins->index(), ins->writeType());
++ LInt64Allocation value = useInt64Register(ins->value());
++ add(new (alloc()) LAtomicStore64(elements, index, value), ins);
++}
++
++// ===============================================================
++// LIRGenerator::visit* implementations
++
++void LIRGenerator::visitBox(MBox* box) {
++ MDefinition* opd = box->getOperand(0);
++
++ if (opd->isConstant() && box->canEmitAtUses()) {
++ emitAtUses(box);
++ return;
++ }
++
++ if (opd->isConstant()) {
++ define(new (alloc()) LValue(opd->toConstant()->toJSValue()), box,
++ LDefinition(LDefinition::BOX));
++ } else {
++ LBox* ins = new (alloc()) LBox(useRegisterAtStart(opd), opd->type());
++ define(ins, box, LDefinition(LDefinition::BOX));
++ }
++}
++
++void LIRGenerator::visitUnbox(MUnbox* unbox) {
++ MDefinition* box = unbox->getOperand(0);
++ MOZ_ASSERT(box->type() == MIRType::Value);
++
++ LInstructionHelper<1, BOX_PIECES, 0>* lir;
++ if (IsFloatingPointType(unbox->type())) {
++ MOZ_ASSERT(unbox->type() == MIRType::Double);
++ lir = new (alloc()) LUnboxFloatingPoint(useBoxAtStart(box));
++ } else if (unbox->fallible()) {
++ lir = new (alloc()) LUnbox(useRegisterAtStart(box));
++ } else {
++ lir = new (alloc()) LUnbox(useAtStart(box));
++ }
++
++ if (unbox->fallible()) {
++ assignSnapshot(lir, unbox->bailoutKind());
++ }
++
++ define(lir, unbox);
++}
++
++void LIRGenerator::visitCopySign(MCopySign* ins) {
++ MDefinition* lhs = ins->lhs();
++ MDefinition* rhs = ins->rhs();
++
++ MOZ_ASSERT(IsFloatingPointType(lhs->type()));
++ MOZ_ASSERT(lhs->type() == rhs->type());
++ MOZ_ASSERT(lhs->type() == ins->type());
++
++ LInstructionHelper<1, 2, 0>* lir;
++ if (lhs->type() == MIRType::Double) {
++ lir = new (alloc()) LCopySignD();
++ } else {
++ lir = new (alloc()) LCopySignF();
++ }
++
++ lowerForFPU(lir, ins, lhs, rhs);
++}
++
++void LIRGenerator::visitExtendInt32ToInt64(MExtendInt32ToInt64* ins) {
++ defineInt64(
++ new (alloc()) LExtendInt32ToInt64(useRegisterAtStart(ins->input())), ins);
++}
++
++void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
++ defineInt64(new (alloc())
++ LSignExtendInt64(useInt64RegisterAtStart(ins->input())),
++ ins);
++}
++
++void LIRGenerator::visitInt64ToFloatingPoint(MInt64ToFloatingPoint* ins) {
++ MDefinition* opd = ins->input();
++ MOZ_ASSERT(opd->type() == MIRType::Int64);
++ MOZ_ASSERT(IsFloatingPointType(ins->type()));
++ define(new (alloc()) LInt64ToFloatingPoint(useInt64Register(opd)), ins);
++}
++
++void LIRGenerator::visitSubstr(MSubstr* ins) {
++ LSubstr* lir = new (alloc())
++ LSubstr(useRegister(ins->string()), useRegister(ins->begin()),
++ useRegister(ins->length()), temp(), temp(), temp());
++ define(lir, ins);
++ assignSafepoint(lir, ins);
++}
++
++void LIRGenerator::visitReturnImpl(MDefinition* opd, bool isGenerator) {
++ MOZ_ASSERT(opd->type() == MIRType::Value);
++ LReturn* ins = new (alloc()) LReturn(isGenerator);
++ ins->setOperand(0, useFixed(opd, JSReturnReg));
++ add(ins);
++}
++void LIRGenerator::visitCompareExchangeTypedArrayElement(
++ MCompareExchangeTypedArrayElement* ins) {
++ MOZ_ASSERT(!Scalar::isFloatingType(ins->arrayType()));
++ MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
++ MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
++
++ const LUse elements = useRegister(ins->elements());
++ const LAllocation index =
++ useRegisterOrIndexConstant(ins->index(), ins->arrayType());
++
++ if (Scalar::isBigIntType(ins->arrayType())) {
++ LInt64Allocation oldval = useInt64Register(ins->oldval());
++ LInt64Allocation newval = useInt64Register(ins->newval());
++
++ auto* lir = new (alloc())
++ LCompareExchangeTypedArrayElement64(elements, index, oldval, newval);
++ defineInt64(lir, ins);
++ return;
++ }
++
++ const LAllocation oldval = useRegister(ins->oldval());
++ const LAllocation newval = useRegister(ins->newval());
++
++ LDefinition outTemp = LDefinition::BogusTemp();
++ LDefinition valueTemp = LDefinition::BogusTemp();
++ LDefinition offsetTemp = LDefinition::BogusTemp();
++ LDefinition maskTemp = LDefinition::BogusTemp();
++
++ if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
++ outTemp = temp();
++ }
++
++ if (Scalar::byteSize(ins->arrayType()) < 4) {
++ // PPC64 sub-word CAS uses lbarx/lharx + stbcx./sthcx. (POWER7+); only
++ // valueTemp is needed, to hold the extsb/extsh-canonicalised oldval
++ // for the 32-bit cmpw. offsetTemp/maskTemp are unused (no round-down
++ // + bit-isolate dance), and remain BogusTemp.
++ valueTemp = temp();
++ }
++
++ LCompareExchangeTypedArrayElement* lir = new (alloc())
++ LCompareExchangeTypedArrayElement(elements, index, oldval, newval,
++ outTemp, valueTemp, offsetTemp,
++ maskTemp);
++
++ define(lir, ins);
++}
++
++void LIRGenerator::visitAtomicExchangeTypedArrayElement(
++ MAtomicExchangeTypedArrayElement* ins) {
++ MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
++ MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
++
++ const LUse elements = useRegister(ins->elements());
++ const LAllocation index =
++ useRegisterOrIndexConstant(ins->index(), ins->arrayType());
++
++ if (Scalar::isBigIntType(ins->arrayType())) {
++ LInt64Allocation value = useInt64Register(ins->value());
++
++ auto* lir = new (alloc())
++ LAtomicExchangeTypedArrayElement64(elements, index, value);
++ defineInt64(lir, ins);
++ return;
++ }
++
++ MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32);
++
++ const LAllocation value = useRegister(ins->value());
++
++ LDefinition outTemp = LDefinition::BogusTemp();
++ LDefinition valueTemp = LDefinition::BogusTemp();
++ LDefinition offsetTemp = LDefinition::BogusTemp();
++ LDefinition maskTemp = LDefinition::BogusTemp();
++
++ if (ins->arrayType() == Scalar::Uint32) {
++ MOZ_ASSERT(ins->type() == MIRType::Double);
++ outTemp = temp();
++ }
++
++ // PPC64 sub-word atomic exchange uses lbarx/lharx + stbcx./sthcx. directly
++ // (POWER7+); valueTemp/offsetTemp/maskTemp are never read by the
++ // implementation (see MacroAssembler-ppc64.cpp's AtomicExchange template).
++ // Leave them as BogusTemp.
++
++ LAtomicExchangeTypedArrayElement* lir =
++ new (alloc()) LAtomicExchangeTypedArrayElement(
++ elements, index, value, outTemp, valueTemp, offsetTemp, maskTemp);
++
++ define(lir, ins);
++}
++
++void LIRGenerator::visitAtomicTypedArrayElementBinop(
++ MAtomicTypedArrayElementBinop* ins) {
++ MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped);
++ MOZ_ASSERT(!Scalar::isFloatingType(ins->arrayType()));
++ MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
++ MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
++
++ const LUse elements = useRegister(ins->elements());
++ const LAllocation index =
++ useRegisterOrIndexConstant(ins->index(), ins->arrayType());
++
++ if (Scalar::isBigIntType(ins->arrayType())) {
++ LInt64Allocation value = useInt64Register(ins->value());
++ LInt64Definition temp = tempInt64();
++
++ if (ins->isForEffect()) {
++ auto* lir = new (alloc()) LAtomicTypedArrayElementBinopForEffect64(
++ elements, index, value, temp);
++ add(lir, ins);
++ return;
++ }
++
++ auto* lir = new (alloc())
++ LAtomicTypedArrayElementBinop64(elements, index, value, temp);
++ defineInt64(lir, ins);
++ return;
++ }
++
++ LAllocation value = useRegister(ins->value());
++ LDefinition valueTemp = LDefinition::BogusTemp();
++ LDefinition offsetTemp = LDefinition::BogusTemp();
++ LDefinition maskTemp = LDefinition::BogusTemp();
++
++ // PPC64 sub-word atomic-binop uses lbarx/lharx + stbcx./sthcx. (POWER7+).
++ // The fetch-op variant needs valueTemp to hold the post-op value being
++ // condition-stored (MacroAssembler-ppc64.cpp's AtomicFetchOp); the
++ // for-effect variant uses an internal scratch and needs no temps at
++ // all. offsetTemp/maskTemp are unused in either path.
++ if (Scalar::byteSize(ins->arrayType()) < 4 && !ins->isForEffect()) {
++ valueTemp = temp();
++ }
++
++ if (ins->isForEffect()) {
++ LAtomicTypedArrayElementBinopForEffect* lir =
++ new (alloc()) LAtomicTypedArrayElementBinopForEffect(
++ elements, index, value, valueTemp, offsetTemp, maskTemp);
++ add(lir, ins);
++ return;
++ }
++
++ LDefinition outTemp = LDefinition::BogusTemp();
++
++ if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
++ outTemp = temp();
++ }
++
++ LAtomicTypedArrayElementBinop* lir =
++ new (alloc()) LAtomicTypedArrayElementBinop(
++ elements, index, value, outTemp, valueTemp, offsetTemp, maskTemp);
++ define(lir, ins);
++}
++void LIRGenerator::visitAsmJSLoadHeap(MAsmJSLoadHeap* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32);
++
++ MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
++ MOZ_ASSERT_IF(ins->needsBoundsCheck(),
++ boundsCheckLimit->type() == MIRType::Int32);
++
++ LAllocation baseAlloc = useRegisterAtStart(base);
++
++ LAllocation limitAlloc = ins->needsBoundsCheck()
++ ? useRegisterAtStart(boundsCheckLimit)
++ : LAllocation();
++
++ MOZ_ASSERT(!ins->hasMemoryBase());
++ auto* lir =
++ new (alloc()) LAsmJSLoadHeap(baseAlloc, limitAlloc, LAllocation());
++ define(lir, ins);
++}
++void LIRGenerator::visitAsmJSStoreHeap(MAsmJSStoreHeap* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32);
++
++ MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
++ MOZ_ASSERT_IF(ins->needsBoundsCheck(),
++ boundsCheckLimit->type() == MIRType::Int32);
++
++ LAllocation baseAlloc = useRegisterAtStart(base);
++
++ LAllocation limitAlloc = ins->needsBoundsCheck()
++ ? useRegisterAtStart(boundsCheckLimit)
++ : LAllocation();
++
++ MOZ_ASSERT(!ins->hasMemoryBase());
++ add(new (alloc()) LAsmJSStoreHeap(baseAlloc, useRegisterAtStart(ins->value()),
++ limitAlloc, LAllocation()),
++ ins);
++}
++void LIRGenerator::visitWasmLoad(MWasmLoad* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++
++ LAllocation memoryBase =
++ ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++
++ LAllocation ptr = useRegisterAtStart(base);
++
++ LDefinition ptrCopy = LDefinition::BogusTemp();
++ if (ins->access().offset32()) {
++ ptrCopy = tempCopy(base, 0);
++ }
++
++ if (ins->type() == MIRType::Int64) {
++ auto* lir = new (alloc()) LWasmLoadI64(ptr, memoryBase, ptrCopy);
++ defineInt64(lir, ins);
++ return;
++ }
++
++ auto* lir = new (alloc()) LWasmLoad(ptr, memoryBase, ptrCopy);
++ define(lir, ins);
++}
++void LIRGenerator::visitWasmStore(MWasmStore* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++
++ MDefinition* value = ins->value();
++ LAllocation memoryBase =
++ ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++
++ LAllocation baseAlloc = useRegisterAtStart(base);
++
++ LDefinition ptrCopy = LDefinition::BogusTemp();
++ if (ins->access().offset32()) {
++ ptrCopy = tempCopy(base, 0);
++ }
++
++ if (ins->access().type() == Scalar::Int64) {
++ LInt64Allocation valueAlloc = useInt64RegisterAtStart(value);
++ auto* lir =
++ new (alloc()) LWasmStoreI64(baseAlloc, valueAlloc, memoryBase, ptrCopy);
++ add(lir, ins);
++ return;
++ }
++
++ LAllocation valueAlloc = useRegisterAtStart(value);
++ auto* lir =
++ new (alloc()) LWasmStore(baseAlloc, valueAlloc, memoryBase, ptrCopy);
++ add(lir, ins);
++}
++void LIRGenerator::visitWasmTruncateToInt64(MWasmTruncateToInt64* ins) {
++ MDefinition* opd = ins->input();
++ MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
++
++ defineInt64(new (alloc()) LWasmTruncateToInt64(useRegister(opd)), ins);
++}
++void LIRGenerator::visitWasmUnsignedToDouble(MWasmUnsignedToDouble* ins) {
++ MOZ_ASSERT(ins->input()->type() == MIRType::Int32);
++ LWasmUint32ToDouble* lir =
++ new (alloc()) LWasmUint32ToDouble(useRegisterAtStart(ins->input()));
++ define(lir, ins);
++}
++void LIRGenerator::visitWasmUnsignedToFloat32(MWasmUnsignedToFloat32* ins) {
++ MOZ_ASSERT(ins->input()->type() == MIRType::Int32);
++ LWasmUint32ToFloat32* lir =
++ new (alloc()) LWasmUint32ToFloat32(useRegisterAtStart(ins->input()));
++ define(lir, ins);
++}
++void LIRGenerator::visitWasmCompareExchangeHeap(MWasmCompareExchangeHeap* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++ LAllocation memoryBase = ins->hasMemoryBase()
++ ? LAllocation(useRegister(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++
++ if (ins->access().type() == Scalar::Int64) {
++ auto* lir = new (alloc()) LWasmCompareExchangeI64(
++ useRegister(base), useInt64Register(ins->oldValue()),
++ useInt64Register(ins->newValue()), memoryBase);
++ defineInt64(lir, ins);
++ return;
++ }
++
++ LDefinition valueTemp = LDefinition::BogusTemp();
++ LDefinition offsetTemp = LDefinition::BogusTemp();
++ LDefinition maskTemp = LDefinition::BogusTemp();
++
++ // PPC64 sub-word wasm CAS uses lbarx/lharx + stbcx./sthcx. (POWER7+);
++ // valueTemp holds the extsb/extsh-canonicalised oldval for cmpw, while
++ // offsetTemp/maskTemp are unused (no round-down + bit-isolate dance).
++ if (ins->access().byteSize() < 4) {
++ valueTemp = temp();
++ }
++
++ auto* lir = new (alloc())
++ LWasmCompareExchangeHeap(useRegister(base), useRegister(ins->oldValue()),
++ useRegister(ins->newValue()), memoryBase,
++ valueTemp, offsetTemp, maskTemp);
++
++ define(lir, ins);
++}
++void LIRGenerator::visitWasmAtomicExchangeHeap(MWasmAtomicExchangeHeap* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++ LAllocation memoryBase = ins->hasMemoryBase()
++ ? LAllocation(useRegister(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++
++ if (ins->access().type() == Scalar::Int64) {
++ auto* lir = new (alloc()) LWasmAtomicExchangeI64(
++ useRegister(base), useInt64Register(ins->value()), memoryBase);
++ defineInt64(lir, ins);
++ return;
++ }
++
++ // PPC64 sub-word wasm atomic exchange uses lbarx/lharx + stbcx./sthcx.
++ // (POWER7+); valueTemp/offsetTemp/maskTemp are never read by the
++ // implementation (see MacroAssembler-ppc64.cpp's AtomicExchange template).
++ // Pass BogusTemp for all three.
++ LDefinition valueTemp = LDefinition::BogusTemp();
++ LDefinition offsetTemp = LDefinition::BogusTemp();
++ LDefinition maskTemp = LDefinition::BogusTemp();
++
++ auto* lir = new (alloc())
++ LWasmAtomicExchangeHeap(useRegister(base), useRegister(ins->value()),
++ memoryBase, valueTemp, offsetTemp, maskTemp);
++ define(lir, ins);
++}
++void LIRGenerator::visitWasmAtomicBinopHeap(MWasmAtomicBinopHeap* ins) {
++ MDefinition* base = ins->base();
++ MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
++ LAllocation memoryBase = ins->hasMemoryBase()
++ ? LAllocation(useRegister(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++
++ if (ins->access().type() == Scalar::Int64) {
++ auto* lir = new (alloc())
++ LWasmAtomicBinopI64(useRegister(base), useInt64Register(ins->value()),
++ memoryBase, tempInt64());
++ defineInt64(lir, ins);
++ return;
++ }
++
++ LDefinition valueTemp = LDefinition::BogusTemp();
++ LDefinition offsetTemp = LDefinition::BogusTemp();
++ LDefinition maskTemp = LDefinition::BogusTemp();
++
++ // PPC64 sub-word wasm atomic-binop uses lbarx/lharx + stbcx./sthcx.
++ // (POWER7+). The fetch-op variant needs valueTemp for the post-op value
++ // being condition-stored; the for-effect variant uses an internal
++ // scratch and needs no temps at all. offsetTemp/maskTemp are unused
++ // in either path.
++ if (ins->access().byteSize() < 4 && ins->hasUses()) {
++ valueTemp = temp();
++ }
++
++ if (!ins->hasUses()) {
++ LWasmAtomicBinopHeapForEffect* lir = new (alloc())
++ LWasmAtomicBinopHeapForEffect(useRegister(base),
++ useRegister(ins->value()), memoryBase,
++ valueTemp, offsetTemp, maskTemp);
++ add(lir, ins);
++ return;
++ }
++
++ auto* lir = new (alloc())
++ LWasmAtomicBinopHeap(useRegister(base), useRegister(ins->value()),
++ memoryBase, valueTemp, offsetTemp, maskTemp);
++
++ define(lir, ins);
++}
++
++// SIMD lowering
++void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ // useRegister for v0/v1 and useRegisterAtStart only for v2 — matches
++ // ARM64's V128Bitselect policy. defineReuseInput requires the reused
++ // input to be useRegisterAtStart and the others to remain alive
++ // (useRegister); reusing all three policies as useRegisterAtStart
++ // trips the allocator's "*def->output() != alloc" assertion because
++ // v0/v1 may then share the slot with the output.
++ LDefinition temp0 = LDefinition::BogusTemp();
++ if (ins->simdOp() == wasm::SimdOp::I32x4RelaxedDotI8x16I7x16AddS) {
++ temp0 = tempSimd128();
++ }
++ auto* lir = new (alloc()) LWasmTernarySimd128(
++ useRegister(ins->v0()), useRegister(ins->v1()),
++ useRegisterAtStart(ins->v2()), temp0,
++ ins->simdOp());
++ // The PPC64 visitor (CodeGenerator-ppc64.cpp:visitWasmTernarySimd128)
++ // emits the FMA / DOT_THEN_ADD chain with v2 as the implicit
++ // accumulator. defineReuseInput tells the allocator to put `dest`
++ // in v2's slot, eliminating the previous conditional moveSimd128.
++ defineReuseInput(lir, ins, LWasmTernarySimd128::V2Index);
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ LDefinition temp0 = LDefinition::BogusTemp();
++ LDefinition temp1 = LDefinition::BogusTemp();
++ // mulInt64x2 (i64x2.mul) routes through GPRs (mfvsrd/mulld/mtvsrd) and
++ // uses an internal ScratchSimd128 + GPR scratches; its FloatRegister
++ // temp1/temp2 parameters are inherited from the shared ARM64+PPC64
++ // signature but unused on PPC64. Only FP min/max need SIMD temps for
++ // the wasm NaN-canonicalisation dance.
++ if (ins->simdOp() == wasm::SimdOp::F32x4Min ||
++ ins->simdOp() == wasm::SimdOp::F32x4Max ||
++ ins->simdOp() == wasm::SimdOp::F64x2Min ||
++ ins->simdOp() == wasm::SimdOp::F64x2Max) {
++ temp0 = tempSimd128();
++ temp1 = tempSimd128();
++ }
++ auto* lir = new (alloc()) LWasmBinarySimd128(
++ useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs()),
++ temp0, temp1, ins->simdOp());
++ define(lir, ins);
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmBinarySimd128WithConstant(
++ MWasmBinarySimd128WithConstant* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ auto* lir = new (alloc()) LWasmBinarySimd128WithConstant(
++ useRegisterAtStart(ins->lhs()), LDefinition::BogusTemp(), ins->rhs());
++ define(lir, ins);
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ MOZ_ASSERT(ins->rhs()->type() == MIRType::Int32);
++
++ if (ins->rhs()->isConstant()) {
++ int32_t shiftCountMask;
++ switch (ins->simdOp()) {
++ case wasm::SimdOp::I8x16Shl:
++ case wasm::SimdOp::I8x16ShrU:
++ case wasm::SimdOp::I8x16ShrS:
++ shiftCountMask = 7;
++ break;
++ case wasm::SimdOp::I16x8Shl:
++ case wasm::SimdOp::I16x8ShrU:
++ case wasm::SimdOp::I16x8ShrS:
++ shiftCountMask = 15;
++ break;
++ case wasm::SimdOp::I32x4Shl:
++ case wasm::SimdOp::I32x4ShrU:
++ case wasm::SimdOp::I32x4ShrS:
++ shiftCountMask = 31;
++ break;
++ case wasm::SimdOp::I64x2Shl:
++ case wasm::SimdOp::I64x2ShrU:
++ case wasm::SimdOp::I64x2ShrS:
++ shiftCountMask = 63;
++ break;
++ default:
++ MOZ_CRASH("Unexpected shift operation");
++ }
++ int32_t shiftCount = ins->rhs()->toConstant()->toInt32() & shiftCountMask;
++#ifdef DEBUG
++ js::wasm::ReportSimdAnalysis("shift -> constant shift");
++#endif
++ auto* lir = new (alloc())
++ LWasmConstantShiftSimd128(useRegisterAtStart(ins->lhs()), shiftCount);
++ define(lir, ins);
++ } else {
++#ifdef DEBUG
++ js::wasm::ReportSimdAnalysis("shift -> variable shift");
++#endif
++ auto* lir = new (alloc()) LWasmVariableShiftSimd128(
++ useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs()));
++ define(lir, ins);
++ }
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++#ifdef ENABLE_WASM_SIMD
++// Helper: reconstruct raw Wasm byte lane indices from analyzed SimdShuffle.
++static SimdConstant ReconstructShuffleBytes(const SimdShuffle& s) {
++ int8_t bytes[16];
++ if (s.permuteOp) {
++ switch (*s.permuteOp) {
++ case SimdPermuteOp::MOVE:
++ for (int i = 0; i < 16; i++) bytes[i] = i;
++ return SimdConstant::CreateX16(bytes);
++ case SimdPermuteOp::PERMUTE_32x4: {
++ const int32_t* w = reinterpret_cast<const int32_t*>(s.control.bytes());
++ for (int i = 0; i < 4; i++)
++ for (int j = 0; j < 4; j++) bytes[i*4+j] = w[i]*4+j;
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::PERMUTE_16x8: {
++ const int16_t* h = reinterpret_cast<const int16_t*>(s.control.bytes());
++ for (int i = 0; i < 8; i++) {
++ int idx = h[i] & 0x7;
++ bytes[i*2] = idx*2;
++ bytes[i*2+1] = idx*2+1;
++ }
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::BROADCAST_8x16: {
++ int8_t lane = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++ for (int i = 0; i < 16; i++) bytes[i] = lane;
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::BROADCAST_16x8: {
++ int8_t lane = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++ for (int i = 0; i < 8; i++) {
++ bytes[i*2] = lane*2; bytes[i*2+1] = lane*2+1;
++ }
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::ROTATE_RIGHT_8x16: {
++ uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++ for (int i = 0; i < 16; i++) bytes[i] = (i + shift) % 16;
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::SHIFT_RIGHT_8x16: {
++ uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++ for (int i = 0; i < 16; i++) bytes[i] = (i+shift < 16) ? (i+shift) : 0;
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::SHIFT_LEFT_8x16: {
++ uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++ for (int i = 0; i < 16; i++) bytes[i] = (i >= shift) ? (i-shift) : 0;
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdPermuteOp::REVERSE_16x8:
++ // Reverse bytes within each 16-bit lane: [1,0,3,2,5,4,...]
++ for (int i = 0; i < 8; i++) {
++ bytes[i*2] = i*2+1; bytes[i*2+1] = i*2;
++ }
++ return SimdConstant::CreateX16(bytes);
++ case SimdPermuteOp::REVERSE_32x4:
++ // Reverse bytes within each 32-bit lane: [3,2,1,0,7,6,5,4,...]
++ for (int i = 0; i < 4; i++)
++ for (int j = 0; j < 4; j++) bytes[i*4+j] = i*4+(3-j);
++ return SimdConstant::CreateX16(bytes);
++ case SimdPermuteOp::REVERSE_64x2:
++ // Reverse bytes within each 64-bit lane: [7,6,5,4,3,2,1,0,15,...]
++ for (int i = 0; i < 2; i++)
++ for (int j = 0; j < 8; j++) bytes[i*8+j] = i*8+(7-j);
++ return SimdConstant::CreateX16(bytes);
++ default:
++ break;
++ }
++ }
++ // Handle SimdShuffleOp (two-operand patterns).
++ if (s.shuffleOp) {
++ switch (*s.shuffleOp) {
++ case SimdShuffleOp::CONCAT_RIGHT_SHIFT_8x16: {
++ // control[0] = suffix length. ARM64 uses 16-count as the EXT shift.
++ // Reconstruct raw byte indices: EXT(rhs, lhs, 16-count) =
++ // take (16-count) bytes from rhs end, then count bytes from lhs start.
++ uint8_t count = 16 - reinterpret_cast<const int8_t*>(s.control.bytes())[0];
++ for (int i = 0; i < 16; i++) {
++ int idx = i + count;
++ bytes[i] = (idx < 16) ? (idx + 16) : (idx - 16);
++ }
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdShuffleOp::BLEND_8x16: {
++ // control has 0 (lhs) or -1 (rhs) per byte.
++ const int8_t* mask = reinterpret_cast<const int8_t*>(s.control.bytes());
++ for (int i = 0; i < 16; i++)
++ bytes[i] = mask[i] ? (i + 16) : i;
++ return SimdConstant::CreateX16(bytes);
++ }
++ case SimdShuffleOp::BLEND_16x8: {
++ const int16_t* mask = reinterpret_cast<const int16_t*>(s.control.bytes());
++ for (int i = 0; i < 8; i++) {
++ int base = mask[i] ? (i * 2 + 16) : (i * 2);
++ bytes[i * 2] = base;
++ bytes[i * 2 + 1] = base + 1;
++ }
++ return SimdConstant::CreateX16(bytes);
++ }
++#define INTERLEAVE(name, width, low_start, count) \
++ case SimdShuffleOp::name: { \
++ for (int i = 0; i < count; i++) { \
++ int lhsIdx = low_start + i * width; \
++ int rhsIdx = lhsIdx + 16; \
++ for (int j = 0; j < width; j++) { \
++ bytes[(i * 2) * width + j] = lhsIdx + j; \
++ bytes[(i * 2 + 1) * width + j] = rhsIdx + j; \
++ } \
++ } \
++ return SimdConstant::CreateX16(bytes); \
++ }
++ INTERLEAVE(INTERLEAVE_LOW_8x16, 1, 0, 8)
++ INTERLEAVE(INTERLEAVE_HIGH_8x16, 1, 8, 8)
++ INTERLEAVE(INTERLEAVE_LOW_16x8, 2, 0, 4)
++ INTERLEAVE(INTERLEAVE_HIGH_16x8, 2, 8, 4)
++ INTERLEAVE(INTERLEAVE_LOW_32x4, 4, 0, 2)
++ INTERLEAVE(INTERLEAVE_HIGH_32x4, 4, 8, 2)
++ INTERLEAVE(INTERLEAVE_LOW_64x2, 8, 0, 1)
++ INTERLEAVE(INTERLEAVE_HIGH_64x2, 8, 8, 1)
++#undef INTERLEAVE
++ default:
++ break;
++ }
++ }
++ // PERMUTE_8x16, SHUFFLE_BLEND_8x16, etc: control should have raw byte indices.
++ // Force to Int8x16 type to avoid assertions from mismatched types.
++ if (s.control.type() == SimdConstant::Int8x16) {
++ return s.control;
++ }
++ // Fallback: re-create as Int8x16 from raw bytes.
++ memcpy(bytes, s.control.bytes(), 16);
++ return SimdConstant::CreateX16(bytes);
++}
++
++#endif // ENABLE_WASM_SIMD
++
++void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ SimdShuffle s = ins->shuffle();
++ switch (s.opd) {
++ case SimdShuffle::Operand::LEFT:
++ case SimdShuffle::Operand::RIGHT: {
++ // Single-operand permute: the analysis has identified that only one
++ // input matters (the other is zero or unused).
++ LAllocation src;
++ if (s.opd == SimdShuffle::Operand::LEFT) {
++ src = useRegisterAtStart(ins->lhs());
++ } else {
++ src = useRegisterAtStart(ins->rhs());
++ }
++ auto* lir =
++ new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control);
++ define(lir, ins);
++ break;
++ }
++ case SimdShuffle::Operand::BOTH:
++ case SimdShuffle::Operand::BOTH_SWAPPED: {
++ SimdConstant ctrl = ReconstructShuffleBytes(s);
++ LAllocation lhs, rhs;
++ if (s.opd == SimdShuffle::Operand::BOTH_SWAPPED) {
++ lhs = useRegisterAtStart(ins->rhs());
++ rhs = useRegisterAtStart(ins->lhs());
++ } else {
++ lhs = useRegisterAtStart(ins->lhs());
++ rhs = useRegisterAtStart(ins->rhs());
++ }
++ auto* lir = new (alloc()) LWasmShuffleSimd128(
++ lhs, rhs, *s.shuffleOp, ctrl);
++ define(lir, ins);
++ break;
++ }
++ }
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ if (ins->rhs()->type() == MIRType::Int64) {
++ auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128(
++ useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs()));
++ defineReuseInput(lir, ins, LWasmReplaceInt64LaneSimd128::LhsIndex);
++ } else {
++ auto* lir = new (alloc()) LWasmReplaceLaneSimd128(
++ useRegisterAtStart(ins->lhs()), useRegister(ins->rhs()));
++ defineReuseInput(lir, ins, LWasmReplaceLaneSimd128::LhsIndex);
++ }
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ if (ins->input()->type() == MIRType::Int64) {
++ auto* lir =
++ new (alloc()) LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input()));
++ define(lir, ins);
++ } else {
++ auto* lir =
++ new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
++ define(lir, ins);
++ }
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ MOZ_ASSERT(ins->type() == MIRType::Simd128);
++ auto* lir = new (alloc())
++ LWasmUnarySimd128(useRegisterAtStart(ins->input()),
++ LDefinition::BogusTemp());
++ define(lir, ins);
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++#ifdef ENABLE_WASM_SIMD
++bool LIRGeneratorPPC64::canFoldReduceSimd128AndBranch(wasm::SimdOp op) {
++ switch (op) {
++ case wasm::SimdOp::V128AnyTrue:
++ case wasm::SimdOp::I8x16AllTrue:
++ case wasm::SimdOp::I16x8AllTrue:
++ case wasm::SimdOp::I32x4AllTrue:
++ case wasm::SimdOp::I64x2AllTrue:
++ return true;
++ default:
++ return false;
++ }
++}
++
++bool LIRGeneratorPPC64::canEmitWasmReduceSimd128AtUses(
++ MWasmReduceSimd128* ins) {
++ if (!ins->canEmitAtUses()) {
++ return false;
++ }
++ if (ins->type() != MIRType::Int32) {
++ return false;
++ }
++ if (!canFoldReduceSimd128AndBranch(ins->simdOp())) {
++ return false;
++ }
++ MUseIterator iter(ins->usesBegin());
++ if (iter == ins->usesEnd()) {
++ return true;
++ }
++ MNode* node = iter->consumer();
++ if (!node->isDefinition() || !node->toDefinition()->isTest()) {
++ return false;
++ }
++ iter++;
++ return iter == ins->usesEnd();
++}
++#endif // ENABLE_WASM_SIMD
++
++void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ if (canEmitWasmReduceSimd128AtUses(ins)) {
++ emitAtUses(ins);
++ return;
++ }
++ if (ins->type() == MIRType::Int64) {
++ auto* lir = new (alloc())
++ LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input()));
++ defineInt64(lir, ins);
++ } else {
++ auto* lir =
++ new (alloc()) LWasmReduceSimd128(useRegisterAtStart(ins->input()));
++ define(lir, ins);
++ }
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmLoadLaneSimd128(MWasmLoadLaneSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ LUse base = useRegisterAtStart(ins->base());
++ LUse inputUse = useRegisterAtStart(ins->value());
++ LAllocation memoryBase =
++ ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++ auto* lir = new (alloc()) LWasmLoadLaneSimd128(base, inputUse, memoryBase);
++ define(lir, ins);
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++void LIRGenerator::visitWasmStoreLaneSimd128(MWasmStoreLaneSimd128* ins) {
++#ifdef ENABLE_WASM_SIMD
++ LUse base = useRegisterAtStart(ins->base());
++ LUse input = useRegisterAtStart(ins->value());
++ LAllocation memoryBase =
++ ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
++ : LGeneralReg(HeapReg);
++ auto* lir = new (alloc()) LWasmStoreLaneSimd128(base, input, memoryBase);
++ add(lir, ins);
++#else
++ MOZ_CRASH("No SIMD");
++#endif
++}
++
++// PPC64 specializes compare+select for {U,}Int32 / {U,}Int64 compare with
++// Int32 / Int64 result. The CodeGen visitor
++// (CodeGenerator-ppc64.cpp:visitWasmCompareAndSelect) emits
++// cmpw/cmplw/cmpd/cmpld + isel = 2 insns, replacing the ~5-7 insns the
++// generic path would emit (boolean materialization + test + isel). FP
++// specialization is not worthwhile — the generic FP select path already
++// runs faster than the specialized integer one and PPC64 lacks a true
++// fcsel equivalent (fsel only compares against zero).
++bool LIRGeneratorShared::canSpecializeWasmCompareAndSelect(
++ MCompare::CompareType compTy, MIRType insTy) {
++ const bool insOk = insTy == MIRType::Int32 || insTy == MIRType::Int64;
++ const bool cmpOk = compTy == MCompare::Compare_Int32 ||
++ compTy == MCompare::Compare_UInt32 ||
++ compTy == MCompare::Compare_Int64 ||
++ compTy == MCompare::Compare_UInt64;
++ return insOk && cmpOk;
++}
++
++void LIRGeneratorShared::lowerWasmCompareAndSelect(MWasmSelect* ins,
++ MDefinition* lhs,
++ MDefinition* rhs,
++ MCompare::CompareType compTy,
++ JSOp jsop) {
++ MOZ_ASSERT(canSpecializeWasmCompareAndSelect(compTy, ins->type()));
++ auto* lir = new (alloc()) LWasmCompareAndSelect(
++ useRegister(lhs), useRegister(rhs), useRegisterAtStart(ins->trueExpr()),
++ useRegister(ins->falseExpr()), compTy, jsop);
++ defineReuseInput(lir, ins, LWasmCompareAndSelect::IfTrueExprIndex);
++}
++
++// MIR helpers needed by the linker
++#ifdef ENABLE_WASM_SIMD
++bool MWasmTernarySimd128::specializeBitselectConstantMaskAsShuffle(
++ int8_t shuffle[16]) {
++ return false;
++}
++#endif
++
++bool MWasmBinarySimd128::specializeForConstantRhs() { return false; }
++
++#ifdef ENABLE_WASM_SIMD
++bool MWasmTernarySimd128::canRelaxBitselect() { return false; }
++#endif
++
++#ifdef ENABLE_WASM_SIMD
++bool MWasmBinarySimd128::canPmaddubsw() { return false; }
++#endif
++
++} // namespace jit
++} // namespace js
+diff --git a/js/src/jit/ppc64/Lowering-ppc64.h b/js/src/jit/ppc64/Lowering-ppc64.h
+new file mode 100644
+index 000000000000..9c3519a7bb23
+--- /dev/null
++++ b/js/src/jit/ppc64/Lowering-ppc64.h
+@@ -0,0 +1,105 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Lowering_ppc64_h
++#define jit_ppc64_Lowering_ppc64_h
++
++#include "jit/shared/Lowering-shared.h"
++
++namespace js {
++namespace jit {
++
++class LIRGeneratorPPC64 : public LIRGeneratorShared {
++ protected:
++ LIRGeneratorPPC64(MIRGenerator* gen, MIRGraph& graph, LIRGraph& lirGraph)
++ : LIRGeneratorShared(gen, graph, lirGraph) {}
++
++ LTableSwitch* newLTableSwitch(const LAllocation& in,
++ const LDefinition& inputCopy);
++ LTableSwitchV* newLTableSwitchV(const LBoxAllocation& in);
++
++ void lowerForShift(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
++ MDefinition* lhs, MDefinition* rhs);
++ template <class LInstr>
++ void lowerForShiftInt64(LInstr* ins, MDefinition* mir, MDefinition* lhs,
++ MDefinition* rhs);
++ void lowerForALU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
++ MDefinition* input);
++ void lowerForALU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
++ MDefinition* lhs, MDefinition* rhs);
++ void lowerForALUInt64(LInstructionHelper<INT64_PIECES, INT64_PIECES, 0>* ins,
++ MDefinition* mir, MDefinition* input);
++ void lowerForALUInt64(
++ LInstructionHelper<INT64_PIECES, 2 * INT64_PIECES, 0>* ins,
++ MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
++ void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
++ MDefinition* rhs);
++ void lowerForFPU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
++ MDefinition* input);
++ void lowerForFPU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
++ MDefinition* lhs, MDefinition* rhs);
++
++ template <size_t Temps>
++ void lowerForCompareI64(LInstructionHelper<1, 2 * INT64_PIECES, Temps>* lir,
++ MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
++
++ LBoxAllocation useBoxFixed(MDefinition* mir, Register reg1, Register reg2,
++ bool useAtStart = false);
++
++ LAllocation useByteOpRegister(MDefinition* mir);
++ LAllocation useByteOpRegisterAtStart(MDefinition* mir);
++ LAllocation useByteOpRegisterOrNonDoubleConstant(MDefinition* mir);
++ LDefinition tempByteOpRegister();
++
++ LDefinition tempToUnbox();
++
++ bool needTempForPostBarrier() { return true; }
++
++ void lowerUntypedPhiInput(MPhi* phi, uint32_t inputPosition, LBlock* block,
++ size_t lirIndex);
++ void lowerInt64PhiInput(MPhi* phi, uint32_t inputPosition, LBlock* block,
++ size_t lirIndex);
++ void defineInt64Phi(MPhi* phi, size_t lirIndex);
++
++ void lowerMulI(MMul* mul, MDefinition* lhs, MDefinition* rhs);
++ void lowerDivI(MDiv* div);
++ void lowerDivI64(MDiv* div);
++ void lowerModI(MMod* mod);
++ void lowerModI64(MMod* mod);
++ void lowerUDiv(MDiv* div);
++ void lowerUDivI64(MDiv* div);
++ void lowerUMod(MMod* mod);
++ void lowerUModI64(MMod* mod);
++ void lowerUrshD(MUrsh* mir);
++ void lowerPowOfTwoI(MPow* mir);
++ void lowerBigIntPtrDiv(MBigIntPtrDiv* ins);
++ void lowerBigIntPtrMod(MBigIntPtrMod* ins);
++ void lowerBigIntPtrLsh(MBigIntPtrLsh* ins);
++ void lowerBigIntPtrRsh(MBigIntPtrRsh* ins);
++ void lowerTruncateDToInt32(MTruncateToInt32* ins);
++ void lowerTruncateFToInt32(MTruncateToInt32* ins);
++ void lowerBuiltinInt64ToFloatingPoint(MBuiltinInt64ToFloatingPoint* ins);
++ void lowerWasmSelectI(MWasmSelect* select);
++ void lowerWasmSelectI64(MWasmSelect* select);
++ void lowerWasmBuiltinTruncateToInt64(MWasmBuiltinTruncateToInt64* ins);
++ void lowerWasmBuiltinTruncateToInt32(MWasmBuiltinTruncateToInt32* ins);
++ void lowerWasmBuiltinDivI64(MWasmBuiltinDivI64* div);
++ void lowerWasmBuiltinModI64(MWasmBuiltinModI64* mod);
++ void lowerAtomicLoad64(MLoadUnboxedScalar* ins);
++ void lowerAtomicStore64(MStoreUnboxedScalar* ins);
++
++#ifdef ENABLE_WASM_SIMD
++ bool canFoldReduceSimd128AndBranch(wasm::SimdOp op);
++ bool canEmitWasmReduceSimd128AtUses(MWasmReduceSimd128* ins);
++#endif
++};
++
++typedef LIRGeneratorPPC64 LIRGeneratorSpecific;
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_Lowering_ppc64_h */
+diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h b/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
+new file mode 100644
+index 000000000000..f82ca36b4e40
+--- /dev/null
++++ b/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
+@@ -0,0 +1,6142 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_MacroAssembler_ppc64_inl_h
++#define jit_ppc64_MacroAssembler_ppc64_inl_h
++
++#include "jit/ppc64/MacroAssembler-ppc64.h"
++
++namespace js {
++namespace jit {
++
++//{{{ check_macroassembler_style
++
++// ===============================================================
++// Move instructions
++
++void MacroAssembler::move64(Register64 src, Register64 dest) {
++ movePtr(src.reg, dest.reg);
++}
++
++void MacroAssembler::move64(Imm64 imm, Register64 dest) {
++ movePtr(ImmWord(imm.value), dest.reg);
++}
++
++void MacroAssembler::moveDoubleToGPR64(FloatRegister src, Register64 dest) {
++ as_mfvsrd(dest.reg, src);
++}
++
++void MacroAssembler::moveGPR64ToDouble(Register64 src, FloatRegister dest) {
++ as_mtvsrd(dest, src.reg);
++}
++
++void MacroAssembler::moveLowDoubleToGPR(FloatRegister src, Register dest) {
++ MOZ_CRASH("Not supported for this target");
++}
++
++void MacroAssembler::move64To32(Register64 src, Register dest) {
++ as_extsw(dest, src.reg);
++}
++
++void MacroAssembler::move32To64ZeroExtend(Register src, Register64 dest) {
++ // clrldi dest, src, 32 — clear upper 32 bits.
++ as_rldicl(dest.reg, src, 0, 32);
++}
++
++void MacroAssembler::move8To64SignExtend(Register src, Register64 dest) {
++ as_extsb(dest.reg, src);
++}
++
++void MacroAssembler::move16To64SignExtend(Register src, Register64 dest) {
++ as_extsh(dest.reg, src);
++}
++
++void MacroAssembler::move32To64SignExtend(Register src, Register64 dest) {
++ as_extsw(dest.reg, src);
++}
++
++void MacroAssembler::moveFloat32ToGPR(FloatRegister src, Register dest) {
++ // FPR holds double-format value (PPC convention). Convert to
++ // single-precision bits in bits 0:31 of the VSR, then extract.
++ as_xscvdpspn(ScratchDoubleReg, src);
++ as_mfvsrd(dest, ScratchDoubleReg);
++ x_srdi(dest, dest, 32);
++}
++
++void MacroAssembler::moveGPRToFloat32(Register src, FloatRegister dest) {
++ // Place raw single-precision bits in VSR bits 0:31, then convert
++ // to double-precision format (matching PPC's FPR convention, like lfs).
++ if (HasPOWER9()) {
++ // mtvsrws splats the 32-bit word to both halves of the VSR.
++ as_mtvsrws(dest, src);
++ } else {
++ // POWER8: shift GPR left 32 bits to place float bits in upper word,
++ // then move to VSR. xscvspdpn reads from bits 0:31.
++ UseScratchRegisterScope temps(*this);
++ Register tmp = temps.Acquire();
++ x_sldi(tmp, src, 32);
++ as_mtvsrd(dest, tmp);
++ }
++ as_xscvspdpn(dest, dest);
++}
++
++void MacroAssembler::moveFloat16ToGPR(FloatRegister src, Register dest) {
++ MOZ_ASSERT(HasPOWER9());
++ // src has FP16 in dw0 bits 48:63; rest of dw0 is 0 (per xscvdphp /
++ // lxsihzx / mtvsrwz contract). mfvsrd reads dw0 → dest = 0x...0000_HHHH.
++ // Mask defensively in case a future caller hands us a non-canonical FP16.
++ as_mfvsrd(dest, src);
++ as_rldicl(dest, dest, 0, 48); // clrldi 48: keep low 16 bits
++}
++
++void MacroAssembler::moveGPRToFloat16(Register src, FloatRegister dest) {
++ MOZ_ASSERT(HasPOWER9());
++ // mtvsrwz zeros dw0 word 0 and copies src's low 32 to dw0 word 1; mask
++ // src to its low 16 first so dw0 bits 32:47 stay zero (canonical FP16).
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ as_rldicl(scratch, src, 0, 48); // clrldi 48: keep only low 16
++ as_mtvsrwz(dest, scratch);
++}
++
++void MacroAssembler::move8ZeroExtend(Register src, Register dest) {
++ // rlwinm dest, src, 0, 24, 31 — mask to low 8 bits.
++ as_rlwinm(dest, src, 0, 24, 31);
++}
++
++void MacroAssembler::move8SignExtend(Register src, Register dest) {
++ as_extsb(dest, src);
++}
++
++void MacroAssembler::move16SignExtend(Register src, Register dest) {
++ as_extsh(dest, src);
++}
++
++void MacroAssembler::move8SignExtendToPtr(Register src, Register dest) {
++ as_extsb(dest, src);
++}
++
++void MacroAssembler::move16SignExtendToPtr(Register src, Register dest) {
++ as_extsh(dest, src);
++}
++
++void MacroAssembler::move32SignExtendToPtr(Register src, Register dest) {
++ as_extsw(dest, src);
++}
++
++void MacroAssembler::move32ZeroExtendToPtr(Register src, Register dest) {
++ as_rldicl(dest, src, 0, 32);
++}
++
++// ===============================================================
++// Load instructions
++
++void MacroAssembler::load32SignExtendToPtr(const Address& src, Register dest) {
++ load32(src, dest);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::loadAbiReturnAddress(Register dest) { xs_mflr(dest); }
++
++// ===============================================================
++// Logical instructions
++
++void MacroAssembler::not32(Register reg) {
++ x_not(reg, reg);
++ as_extsw(reg, reg);
++}
++
++void MacroAssembler::notPtr(Register reg) { x_not(reg, reg); }
++
++void MacroAssembler::andPtr(Register src, Register dest) {
++ as_and_(dest, dest, src);
++}
++
++// If `mask` is a non-zero, non-all-ones contiguous run of 1-bits in a
++// 32-bit value (LSB-numbering), set MB/ME to the BE bit positions
++// (PPC convention: bit 0 = MSB) needed by `rlwinm SH=0` and return true.
++// Otherwise return false. Run-time cost is at JIT emit time only.
++static inline bool IsContigMask32(uint32_t mask, unsigned& mb, unsigned& me) {
++ if (mask == 0 || mask == 0xFFFFFFFFu) return false;
++ unsigned tz = (unsigned)__builtin_ctz(mask);
++ uint32_t shifted = mask >> tz;
++ if ((shifted & (shifted + 1)) != 0) return false; // Has a 0 between 1s.
++ unsigned width = 32 - (unsigned)__builtin_clz(shifted);
++ // LSB bits set: [tz, tz+width-1]. BE bits: [32-tz-width, 31-tz].
++ mb = 32 - tz - width;
++ me = 31 - tz;
++ return true;
++}
++
++// 64-bit contiguous-mask classification for AND-with-imm via PPC's
++// rotate-and-mask family (SH=0). On success, sets `lsb` (LSB-numbering
++// of lowest set bit) and `width` (number of contiguous 1-bits).
++// Caller picks the encoding:
++// - lsb == 0: low `width` bits set → rldicl
++// (mb6=64-width)
++// - lsb + width == 64: high `width` bits set → rldicr
++// (me6=width-1)
++// - lsb + width <= 32: contig mask within low 32 → rlwinm (zeros high
++// 32)
++// - otherwise (mid-run mask straddling bit 32 with lsb>0): no SH=0 single
++// insn fits, return false to fall back to scratch+and.
++static inline bool IsContigMask64(uint64_t mask, unsigned& lsb,
++ unsigned& width) {
++ if (mask == 0 || mask == ~uint64_t(0)) return false;
++ unsigned tz = (unsigned)__builtin_ctzll(mask);
++ uint64_t shifted = mask >> tz;
++ if ((shifted & (shifted + 1)) != 0) return false; // Has a 0 between 1s.
++ width = 64 - (unsigned)__builtin_clzll(shifted);
++ lsb = tz;
++ return true;
++}
++
++void MacroAssembler::andPtr(Imm32 imm, Register dest) {
++ // andi. handles 16-bit unsigned immediates in 1 insn (sets CR0).
++ // For wider positive immediates, IsContigMask32 → rlwinm (1 insn,
++ // also sets CR0). NOTE: andPtr sign-extends Imm32 to 64-bit before
++ // ANDing, so contig-mask is only safe when the immediate is
++ // non-negative (high bit clear) — rlwinm always zeros the high 32.
++ uint32_t uimm = uint32_t(imm.value);
++ if (is_uintN(uimm, 16)) {
++ as_andi_rc(dest, dest, uimm);
++ return;
++ }
++ unsigned mb, me;
++ if (imm.value >= 0 && IsContigMask32(uimm, mb, me)) {
++ as_rlwinm_rc(dest, dest, 0, mb, me);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
++ as_and_(dest, dest, scratch);
++}
++
++void MacroAssembler::andPtr(Imm32 imm, Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ andPtr(imm, dest);
++}
++
++void MacroAssembler::and64(Imm64 imm, Register64 dest) {
++ uint64_t u = imm.value;
++ // 16-bit unsigned → andi. (1 insn).
++ if (u <= 0xFFFFu) {
++ as_andi_rc(dest.reg, dest.reg, uint16_t(u));
++ return;
++ }
++ unsigned lsb, width;
++ if (IsContigMask64(u, lsb, width)) {
++ if (lsb == 0) {
++ // low `width` bits set: rldicl SH=0 MB=64-width.
++ as_rldicl_rc(dest.reg, dest.reg, 0, 64 - width);
++ return;
++ }
++ if (lsb + width == 64) {
++ // high `width` bits set: rldicr SH=0 ME=width-1.
++ as_rldicr_rc(dest.reg, dest.reg, 0, width - 1);
++ return;
++ }
++ if (lsb + width <= 32) {
++ // contig mask within low 32: rlwinm SH=0 zeros bits 0..31 too.
++ // BE positions: mb = 32 - lsb - width, me = 31 - lsb.
++ as_rlwinm_rc(dest.reg, dest.reg, 0, 32 - lsb - width, 31 - lsb);
++ return;
++ }
++ // mid-run mask straddling bit 32 (lsb>0, lsb+width>32, lsb+width<64):
++ // not encodable as SH=0 mask. Fall through to scratch+and.
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(u), scratch);
++ as_and_(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::and64(Register64 src, Register64 dest) {
++ as_and_(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::and32(Register src, Register dest) {
++ as_and_(dest, dest, src);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::and32(Imm32 imm, Register dest) {
++ uint32_t uimm = uint32_t(imm.value);
++ if (is_uintN(uimm, 16)) {
++ as_andi_rc(dest, dest, uimm);
++ } else {
++ unsigned mb, me;
++ if (IsContigMask32(uimm, mb, me)) {
++ // rlwinm.SH=0 ANDs with the contiguous mask; record form sets CR0
++ // to match the side-effect of the andi. fast path above.
++ as_rlwinm_rc(dest, dest, 0, mb, me);
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ as_and_(dest, dest, scratch);
++ }
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::and32(Imm32 imm, Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ and32(imm, dest);
++}
++
++void MacroAssembler::and32(Imm32 imm, const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(dest, scratch);
++ and32(imm, scratch);
++ store32(scratch, dest);
++}
++
++void MacroAssembler::and32(const Address& src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(src, scratch);
++ as_and_(dest, dest, scratch);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::or64(Imm64 imm, Register64 dest) {
++ uint64_t u = imm.value;
++ // ori/oris zero-extend their immediates and don't touch other bits, so
++ // when imm fits in unsigned 32 (high 32 == 0) the pair handles it in
++ // 1-2 insns with no scratch.
++ if (u <= 0xFFFFFFFFu) {
++ uint16_t lo = uint16_t(u);
++ uint16_t hi = uint16_t(u >> 16);
++ if (hi == 0) {
++ as_ori(dest.reg, dest.reg, lo);
++ } else if (lo == 0) {
++ as_oris(dest.reg, dest.reg, hi);
++ } else {
++ as_ori(dest.reg, dest.reg, lo);
++ as_oris(dest.reg, dest.reg, hi);
++ }
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(u), scratch);
++ as_or_(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::or32(Register src, Register dest) {
++ as_or_(dest, dest, src);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::or32(Imm32 imm, Register dest) {
++ uint32_t uimm = uint32_t(imm.value);
++ uint16_t lo = uimm & 0xFFFF;
++ uint16_t hi = (uimm >> 16) & 0xFFFF;
++ if (hi == 0) {
++ as_ori(dest, dest, lo);
++ } else if (lo == 0) {
++ as_oris(dest, dest, hi);
++ } else {
++ // ori + oris pair handles arbitrary 32-bit unsigned imm in 2 insns
++ // without a scratch GPR. ori/oris are non-record forms (don't touch
++ // CR0), matching the behavior of the previous scratch+or_ path
++ // (or_ is the record form, but the value-only result is what callers
++ // observe through dest).
++ as_ori(dest, dest, lo);
++ as_oris(dest, dest, hi);
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::or32(Imm32 imm, Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ or32(imm, dest);
++}
++
++void MacroAssembler::or32(Imm32 imm, const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(dest, scratch);
++ or32(imm, scratch);
++ store32(scratch, dest);
++}
++
++void MacroAssembler::xor64(Imm64 imm, Register64 dest) {
++ uint64_t u = imm.value;
++ // xori/xoris zero-extend their immediates; for unsigned-32-fit values
++ // they replace the scratch+xor sequence with 1-2 insns.
++ if (u <= 0xFFFFFFFFu) {
++ uint16_t lo = uint16_t(u);
++ uint16_t hi = uint16_t(u >> 16);
++ if (hi == 0) {
++ as_xori(dest.reg, dest.reg, lo);
++ } else if (lo == 0) {
++ as_xoris(dest.reg, dest.reg, hi);
++ } else {
++ as_xori(dest.reg, dest.reg, lo);
++ as_xoris(dest.reg, dest.reg, hi);
++ }
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(u), scratch);
++ as_xor_(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::orPtr(Register src, Register dest) {
++ as_or_(dest, dest, src);
++}
++
++void MacroAssembler::orPtr(Imm32 imm, Register dest) {
++ uint32_t uimm = uint32_t(imm.value);
++ uint16_t lo = uimm & 0xFFFF;
++ uint16_t hi = (uimm >> 16) & 0xFFFF;
++ // ori/oris zero-extend their immediates, so for non-negative Imm32 (high
++ // 32 of sign-extended value = 0) we can use ori+oris to OR the full
++ // 32-bit pattern in 1-2 insns. Negative Imm32 sign-extends to set high
++ // bits 32..63 in the OR — those bits would be lost with ori+oris alone.
++ if (imm.value >= 0) {
++ if (hi == 0) {
++ as_ori(dest, dest, lo);
++ } else if (lo == 0) {
++ as_oris(dest, dest, hi);
++ } else {
++ as_ori(dest, dest, lo);
++ as_oris(dest, dest, hi);
++ }
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
++ as_or_(dest, dest, scratch);
++}
++
++void MacroAssembler::orPtr(Imm32 imm, Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ orPtr(imm, dest);
++}
++
++void MacroAssembler::or64(Register64 src, Register64 dest) {
++ as_or_(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::xor64(Register64 src, Register64 dest) {
++ as_xor_(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::xorPtr(Register src, Register dest) {
++ as_xor_(dest, dest, src);
++}
++
++void MacroAssembler::xorPtr(Imm32 imm, Register dest) {
++ uint32_t uimm = uint32_t(imm.value);
++ uint16_t lo = uimm & 0xFFFF;
++ uint16_t hi = (uimm >> 16) & 0xFFFF;
++ if (imm.value >= 0) {
++ if (hi == 0) {
++ as_xori(dest, dest, lo);
++ } else if (lo == 0) {
++ as_xoris(dest, dest, hi);
++ } else {
++ as_xori(dest, dest, lo);
++ as_xoris(dest, dest, hi);
++ }
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
++ as_xor_(dest, dest, scratch);
++}
++
++void MacroAssembler::xorPtr(Imm32 imm, Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ xorPtr(imm, dest);
++}
++
++void MacroAssembler::xor32(Register src, Register dest) {
++ as_xor_(dest, dest, src);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::xor32(Imm32 imm, Register dest) {
++ uint32_t uimm = uint32_t(imm.value);
++ uint16_t lo = uimm & 0xFFFF;
++ uint16_t hi = (uimm >> 16) & 0xFFFF;
++ if (hi == 0) {
++ as_xori(dest, dest, lo);
++ } else if (lo == 0) {
++ as_xoris(dest, dest, hi);
++ } else {
++ // xori + xoris pair — 2 insns, no scratch GPR.
++ as_xori(dest, dest, lo);
++ as_xoris(dest, dest, hi);
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::xor32(Imm32 imm, Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ xor32(imm, dest);
++}
++
++void MacroAssembler::xor32(Imm32 imm, const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(dest, scratch);
++ xor32(imm, scratch);
++ store32(scratch, dest);
++}
++
++void MacroAssembler::xor32(const Address& src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(src, scratch);
++ xor32(scratch, dest);
++}
++
++// ===============================================================
++// Swap instructions
++
++void MacroAssembler::byteSwap16SignExtend(Register reg) {
++ if (HasPOWER10()) {
++ // brh byte-reverses every halfword in reg; extsh keeps just the
++ // low halfword's byte-reversed value, sign-extended to 64 bits.
++ as_brh(reg, reg);
++ as_extsh(reg, reg);
++ return;
++ }
++ // POWER8/9: rotate-and-mask synthesis. Swap bytes in low halfword via
++ // (reg<<8)&0xFF00 | (reg>>8)&0xFF, then sign-extend.
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ as_rlwinm(scratch, reg, 8, 16, 23); // scratch = (reg<<8) & 0xFF00
++ as_rlwinm(reg, reg, 24, 24, 31); // reg = (reg>>8) & 0xFF
++ as_or_(reg, reg, scratch);
++ as_extsh(reg, reg);
++}
++
++void MacroAssembler::byteSwap16ZeroExtend(Register reg) {
++ if (HasPOWER10()) {
++ // brh byte-reverses every halfword; rldicl with sh=0,mb=48 zeroes
++ // the upper 48 bits — no CR0 side effect (vs andi.).
++ as_brh(reg, reg);
++ as_rldicl(reg, reg, 0, 48);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // Both rlwinm forms zero-extend the 64-bit destination per ISA v3.0B
++ // (mask M = MASK(MB+32, ME+32) is 0 above bit 31), so after the OR the
++ // upper 48 bits are already zero — no follow-up clearing needed.
++ as_rlwinm(scratch, reg, 8, 16, 23);
++ as_rlwinm(reg, reg, 24, 24, 31);
++ as_or_(reg, reg, scratch);
++}
++
++void MacroAssembler::byteSwap32(Register reg) {
++ if (HasPOWER10()) {
++ // brw byte-reverses both 32-bit halves; extsw drops the upper half
++ // and sign-extends the byte-reversed low word to 64 bits.
++ as_brw(reg, reg);
++ as_extsw(reg, reg);
++ return;
++ }
++ // POWER8/9: rotate-with-insert synthesis (4 insns).
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // scratch = rotate reg left 8, mask bytes 0,3
++ as_rlwinm(scratch, reg, 8, 0, 31); // rotl32 by 8
++ as_rlwimi(scratch, reg, 24, 0, 7); // insert byte 0
++ as_rlwimi(scratch, reg, 24, 16, 23); // insert byte 2
++ // Sign-extend to 64 bits (as 32-bit value).
++ as_extsw(reg, scratch);
++}
++
++void MacroAssembler::byteSwap64(Register64 reg64) {
++ if (HasPOWER10()) {
++ // 1 insn, no FPR round-trip.
++ as_brd(reg64.reg, reg64.reg);
++ } else if (HasPOWER9()) {
++ as_mtvsrd(ScratchDoubleReg, reg64.reg);
++ as_xxbrd(ScratchDoubleReg, ScratchDoubleReg);
++ as_mfvsrd(reg64.reg, ScratchDoubleReg);
++ } else {
++ // POWER8: byte-swap via stack using stwbrx (word byte-reverse store).
++ // stwbrx RS,RA,RB stores RS byte-reversed at RA+RB.
++ // For 64-bit swap: store high word reversed at addr+0, low word at addr+4.
++ Register r = reg64.reg;
++ UseScratchRegisterScope temps(*this);
++ Register tmp = temps.Acquire();
++ as_stdu(StackPointer, StackPointer, -16);
++ // Store low 32 bits byte-reversed at SP+12.
++ as_addi(tmp, StackPointer, 12);
++ as_stwbrx(r, r0, tmp); // r0 as RA = 0, so addr = tmp
++ // Store high 32 bits byte-reversed at SP+8.
++ x_srdi(r, r, 32);
++ as_addi(tmp, StackPointer, 8);
++ as_stwbrx(r, r0, tmp); // addr = tmp
++ // Load reversed 64-bit value from SP+8.
++ as_ld(r, StackPointer, 8);
++ as_addi(StackPointer, StackPointer, 16);
++ }
++}
++
++// ===============================================================
++// Arithmetic functions
++
++void MacroAssembler::addPtr(Register src, Register dest) {
++ as_add(dest, dest, src);
++}
++
++void MacroAssembler::addPtr(Imm32 imm, Register dest) {
++ int32_t val = imm.value;
++ if (is_intN(val, 16)) {
++ as_addi(dest, dest, val);
++ return;
++ }
++ if (HasPOWER10()) {
++ // Imm32 always fits 34-bit signed; paddi does dest = dest + imm in one
++ // prefixed instruction with no scratch.
++ as_paddi(dest, dest, int64_t(val), /*R=*/false);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(int64_t(val)), scratch);
++ as_add(dest, dest, scratch);
++}
++
++void MacroAssembler::addPtr(ImmWord imm, Register dest) {
++ if (is_intN(int64_t(imm.value), 16)) {
++ as_addi(dest, dest, int16_t(imm.value));
++ return;
++ }
++ if (HasPOWER10() && is_intN((intptr_t)imm.value, 34)) {
++ as_paddi(dest, dest, (int64_t)imm.value, /*R=*/false);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(imm, scratch);
++ as_add(dest, dest, scratch);
++}
++
++void MacroAssembler::add64(Register64 src, Register64 dest) {
++ as_add(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::add64(Imm32 imm, Register64 dest) {
++ addPtr(Imm32(imm.value), dest.reg);
++}
++
++void MacroAssembler::add64(Imm64 imm, Register64 dest) {
++ if (is_intN(int64_t(imm.value), 16)) {
++ as_addi(dest.reg, dest.reg, int16_t(imm.value));
++ return;
++ }
++ if (HasPOWER10() && is_intN((int64_t)imm.value, 34)) {
++ as_paddi(dest.reg, dest.reg, (int64_t)imm.value, /*R=*/false);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.reg != scratch);
++ movePtr(ImmWord(imm.value), scratch);
++ as_add(dest.reg, dest.reg, scratch);
++}
++
++void MacroAssembler::add32(Register src, Register dest) {
++ as_add(dest, dest, src);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::add32(Imm32 imm, Register dest) {
++ if (is_intN(imm.value, 16)) {
++ as_addi(dest, dest, imm.value);
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ as_add(dest, dest, scratch);
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::add32(Imm32 imm, Register src, Register dest) {
++ move32(src, dest);
++ add32(imm, dest);
++}
++
++void MacroAssembler::add32(Imm32 imm, const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(dest, scratch);
++ add32(imm, scratch);
++ store32(scratch, dest);
++}
++
++void MacroAssembler::add32(const Address& src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(src, scratch);
++ as_add(dest, dest, scratch);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::addPtr(Imm32 imm, const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(dest, scratch);
++ addPtr(imm, scratch);
++ storePtr(scratch, dest);
++}
++
++void MacroAssembler::addPtr(const Address& src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(src, scratch);
++ addPtr(scratch, dest);
++}
++
++void MacroAssembler::addDouble(FloatRegister src, FloatRegister dest) {
++ as_fadd(dest, dest, src);
++}
++
++void MacroAssembler::addFloat32(FloatRegister src, FloatRegister dest) {
++ as_fadds(dest, dest, src);
++}
++
++CodeOffset MacroAssembler::sub32FromStackPtrWithPatch(Register dest) {
++ CodeOffset offset = CodeOffset(currentOffset());
++ emitLoad64Stanza(dest, 0);
++ as_subf(dest, dest, StackPointer);
++ return offset;
++}
++
++void MacroAssembler::patchSub32FromStackPtr(CodeOffset offset, Imm32 imm) {
++ Instruction* inst = (Instruction*)editSrc(BufferOffset(offset.offset()));
++ UpdateLoad64Value(inst, uint64_t(int64_t(imm.value)));
++}
++
++void MacroAssembler::subPtr(Register src, Register dest) {
++ as_subf(dest, src, dest);
++}
++
++void MacroAssembler::subPtr(Imm32 imm, Register dest) {
++ if (is_intN(-int64_t(imm.value), 16)) {
++ as_addi(dest, dest, -imm.value);
++ return;
++ }
++ if (HasPOWER10()) {
++ // -Imm32 fits 34-bit signed (worst case -INT32_MIN = +2^31, well within
++ // ±2^33). paddi with the negated immediate does the subtract in 1 prefixed
++ // insn with no scratch.
++ as_paddi(dest, dest, -int64_t(imm.value), /*R=*/false);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(int64_t(imm.value)), scratch);
++ as_subf(dest, scratch, dest);
++}
++
++void MacroAssembler::sub64(Register64 src, Register64 dest) {
++ as_subf(dest.reg, src.reg, dest.reg);
++}
++
++void MacroAssembler::sub64(Imm64 imm, Register64 dest) {
++ if (is_intN(-int64_t(imm.value), 16)) {
++ as_addi(dest.reg, dest.reg, int16_t(-int64_t(imm.value)));
++ return;
++ }
++ if (HasPOWER10() && is_intN(-(int64_t)imm.value, 34)) {
++ as_paddi(dest.reg, dest.reg, -(int64_t)imm.value, /*R=*/false);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.reg != scratch);
++ movePtr(ImmWord(imm.value), scratch);
++ as_subf(dest.reg, scratch, dest.reg);
++}
++
++void MacroAssembler::sub32(Register src, Register dest) {
++ as_subf(dest, src, dest);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::sub32(Imm32 imm, Register dest) {
++ if (is_intN(-int64_t(imm.value), 16)) {
++ as_addi(dest, dest, -imm.value);
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ as_subf(dest, scratch, dest);
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::sub32(const Address& src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(src, scratch);
++ as_subf(dest, scratch, dest);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::subPtr(Register src, const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(dest, scratch);
++ as_subf(scratch, src, scratch);
++ storePtr(scratch, dest);
++}
++
++void MacroAssembler::subPtr(const Address& addr, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(addr, scratch);
++ as_subf(dest, scratch, dest);
++}
++
++void MacroAssembler::subDouble(FloatRegister src, FloatRegister dest) {
++ as_fsub(dest, dest, src);
++}
++
++void MacroAssembler::subFloat32(FloatRegister src, FloatRegister dest) {
++ as_fsubs(dest, dest, src);
++}
++
++void MacroAssembler::mul64(const Register64& rhs, const Register64& srcDest) {
++ as_mulld(srcDest.reg, srcDest.reg, rhs.reg);
++}
++
++void MacroAssembler::mul64(Imm64 imm, const Register64& dest) {
++ if (is_intN(int64_t(imm.value), 16)) {
++ as_mulli(dest.reg, dest.reg, int16_t(imm.value));
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.reg != scratch);
++ movePtr(ImmWord(imm.value), scratch);
++ as_mulld(dest.reg, dest.reg, scratch);
++ }
++}
++
++void MacroAssembler::mul64(Imm64 imm, const Register64& dest,
++ const Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ mul64(imm, dest);
++}
++
++void MacroAssembler::mul64(const Register64& src, const Register64& dest,
++ const Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ as_mulld(dest.reg, dest.reg, src.reg);
++}
++
++void MacroAssembler::mulPtr(Register rhs, Register srcDest) {
++ as_mulld(srcDest, srcDest, rhs);
++}
++
++void MacroAssembler::mulPtr(ImmWord rhs, Register srcDest) {
++ if (is_intN(int64_t(rhs.value), 16)) {
++ as_mulli(srcDest, srcDest, int16_t(rhs.value));
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(srcDest != scratch);
++ movePtr(rhs, scratch);
++ mulPtr(scratch, srcDest);
++}
++
++void MacroAssembler::mulBy3(Register src, Register dest) {
++ // mulli is the 16-bit-immediate form of mulld. 1 insn, no scratch,
++ // src==dest aliasing safe (RA read before RT write).
++ as_mulli(dest, src, 3);
++}
++
++void MacroAssembler::mul32(Register rhs, Register srcDest) {
++ as_mullw(srcDest, srcDest, rhs);
++ as_extsw(srcDest, srcDest);
++}
++
++void MacroAssembler::mul32(Imm32 imm, Register srcDest) {
++ if (is_intN(imm.value, 16)) {
++ as_mulli(srcDest, srcDest, imm.value);
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ as_mullw(srcDest, srcDest, scratch);
++ }
++ as_extsw(srcDest, srcDest);
++}
++
++void MacroAssembler::mulHighUnsigned32(Imm32 imm, Register src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(src != scratch);
++ move32(imm, scratch);
++ as_mulhwu(dest, src, scratch);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::mulFloat32(FloatRegister src, FloatRegister dest) {
++ as_fmuls(dest, dest, src);
++}
++
++void MacroAssembler::mulDouble(FloatRegister src, FloatRegister dest) {
++ as_fmul(dest, dest, src);
++}
++
++void MacroAssembler::mulDoublePtr(ImmPtr imm, Register temp,
++ FloatRegister dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(imm, scratch);
++ as_lfd(ScratchDoubleReg, scratch, 0);
++ as_fmul(dest, dest, ScratchDoubleReg);
++}
++
++void MacroAssembler::inc64(AbsoluteAddress dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register addrReg = temps.Acquire();
++ movePtr(ImmWord(uintptr_t(dest.addr)), addrReg);
++ Register scratch = SecondScratchReg;
++ as_ld(scratch, addrReg, 0);
++ as_addi(scratch, scratch, 1);
++ as_std(scratch, addrReg, 0);
++}
++
++void MacroAssembler::divFloat32(FloatRegister src, FloatRegister dest) {
++ as_fdivs(dest, dest, src);
++}
++
++void MacroAssembler::divDouble(FloatRegister src, FloatRegister dest) {
++ as_fdiv(dest, dest, src);
++}
++
++void MacroAssembler::quotient32(Register lhs, Register rhs, Register dest,
++ bool isUnsigned) {
++ if (isUnsigned) {
++ as_divwu(dest, lhs, rhs);
++ } else {
++ as_divw(dest, lhs, rhs);
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::quotient64(Register lhs, Register rhs, Register dest,
++ bool isUnsigned) {
++ if (isUnsigned) {
++ as_divdu(dest, lhs, rhs);
++ } else {
++ as_divd(dest, lhs, rhs);
++ }
++}
++
++void MacroAssembler::remainder32(Register lhs, Register rhs, Register dest,
++ bool isUnsigned) {
++ if (HasPOWER9()) {
++ if (isUnsigned) {
++ as_moduw(dest, lhs, rhs);
++ } else {
++ as_modsw(dest, lhs, rhs);
++ }
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (isUnsigned) {
++ as_divwu(scratch, lhs, rhs);
++ as_mullw(scratch, scratch, rhs);
++ } else {
++ as_divw(scratch, lhs, rhs);
++ as_mullw(scratch, scratch, rhs);
++ }
++ as_subf(dest, scratch, lhs);
++ }
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::remainder64(Register lhs, Register rhs, Register dest,
++ bool isUnsigned) {
++ if (HasPOWER9()) {
++ if (isUnsigned) {
++ as_modud(dest, lhs, rhs);
++ } else {
++ as_modsd(dest, lhs, rhs);
++ }
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (isUnsigned) {
++ as_divdu(scratch, lhs, rhs);
++ as_mulld(scratch, scratch, rhs);
++ } else {
++ as_divd(scratch, lhs, rhs);
++ as_mulld(scratch, scratch, rhs);
++ }
++ as_subf(dest, scratch, lhs);
++ }
++}
++
++void MacroAssembler::neg64(Register64 reg) { as_neg(reg.reg, reg.reg); }
++
++void MacroAssembler::negPtr(Register reg) { as_neg(reg, reg); }
++
++void MacroAssembler::neg32(Register reg) {
++ as_neg(reg, reg);
++ as_extsw(reg, reg);
++}
++
++void MacroAssembler::negateDouble(FloatRegister reg) { as_fneg(reg, reg); }
++
++void MacroAssembler::negateFloat(FloatRegister reg) { as_fneg(reg, reg); }
++
++void MacroAssembler::abs32(Register src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ as_srawi(scratch, src, 31);
++ as_xor_(dest, src, scratch);
++ as_subf(dest, scratch, dest);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::absFloat32(FloatRegister src, FloatRegister dest) {
++ as_fabs(dest, src);
++}
++
++void MacroAssembler::absDouble(FloatRegister src, FloatRegister dest) {
++ as_fabs(dest, src);
++}
++
++void MacroAssembler::sqrtFloat32(FloatRegister src, FloatRegister dest) {
++ as_fsqrts(dest, src);
++}
++
++void MacroAssembler::sqrtDouble(FloatRegister src, FloatRegister dest) {
++ as_fsqrt(dest, src);
++}
++
++void MacroAssembler::min32(Register lhs, Register rhs, Register dest) {
++ as_cmpw(lhs, rhs);
++ // isel rt, ra, rb, cond: rt = (CR[cond] set) ? ra : rb
++ // LessThan set if lhs < rhs (signed), so pick lhs; else rhs = min.
++ as_isel(dest, lhs, rhs, LessThan, cr0);
++}
++
++void MacroAssembler::min32(Register lhs, Imm32 rhs, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(rhs, scratch);
++ min32(lhs, scratch, dest);
++}
++
++void MacroAssembler::max32(Register lhs, Register rhs, Register dest) {
++ as_cmpw(lhs, rhs);
++ // GT set if lhs > rhs (signed), so pick lhs; else rhs = max.
++ as_isel(dest, lhs, rhs, GreaterThan, cr0);
++}
++
++void MacroAssembler::max32(Register lhs, Imm32 rhs, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(rhs, scratch);
++ max32(lhs, scratch, dest);
++}
++
++void MacroAssembler::minPtr(Register lhs, Register rhs, Register dest) {
++ as_cmpd(lhs, rhs);
++ as_isel(dest, lhs, rhs, LessThan, cr0);
++}
++
++void MacroAssembler::minPtr(Register lhs, ImmWord rhs, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(rhs, scratch);
++ minPtr(lhs, scratch, dest);
++}
++
++void MacroAssembler::maxPtr(Register lhs, Register rhs, Register dest) {
++ as_cmpd(lhs, rhs);
++ as_isel(dest, lhs, rhs, GreaterThan, cr0);
++}
++
++void MacroAssembler::maxPtr(Register lhs, ImmWord rhs, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(rhs, scratch);
++ maxPtr(lhs, scratch, dest);
++}
++
++void MacroAssembler::minFloat32(FloatRegister other, FloatRegister srcDest,
++ bool handleNaN) {
++ if (HasPOWER9()) {
++ // xsminjdp matches ECMA-262 Math.min semantics for ±0 and NaN.
++ // Float32 values are stored as doubles in PPC FPRs; the J-form
++ // result is bit-exact for values representable in float32 (which
++ // includes every NaN/±0/±Inf corner case JS observes). 1 insn.
++ as_xsminjdp(srcDest, srcDest, other);
++ return;
++ }
++ Label done, nan, equal;
++ as_fcmpu(srcDest, other);
++ if (handleNaN) {
++ ma_b(Assembler::DoubleUnordered, &nan);
++ }
++ // Handle +0 vs -0.
++ ma_b(Assembler::DoubleEqual, &equal);
++ ma_b(Assembler::DoubleLessThan, &done);
++ as_fmr(srcDest, other);
++ jump(&done);
++
++ bind(&equal);
++ // Both operands are equal. Check if they're zero.
++ loadConstantFloat32(0.0f, ScratchFloat32Reg);
++ as_fcmpu(srcDest, ScratchFloat32Reg);
++ // If not zero, they're identical; keep srcDest.
++ ma_b(Assembler::DoubleNotEqual, &done);
++ // Both are some combination of +0/-0. For min, result should be -0
++ // if either is -0: -((-srcDest) - other) gives -0 when either is -0.
++ as_fneg(ScratchFloat32Reg, srcDest);
++ as_fsubs(ScratchFloat32Reg, ScratchFloat32Reg, other);
++ as_fneg(srcDest, ScratchFloat32Reg);
++ jump(&done);
++
++ if (handleNaN) {
++ bind(&nan);
++ as_fadds(srcDest, srcDest, other);
++ }
++ bind(&done);
++}
++
++void MacroAssembler::minDouble(FloatRegister other, FloatRegister srcDest,
++ bool handleNaN) {
++ if (HasPOWER9()) {
++ // xsminjdp matches ECMA-262 Math.min semantics exactly (covers
++ // 19 corner cases including ±0 and NaN). 1 insn vs ~12 for the
++ // fcmpu/branch fallback. POWER8 fallback follows.
++ as_xsminjdp(srcDest, srcDest, other);
++ return;
++ }
++ Label done, nan, equal;
++ as_fcmpu(srcDest, other);
++ if (handleNaN) {
++ ma_b(Assembler::DoubleUnordered, &nan);
++ }
++ // Handle +0 vs -0.
++ ma_b(Assembler::DoubleEqual, &equal);
++ ma_b(Assembler::DoubleLessThan, &done);
++ as_fmr(srcDest, other);
++ jump(&done);
++
++ bind(&equal);
++ loadConstantDouble(0.0, ScratchDoubleReg);
++ as_fcmpu(srcDest, ScratchDoubleReg);
++ ma_b(Assembler::DoubleNotEqual, &done);
++ // -((-srcDest) - other) gives -0 when either is -0.
++ as_fneg(ScratchDoubleReg, srcDest);
++ as_fsub(ScratchDoubleReg, ScratchDoubleReg, other);
++ as_fneg(srcDest, ScratchDoubleReg);
++ jump(&done);
++
++ if (handleNaN) {
++ bind(&nan);
++ as_fadd(srcDest, srcDest, other);
++ }
++ bind(&done);
++}
++
++void MacroAssembler::maxFloat32(FloatRegister other, FloatRegister srcDest,
++ bool handleNaN) {
++ if (HasPOWER9()) {
++ // See minFloat32 above for the float32 ↔ J-form bit-exactness note.
++ as_xsmaxjdp(srcDest, srcDest, other);
++ return;
++ }
++ Label done, nan, equal;
++ as_fcmpu(srcDest, other);
++ if (handleNaN) {
++ ma_b(Assembler::DoubleUnordered, &nan);
++ }
++ // Handle +0 vs -0.
++ ma_b(Assembler::DoubleEqual, &equal);
++ ma_b(Assembler::DoubleGreaterThan, &done);
++ as_fmr(srcDest, other);
++ jump(&done);
++
++ bind(&equal);
++ loadConstantFloat32(0.0f, ScratchFloat32Reg);
++ as_fcmpu(srcDest, ScratchFloat32Reg);
++ ma_b(Assembler::DoubleNotEqual, &done);
++ // -0 + -0 = -0 and -0 + 0 = +0.
++ as_fadds(srcDest, srcDest, other);
++ jump(&done);
++
++ if (handleNaN) {
++ bind(&nan);
++ as_fadds(srcDest, srcDest, other);
++ }
++ bind(&done);
++}
++
++void MacroAssembler::maxDouble(FloatRegister other, FloatRegister srcDest,
++ bool handleNaN) {
++ if (HasPOWER9()) {
++ // See minDouble above for the J-form semantics note.
++ as_xsmaxjdp(srcDest, srcDest, other);
++ return;
++ }
++ Label done, nan, equal;
++ as_fcmpu(srcDest, other);
++ if (handleNaN) {
++ ma_b(Assembler::DoubleUnordered, &nan);
++ }
++ // Handle +0 vs -0.
++ ma_b(Assembler::DoubleEqual, &equal);
++ ma_b(Assembler::DoubleGreaterThan, &done);
++ as_fmr(srcDest, other);
++ jump(&done);
++
++ bind(&equal);
++ loadConstantDouble(0.0, ScratchDoubleReg);
++ as_fcmpu(srcDest, ScratchDoubleReg);
++ ma_b(Assembler::DoubleNotEqual, &done);
++ // -0 + -0 = -0 and -0 + 0 = +0.
++ as_fadd(srcDest, srcDest, other);
++ jump(&done);
++
++ if (handleNaN) {
++ bind(&nan);
++ as_fadd(srcDest, srcDest, other);
++ }
++ bind(&done);
++}
++
++// ===============================================================
++// Shift functions
++
++void MacroAssembler::lshift32(Register src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register masked = temps.Acquire();
++ as_rlwinm(masked, src, 0, 27, 31);
++ as_slw(dest, dest, masked);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::lshift32(Imm32 imm, Register dest) {
++ lshift32(imm, dest, dest);
++}
++
++void MacroAssembler::lshift32(Imm32 imm, Register src, Register dest) {
++ x_slwi(dest, src, imm.value & 0x1f);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::flexibleLshift32(Register src, Register dest) {
++ lshift32(src, dest);
++}
++
++void MacroAssembler::lshift64(Register shift, Register64 dest) {
++ // PPC64 sld uses a 7-bit shift field; shifts >= 64 produce 0.
++ // Wasm i64.shl requires shift count modulo 64, so mask to 6 bits.
++ UseScratchRegisterScope temps(asMasm());
++ Register masked = temps.Acquire();
++ as_rldicl(masked, shift, 0, 58); // clrldi: keep low 6 bits
++ as_sld(dest.reg, dest.reg, masked);
++}
++
++void MacroAssembler::lshift64(Imm32 imm, Register64 dest) {
++ MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++ x_sldi(dest.reg, dest.reg, imm.value);
++}
++
++void MacroAssembler::lshiftPtr(Register shift, Register dest) {
++ as_sld(dest, dest, shift);
++}
++
++void MacroAssembler::lshiftPtr(Imm32 imm, Register dest) {
++ lshiftPtr(imm, dest, dest);
++}
++
++void MacroAssembler::lshiftPtr(Imm32 imm, Register src, Register dest) {
++ MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++ x_sldi(dest, src, imm.value);
++}
++
++void MacroAssembler::flexibleLshiftPtr(Register shift, Register srcDest) {
++ lshiftPtr(shift, srcDest);
++}
++
++void MacroAssembler::rshift32(Register src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register masked = temps.Acquire();
++ as_rlwinm(masked, src, 0, 27, 31);
++ as_srw(dest, dest, masked);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::rshift32(Imm32 imm, Register dest) {
++ rshift32(imm, dest, dest);
++}
++
++void MacroAssembler::rshift32(Imm32 imm, Register src, Register dest) {
++ x_srwi(dest, src, imm.value & 0x1f);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::flexibleRshift32(Register src, Register dest) {
++ rshift32(src, dest);
++}
++
++void MacroAssembler::rshift32Arithmetic(Register src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register masked = temps.Acquire();
++ as_rlwinm(masked, src, 0, 27, 31);
++ as_sraw(dest, dest, masked);
++}
++
++void MacroAssembler::rshift32Arithmetic(Imm32 imm, Register dest) {
++ rshift32Arithmetic(imm, dest, dest);
++}
++
++void MacroAssembler::rshift32Arithmetic(Imm32 imm, Register src,
++ Register dest) {
++ as_srawi(dest, src, imm.value & 0x1f);
++}
++
++void MacroAssembler::flexibleRshift32Arithmetic(Register src, Register dest) {
++ rshift32Arithmetic(src, dest);
++}
++
++void MacroAssembler::rshift64(Register shift, Register64 dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register masked = temps.Acquire();
++ as_rldicl(masked, shift, 0, 58);
++ as_srd(dest.reg, dest.reg, masked);
++}
++
++void MacroAssembler::rshift64(Imm32 imm, Register64 dest) {
++ MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++ x_srdi(dest.reg, dest.reg, imm.value);
++}
++
++void MacroAssembler::rshift64Arithmetic(Imm32 imm, Register64 dest) {
++ MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++ as_sradi(dest.reg, dest.reg, imm.value);
++}
++
++void MacroAssembler::rshift64Arithmetic(Register shift, Register64 dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register masked = temps.Acquire();
++ as_rldicl(masked, shift, 0, 58);
++ as_srad(dest.reg, dest.reg, masked);
++}
++
++void MacroAssembler::rshiftPtr(Register shift, Register dest) {
++ as_srd(dest, dest, shift);
++}
++
++void MacroAssembler::rshiftPtr(Imm32 imm, Register dest) {
++ rshiftPtr(imm, dest, dest);
++}
++
++void MacroAssembler::rshiftPtr(Imm32 imm, Register src, Register dest) {
++ MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++ x_srdi(dest, src, imm.value);
++}
++
++void MacroAssembler::flexibleRshiftPtr(Register shift, Register srcDest) {
++ rshiftPtr(shift, srcDest);
++}
++
++void MacroAssembler::rshiftPtrArithmetic(Imm32 imm, Register dest) {
++ rshiftPtrArithmetic(imm, dest, dest);
++}
++
++void MacroAssembler::rshiftPtrArithmetic(Imm32 imm, Register src,
++ Register dest) {
++ MOZ_ASSERT(0 <= imm.value && imm.value < 64);
++ as_sradi(dest, src, imm.value);
++}
++
++void MacroAssembler::rshiftPtrArithmetic(Register shift, Register dest) {
++ as_srad(dest, dest, shift);
++}
++
++void MacroAssembler::flexibleRshiftPtrArithmetic(Register shift,
++ Register srcDest) {
++ rshiftPtrArithmetic(shift, srcDest);
++}
++
++// ===============================================================
++// Rotation functions
++
++void MacroAssembler::rotateLeft(Register count, Register input, Register dest) {
++ // PPC rotlw is rlwnm with full mask: rlwnm dest, input, count, 0, 31
++ as_rlwnm(dest, input, count, 0, 31);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::rotateLeft(Imm32 count, Register input, Register dest) {
++ as_rlwinm(dest, input, count.value & 31, 0, 31);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::rotateLeft64(Register count, Register64 src,
++ Register64 dest, Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ // rldcl dest, src, count, 0 — rotate left doubleword then clear left 0 bits.
++ as_rldcl(dest.reg, src.reg, count, 0);
++}
++
++void MacroAssembler::rotateLeft64(Imm32 count, Register64 src, Register64 dest,
++ Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ // rldicl dest, src, count, 0 — rotate left doubleword immediate then clear.
++ as_rldicl(dest.reg, src.reg, count.value & 63, 0);
++}
++
++void MacroAssembler::rotateRight(Register count, Register input,
++ Register dest) {
++ // rotateRight(n) = rotateLeft(32-n). When dest != input, the negated
++ // count can land directly in dest, dropping the GPR scratch. dest may
++ // alias count harmlessly (subfic reads count, then writes dest, then
++ // rlwnm consumes the new dest as its rotate-count).
++ if (dest != input) {
++ as_subfic(dest, count, 32);
++ as_rlwnm(dest, input, dest, 0, 31);
++ as_extsw(dest, dest);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ as_subfic(scratch, count, 32);
++ as_rlwnm(dest, input, scratch, 0, 31);
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::rotateRight(Imm32 count, Register input, Register dest) {
++ rotateLeft(Imm32((32 - count.value) & 31), input, dest);
++}
++
++void MacroAssembler::rotateRight64(Register count, Register64 src,
++ Register64 dest, Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ // Same shape as rotateRight32: when dest != src, the negated count
++ // can land directly in dest, dropping the GPR scratch.
++ if (dest.reg != src.reg) {
++ as_subfic(dest.reg, count, 64);
++ as_rldcl(dest.reg, src.reg, dest.reg, 0);
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ as_subfic(scratch, count, 64);
++ as_rldcl(dest.reg, src.reg, scratch, 0);
++}
++
++void MacroAssembler::rotateRight64(Imm32 count, Register64 src, Register64 dest,
++ Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ rotateLeft64(Imm32((64 - count.value) & 63), src, dest, temp);
++}
++
++// ===============================================================
++// Bit counting functions
++
++void MacroAssembler::clz64(Register64 src, Register64 dest) {
++ as_cntlzd(dest.reg, src.reg);
++}
++
++void MacroAssembler::ctz64(Register64 src, Register64 dest) {
++ if (HasPOWER9()) {
++ as_cnttzd(dest.reg, src.reg);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register tmp = temps.Acquire();
++ as_neg(tmp, src.reg);
++ // and. (record form) sets CR0[eq] based on result; result is 0 iff src==0,
++ // so this folds the explicit zero-check that would otherwise need cmpdi.
++ as_and__rc(tmp, src.reg, tmp); // tmp = x & -x; CR0[eq] = (src == 0)
++ as_cntlzd(tmp, tmp); // tmp = clz(isolated bit)
++ as_subfic(dest.reg, tmp, 63); // dest = 63 - clz = ctz (for nonzero)
++ xs_li(tmp, 64);
++ as_isel(dest.reg, tmp, dest.reg, Equal); // CR0[eq] → 64 if src==0
++ }
++}
++
++void MacroAssembler::popcnt64(Register64 input, Register64 output,
++ Register tmp) {
++ as_popcntd(output.reg, input.reg);
++}
++
++void MacroAssembler::clz32(Register src, Register dest, bool knownNotZero) {
++ as_cntlzw(dest, src);
++}
++
++void MacroAssembler::ctz32(Register src, Register dest, bool knownNotZero) {
++ if (HasPOWER9()) {
++ as_cnttzw(dest, src);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register tmp = temps.Acquire();
++ as_neg(tmp, src);
++ // and. record form folds the cmpwi src,0 that would otherwise be needed
++ // to drive the isel below: tmp == 0 iff src == 0.
++ if (knownNotZero) {
++ as_and_(tmp, src, tmp);
++ } else {
++ as_and__rc(tmp, src, tmp); // CR0[eq] = (src == 0)
++ }
++ as_cntlzw(tmp, tmp);
++ as_subfic(dest, tmp, 31);
++ if (!knownNotZero) {
++ xs_li(tmp, 32);
++ as_isel(dest, tmp, dest, Equal); // CR0[eq] → 32 if src==0
++ }
++ }
++}
++
++void MacroAssembler::popcnt32(Register input, Register output, Register tmp) {
++ as_popcntw(output, input);
++ // popcntw gives per-word results; on 64-bit the low word count is in
++ // bits 32:63, so just mask to 32 bits.
++ as_rlwinm(output, output, 0, 0, 31);
++}
++
++// ===============================================================
++// Condition functions
++
++void MacroAssembler::cmp8Set(Condition cond, Address lhs, Imm32 rhs,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != lhs.base);
++ bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++ if (isUnsigned) {
++ load8ZeroExtend(lhs, scratch);
++ Condition c = ma_cmp(scratch, Imm32(uint8_t(rhs.value)), cond, true);
++ ma_cmp_set(dest, c);
++ } else {
++ load8SignExtend(lhs, scratch);
++ Condition c = ma_cmp(scratch, Imm32(int8_t(rhs.value)), cond, true);
++ ma_cmp_set(dest, c);
++ }
++}
++
++void MacroAssembler::cmp16Set(Condition cond, Address lhs, Imm32 rhs,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != lhs.base);
++ bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++ if (isUnsigned) {
++ load16ZeroExtend(lhs, scratch);
++ Condition c = ma_cmp(scratch, Imm32(uint16_t(rhs.value)), cond, true);
++ ma_cmp_set(dest, c);
++ } else {
++ load16SignExtend(lhs, scratch);
++ Condition c = ma_cmp(scratch, Imm32(int16_t(rhs.value)), cond, true);
++ ma_cmp_set(dest, c);
++ }
++}
++
++template <typename T1, typename T2>
++void MacroAssembler::cmp32Set(Condition cond, T1 lhs, T2 rhs, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Register64 lhs, Register64 rhs,
++ Register dest) {
++ Condition c = ma_cmp(lhs.reg, rhs.reg, cond);
++ ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Register64 lhs, Imm64 rhs,
++ Register dest) {
++ Condition c = ma_cmp(lhs.reg, ImmWord(uint64_t(rhs.value)), cond);
++ ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Address lhs, Register64 rhs,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs.reg, cond);
++ ma_cmp_set(dest, c);
++}
++
++void MacroAssembler::cmp64Set(Condition cond, Address lhs, Imm64 rhs,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, ImmWord(uint64_t(rhs.value)), cond);
++ ma_cmp_set(dest, c);
++}
++
++template <typename T1, typename T2>
++void MacroAssembler::cmpPtrSet(Condition cond, T1 lhs, T2 rhs, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond);
++ ma_cmp_set(dest, c);
++}
++
++// ===============================================================
++// Branch functions
++
++void MacroAssembler::branch8(Condition cond, const Address& lhs, Imm32 rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // Mirror ARM64/LoongArch64/RISC-V: narrow the immediate to the 8-bit
++ // memory operand width so both sides of the compare have matching bit
++ // patterns regardless of how move32(Imm32) materializes the imm. Use
++ // uint8 cast for equality / unsigned, int8 cast for signed relational.
++ bool isEqOrNe = (cond == Assembler::Equal) || (cond == Assembler::NotEqual);
++ bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++ Imm32 narrowed(0);
++ if (isEqOrNe || isUnsigned) {
++ load8ZeroExtend(lhs, scratch);
++ narrowed = Imm32(uint8_t(rhs.value));
++ } else {
++ load8SignExtend(lhs, scratch);
++ narrowed = Imm32(int8_t(rhs.value));
++ }
++ Condition c = ma_cmp(scratch, narrowed, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch8(Condition cond, const BaseIndex& lhs, Register rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load8ZeroExtend(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch16(Condition cond, const Address& lhs, Imm32 rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // See branch8: narrow the immediate to 16 bits so both sides have matching
++ // bit patterns. uint16 for equality / unsigned, int16 for signed relational.
++ bool isEqOrNe = (cond == Assembler::Equal) || (cond == Assembler::NotEqual);
++ bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
++ Imm32 narrowed(0);
++ if (isEqOrNe || isUnsigned) {
++ load16ZeroExtend(lhs, scratch);
++ narrowed = Imm32(uint16_t(rhs.value));
++ } else {
++ load16SignExtend(lhs, scratch);
++ narrowed = Imm32(int16_t(rhs.value));
++ }
++ Condition c = ma_cmp(scratch, narrowed, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, Register lhs, Register rhs,
++ Label* label) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, Register lhs, Imm32 imm,
++ Label* label) {
++ Condition c = ma_cmp(lhs, imm, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const Address& lhs, Register rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const Address& lhs, Imm32 rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const AbsoluteAddress& lhs,
++ Register rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++ load32(Address(scratch, 0), scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const AbsoluteAddress& lhs,
++ Imm32 rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++ load32(Address(scratch, 0), scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, const BaseIndex& lhs, Imm32 rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch32(Condition cond, wasm::SymbolicAddress addr,
++ Imm32 imm, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(addr, scratch);
++ load32(Address(scratch, 0), scratch);
++ Condition c = ma_cmp(scratch, imm, cond, true);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branch64(Condition cond, Register64 lhs, Imm64 val,
++ Label* success, Label* fail) {
++ Condition c = ma_cmp(lhs.reg, ImmWord(uint64_t(val.value)), cond);
++ if (fail) {
++ ma_b(c, success);
++ jump(fail);
++ } else {
++ ma_b(c, success);
++ }
++}
++
++void MacroAssembler::branch64(Condition cond, Register64 lhs, Register64 rhs,
++ Label* success, Label* fail) {
++ Condition c = ma_cmp(lhs.reg, rhs.reg, cond);
++ if (fail) {
++ ma_b(c, success);
++ jump(fail);
++ } else {
++ ma_b(c, success);
++ }
++}
++
++void MacroAssembler::branch64(Condition cond, const Address& lhs, Imm64 val,
++ Label* success, Label* fail) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, ImmWord(uint64_t(val.value)), cond);
++ if (fail) {
++ ma_b(c, success);
++ jump(fail);
++ } else {
++ ma_b(c, success);
++ }
++}
++
++void MacroAssembler::branch64(Condition cond, const Address& lhs,
++ Register64 rhs, Label* success, Label* fail) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs.reg, cond);
++ if (fail) {
++ ma_b(c, success);
++ jump(fail);
++ } else {
++ ma_b(c, success);
++ }
++}
++
++void MacroAssembler::branch64(Condition cond, const Address& lhs,
++ const Address& rhs, Register scratch,
++ Label* label) {
++ loadPtr(rhs, scratch);
++ branch64(cond, lhs, Register64(scratch), label, nullptr);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, Register rhs,
++ Label* label) {
++ Condition c = ma_cmp(lhs, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, Imm32 rhs,
++ Label* label) {
++ Condition c = ma_cmp(lhs, ImmWord(int64_t(rhs.value)), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmPtr rhs,
++ Label* label) {
++ Condition c = ma_cmp(lhs, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmGCPtr rhs,
++ Label* label) {
++ Condition c = ma_cmp(lhs, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmWord rhs,
++ Label* label) {
++ Condition c = ma_cmp(lhs, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, Register rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmPtr rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmGCPtr rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmWord rhs,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const AbsoluteAddress& lhs,
++ Register rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++ loadPtr(Address(scratch, 0), scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const AbsoluteAddress& lhs,
++ ImmWord rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++ loadPtr(Address(scratch, 0), scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, wasm::SymbolicAddress lhs,
++ Register rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(lhs, scratch);
++ loadPtr(Address(scratch, 0), scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const BaseIndex& lhs,
++ Register rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPtr(Condition cond, const BaseIndex& lhs,
++ ImmWord rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchPrivatePtr(Condition cond, const Address& lhs,
++ Register rhs, Label* label) {
++ branchPtr(cond, lhs, rhs, label);
++}
++
++void MacroAssembler::branchFloat(DoubleCondition cond, FloatRegister lhs,
++ FloatRegister rhs, Label* label) {
++ as_fcmpu(lhs, rhs);
++ ma_b(cond, label);
++}
++
++void MacroAssembler::branchTruncateFloat32MaybeModUint32(FloatRegister src,
++ Register dest,
++ Label* fail) {
++ // Convert float32 to int64 (truncating toward zero), fail on NaN/overflow.
++ as_fctidz(ScratchDoubleReg, src);
++ as_mfvsrd(dest, ScratchDoubleReg);
++ // PPC64 fctidz saturates to INT64_MIN on negative overflow/NaN,
++ // and to INT64_MAX on positive overflow. Check both.
++ asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MIN)), fail);
++ asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MAX)), fail);
++ // Truncate to uint32 (keep low 32 bits).
++ as_rldicl(dest, dest, 0, 32);
++}
++
++void MacroAssembler::branchTruncateFloat32ToInt32(FloatRegister src,
++ Register dest, Label* fail) {
++ convertFloat32ToInt32(src, dest, fail, false);
++}
++
++void MacroAssembler::branchDouble(DoubleCondition cond, FloatRegister lhs,
++ FloatRegister rhs, Label* label) {
++ as_fcmpu(lhs, rhs);
++ ma_b(cond, label);
++}
++
++void MacroAssembler::branchTruncateDoubleMaybeModUint32(FloatRegister src,
++ Register dest,
++ Label* fail) {
++ // Convert double to int64 (truncating toward zero), fail on NaN/overflow.
++ as_fctidz(ScratchDoubleReg, src);
++ as_mfvsrd(dest, ScratchDoubleReg);
++ // PPC64 fctidz saturates to INT64_MIN on negative overflow/NaN,
++ // and to INT64_MAX on positive overflow. Check both.
++ asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MIN)), fail);
++ asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MAX)), fail);
++ // Truncate to uint32 (keep low 32 bits).
++ as_rldicl(dest, dest, 0, 32);
++}
++
++void MacroAssembler::branchTruncateDoubleToInt32(FloatRegister src,
++ Register dest, Label* fail) {
++ convertDoubleToInt32(src, dest, fail, false);
++}
++
++void MacroAssembler::branchInt64NotInPtrRange(Register64 src, Label* label) {
++ // No-op on 64-bit.
++}
++
++void MacroAssembler::branchUInt64NotInPtrRange(Register64 src, Label* label) {
++ // Branch if src >= 2^63 (sign bit set = out of signed ptr range).
++ as_cmpdi(src.reg, 0);
++ ma_b(Assembler::LessThan, label);
++}
++
++template <typename T>
++void MacroAssembler::branchAdd32(Condition cond, T src, Register dest,
++ Label* overflow) {
++ switch (cond) {
++ case Overflow: {
++ // Do raw 64-bit add (no sign extension) so we can detect 32-bit overflow.
++ // Both inputs should already be sign-extended 32-bit values, so the
++ // 64-bit result is mathematically correct. If extsw(result) != result,
++ // the 32-bit add overflowed.
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ addPtr(src, dest);
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ as_extsw(dest, dest);
++ ma_b(NotEqual, overflow);
++ break;
++ }
++ case NonZero:
++ case Zero:
++ add32(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == NonZero ? NotEqual : Equal, overflow);
++ break;
++ case Signed:
++ case NotSigned:
++ add32(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, overflow);
++ break;
++ case CarryClear:
++ case CarrySet: {
++ // Unsigned 32-bit carry detection: save dest, do 32-bit add,
++ // then unsigned-compare result with original. If result < original
++ // (unsigned), a carry occurred.
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(dest, scratch);
++ add32(src, dest);
++ as_cmplw(dest, scratch);
++ ma_b(cond == CarrySet ? LessThan : GreaterThanOrEqual, overflow);
++ break;
++ }
++ default:
++ MOZ_CRASH("NYI");
++ }
++}
++
++template <typename T>
++void MacroAssembler::branchSub32(Condition cond, T src, Register dest,
++ Label* overflow) {
++ switch (cond) {
++ case Overflow: {
++ // Do raw 64-bit sub (no sign extension) so we can detect 32-bit overflow.
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ subPtr(src, dest);
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ as_extsw(dest, dest);
++ ma_b(NotEqual, overflow);
++ break;
++ }
++ case NonZero:
++ case Zero:
++ sub32(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == NonZero ? NotEqual : Equal, overflow);
++ break;
++ case Signed:
++ case NotSigned:
++ sub32(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, overflow);
++ break;
++ default:
++ MOZ_CRASH("NYI");
++ }
++}
++
++template <typename T>
++void MacroAssembler::branchMul32(Condition cond, T src, Register dest,
++ Label* overflow) {
++ MOZ_ASSERT(cond == Overflow);
++ // Do raw 64-bit multiply (no sign extension) so we can detect 32-bit
++ // overflow. as_mulld gives full 64-bit low result; if extsw(result) !=
++ // result, overflow. scratch is dead after the mulld (consumed as RB),
++ // so the sign-extension round-trip reuses it instead of acquiring a
++ // second scratch.
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ move32(src, scratch);
++ as_mulld(dest, dest, scratch);
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ as_extsw(dest, dest);
++ ma_b(NotEqual, overflow);
++}
++
++template <typename T>
++void MacroAssembler::branchRshift32(Condition cond, T src, Register dest,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero);
++ rshift32(src, dest);
++ branch32(cond == Zero ? Equal : NotEqual, dest, Imm32(0), label);
++}
++
++void MacroAssembler::branchNeg32(Condition cond, Register reg, Label* label) {
++ MOZ_ASSERT(cond == Overflow);
++ neg32(reg);
++ branch32(Equal, reg, Imm32(INT32_MIN), label);
++}
++
++template <typename T>
++void MacroAssembler::branchAddPtr(Condition cond, T src, Register dest,
++ Label* label) {
++ switch (cond) {
++ case Overflow: {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(dest, scratch); // scratch = old_dest
++ addPtr(src, dest); // dest = result = old_dest + src
++ as_xor_(SecondScratchReg, dest,
++ scratch); // SecondScratch = result ^ old_dest
++ as_subf(scratch, scratch,
++ dest); // scratch = result - old_dest = src_value
++ as_xor_(scratch, scratch, dest); // scratch = src_value ^ result
++ // (old_dest ^ result) & (src_value ^ result): bit 63 set iff overflow.
++ // and. record form sets CR0[lt]=(bit 63 set), folding the cmpdi.
++ as_and__rc(scratch, scratch, SecondScratchReg);
++ ma_b(LessThan, label);
++ break;
++ }
++ case NonZero:
++ case Zero:
++ addPtr(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == NonZero ? NotEqual : Equal, label);
++ break;
++ case Signed:
++ case NotSigned:
++ addPtr(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, label);
++ break;
++ case CarryClear:
++ case CarrySet: {
++ // Unsigned 64-bit carry detection: save dest, do 64-bit add,
++ // then unsigned-compare result with original. If result < original
++ // (unsigned), a carry occurred.
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(dest, scratch);
++ addPtr(src, dest);
++ as_cmpld(dest, scratch);
++ ma_b(cond == CarrySet ? LessThan : GreaterThanOrEqual, label);
++ break;
++ }
++ default:
++ MOZ_CRASH("NYI");
++ }
++}
++
++template <typename T>
++void MacroAssembler::branchSubPtr(Condition cond, T src, Register dest,
++ Label* label) {
++ switch (cond) {
++ case Overflow: {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(dest, scratch); // scratch = old_dest
++ subPtr(src, dest); // dest = result = old_dest - src
++ // Overflow if (old_dest ^ src_value) & (old_dest ^ result) has bit 63
++ // set.
++ as_subf(SecondScratchReg, dest,
++ scratch); // SecondScratch = old_dest - result = src_value
++ as_xor_(SecondScratchReg, scratch,
++ SecondScratchReg); // old_dest ^ src_value
++ as_xor_(scratch, scratch, dest); // old_dest ^ result
++ // Record-form AND sets CR0 to the signed compare of the result vs 0,
++ // so a separate cmpdi is unnecessary; LessThan reads CR0.LT.
++ as_and__rc(scratch, scratch, SecondScratchReg);
++ ma_b(LessThan, label);
++ break;
++ }
++ case NonZero:
++ case Zero:
++ subPtr(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == NonZero ? NotEqual : Equal, label);
++ break;
++ case Signed:
++ case NotSigned:
++ subPtr(src, dest);
++ as_cmpdi(dest, 0);
++ ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, label);
++ break;
++ default:
++ MOZ_CRASH("NYI");
++ }
++}
++
++void MacroAssembler::branchMulPtr(Condition cond, Register src, Register dest,
++ Label* label) {
++ MOZ_ASSERT(cond == Assembler::Overflow);
++ as_mulldo(dest, dest, src);
++ ma_b(Overflow, label);
++}
++
++void MacroAssembler::branchNegPtr(Condition cond, Register reg, Label* label) {
++ MOZ_ASSERT(cond == Overflow);
++ negPtr(reg);
++ branchPtr(Assembler::Equal, reg, ImmWord(intptr_t(INTPTR_MIN)), label);
++}
++
++void MacroAssembler::decBranchPtr(Condition cond, Register lhs, Imm32 rhs,
++ Label* label) {
++ subPtr(rhs, lhs);
++ branchPtr(cond, lhs, Imm32(0), label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, Register lhs, Register rhs,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (lhs != rhs) {
++ as_and_(scratch, lhs, rhs);
++ as_extsw_rc(scratch, scratch); // CR0 set on sign-extended i32; folds cmpdi
++ } else {
++ as_extsw_rc(scratch, lhs);
++ }
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, Register lhs, Imm32 rhs,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (is_uintN(rhs.value, 16)) {
++ as_andi_rc(scratch, lhs, rhs.value);
++ // andi_rc sets CR0 on the masked value, but only the low 16 bits matter
++ // since rhs is a 16-bit unsigned mask — sign of the i32 result is always
++ // 0, so CR0[lt] is always 0. For Signed/NotSigned conditions the answer
++ // is fixed; for Zero/NonZero CR0[eq] is correct.
++ } else {
++ move32(rhs, scratch);
++ as_and_(scratch, lhs, scratch);
++ as_extsw_rc(scratch, scratch); // CR0 set on sign-extended i32; folds cmpdi
++ }
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, const Address& lhs, Imm32 rhs,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ // and32 picks up the rlwinm contig-mask fast path for non-16-bit-fit
++ // immediates that are a contiguous run of 1-bits (common: tag masks,
++ // header bit-fields). It also emits the trailing extsw.
++ and32(rhs, scratch);
++ as_cmpdi(scratch, 0);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTest32(Condition cond, const AbsoluteAddress& lhs,
++ Imm32 rhs, Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)lhs.addr), scratch);
++ load32(Address(scratch, 0), scratch);
++ and32(rhs, scratch);
++ as_cmpdi(scratch, 0);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, Register lhs, Register rhs,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ if (lhs == rhs) {
++ as_cmpdi(lhs, 0);
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // Record-form AND sets CR0; no follow-up cmpdi needed.
++ as_and__rc(scratch, lhs, rhs);
++ }
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, Register lhs, Imm32 rhs,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (is_uintN(rhs.value, 16)) {
++ as_andi_rc(scratch, lhs, rhs.value);
++ } else {
++ move32(rhs, scratch);
++ as_and__rc(scratch, lhs, scratch); // record form folds the cmpdi
++ }
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, Register lhs, ImmWord rhs,
++ Label* label) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(rhs, scratch);
++ as_and__rc(scratch, lhs, scratch);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTestPtr(Condition cond, const Address& lhs,
++ Imm32 rhs, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ // andPtr picks up the rlwinm contig-mask fast path for non-16-bit-fit
++ // immediates that are a contiguous run of 1-bits.
++ andPtr(rhs, scratch);
++ as_cmpdi(scratch, 0);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_b(base, label);
++}
++
++void MacroAssembler::branchTest64(Condition cond, Register64 lhs,
++ Register64 rhs, Register temp, Label* success,
++ Label* fail) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ as_and__rc(scratch, lhs.reg, rhs.reg);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ if (fail) {
++ ma_b(base, success);
++ jump(fail);
++ } else {
++ ma_b(base, success);
++ }
++}
++
++void MacroAssembler::branchTest64(Condition cond, Register64 lhs, Imm64 rhs,
++ Label* success, Label* fail) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
++ cond == NotSigned);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(uint64_t(rhs.value)), scratch);
++ as_and__rc(scratch, lhs.reg, scratch);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ if (fail) {
++ ma_b(base, success);
++ jump(fail);
++ } else {
++ ma_b(base, success);
++ }
++}
++
++// ===============================================================
++// Value-type branch functions
++
++void MacroAssembler::branchTestUndefined(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_UNDEFINED), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestUndefined(Condition cond,
++ const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_UNDEFINED), label);
++}
++
++void MacroAssembler::branchTestUndefined(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_UNDEFINED), label);
++}
++
++void MacroAssembler::branchTestUndefined(Condition cond,
++ const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_UNDEFINED), label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_INT32), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_INT32), label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_INT32), label);
++}
++
++void MacroAssembler::branchTestInt32(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_INT32), label);
++}
++
++void MacroAssembler::branchTestInt32Truthy(bool b, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ unboxInt32(value, scratch);
++ as_cmpwi(scratch, 0);
++ ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition actual = (cond == Equal) ? BelowOrEqual : Above;
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_MAX_DOUBLE), actual);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestDouble(cond, scratch, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestDouble(cond, tag, label);
++}
++
++void MacroAssembler::branchTestDouble(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestDouble(cond, tag, label);
++}
++
++void MacroAssembler::branchTestDoubleTruthy(bool b, FloatRegister value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ xs_li(scratch, 0);
++ as_mtvsrd(ScratchDoubleReg, scratch);
++ as_fcmpu(value, ScratchDoubleReg);
++ DoubleCondition cond = b ? DoubleNotEqual : DoubleEqualOrUnordered;
++ ma_b(cond, label);
++}
++
++void MacroAssembler::branchTestNumber(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition actual = (cond == Equal) ? BelowOrEqual : Above;
++ Condition c = ma_cmp(tag, Imm32(JS::detail::ValueUpperInclNumberTag), actual);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestNumber(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestNumber(cond, scratch, label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BOOLEAN), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond,
++ const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_BOOLEAN), label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BOOLEAN), label);
++}
++
++void MacroAssembler::branchTestBoolean(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BOOLEAN), label);
++}
++
++void MacroAssembler::branchTestBooleanTruthy(bool b, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ unboxBoolean(value, scratch);
++ as_cmpwi(scratch, 0);
++ ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_STRING), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_STRING), label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_STRING), label);
++}
++
++void MacroAssembler::branchTestString(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_STRING), label);
++}
++
++void MacroAssembler::branchTestStringTruthy(bool b, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ unboxString(value, scratch);
++ load32(Address(scratch, JSString::offsetOfLength()), scratch);
++ as_cmpwi(scratch, 0);
++ ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_SYMBOL), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_SYMBOL), label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_SYMBOL), label);
++}
++
++void MacroAssembler::branchTestSymbol(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_SYMBOL), label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BIGINT), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_BIGINT), label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BIGINT), label);
++}
++
++void MacroAssembler::branchTestBigInt(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BIGINT), label);
++}
++
++void MacroAssembler::branchTestBigIntTruthy(bool b, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ unboxBigInt(value, scratch);
++ load32(Address(scratch, BigInt::offsetOfDigitLength()), scratch);
++ as_cmpwi(scratch, 0);
++ ma_b(b ? NotEqual : Equal, label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_NULL), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_NULL), label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_NULL), label);
++}
++
++void MacroAssembler::branchTestNull(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_NULL), label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_OBJECT), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_OBJECT), label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_OBJECT), label);
++}
++
++void MacroAssembler::branchTestObject(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_OBJECT), label);
++}
++
++void MacroAssembler::branchTestPrimitive(Condition cond,
++ const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestPrimitive(cond, scratch, label);
++}
++
++void MacroAssembler::branchTestGCThing(Condition cond, const Address& address,
++ Label* label) {
++ branchTestGCThingImpl(cond, address, label);
++}
++
++void MacroAssembler::branchTestGCThing(Condition cond, const BaseIndex& address,
++ Label* label) {
++ branchTestGCThingImpl(cond, address, label);
++}
++
++void MacroAssembler::branchTestGCThing(Condition cond,
++ const ValueOperand& address,
++ Label* label) {
++ branchTestGCThingImpl(cond, address, label);
++}
++
++template <typename T>
++void MacroAssembler::branchTestGCThingImpl(Condition cond, const T& address,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ Condition actual = (cond == Equal) ? AboveOrEqual : Below;
++ Condition c =
++ ma_cmp(tag, Imm32(JS::detail::ValueLowerInclGCThingTag), actual);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestPrimitive(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition actual = (cond == Equal) ? Below : AboveOrEqual;
++ Condition c =
++ ma_cmp(tag, Imm32(JS::detail::ValueUpperExclPrimitiveTag), actual);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, Register tag,
++ Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_MAGIC), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const Address& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_MAGIC), label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const BaseIndex& address,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(address, scratch);
++ branchTestTag(cond, tag, ImmTag(JSVAL_TAG_MAGIC), label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const ValueOperand& value,
++ Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(value, scratch);
++ branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_MAGIC), label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const Address& valaddr,
++ JSWhyMagic why, Label* label) {
++ uint64_t magic = MagicValue(why).asRawBits();
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(valaddr, scratch);
++ Condition c = ma_cmp(scratch, ImmWord(magic), cond);
++ ma_b(c, label);
++}
++
++void MacroAssembler::branchTestMagic(Condition cond, const BaseIndex& valaddr,
++ JSWhyMagic why, Label* label) {
++ uint64_t magic = MagicValue(why).asRawBits();
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(valaddr, scratch);
++ Condition c = ma_cmp(scratch, ImmWord(magic), cond);
++ ma_b(c, label);
++}
++
++template <typename T>
++void MacroAssembler::branchTestValue(Condition cond, const T& lhs,
++ const ValueOperand& rhs, Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs.valueReg(), cond);
++ ma_b(c, label);
++}
++
++// ===============================================================
++// Test-set functions
++
++template <typename T>
++void MacroAssembler::testNumberSet(Condition cond, const T& src,
++ Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(src, scratch);
++ Condition actual = (cond == Equal) ? BelowOrEqual : Above;
++ Condition c = ma_cmp(tag, Imm32(JS::detail::ValueUpperInclNumberTag), actual);
++ ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testBooleanSet(Condition cond, const T& src,
++ Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(src, scratch);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BOOLEAN), cond);
++ ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testStringSet(Condition cond, const T& src,
++ Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(src, scratch);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_STRING), cond);
++ ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testSymbolSet(Condition cond, const T& src,
++ Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(src, scratch);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_SYMBOL), cond);
++ ma_cmp_set(dest, c);
++}
++
++template <typename T>
++void MacroAssembler::testBigIntSet(Condition cond, const T& src,
++ Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ Register tag = extractTag(src, scratch);
++ Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BIGINT), cond);
++ ma_cmp_set(dest, c);
++}
++
++// ===============================================================
++// Computed address / conditional move / conditional load
++
++void MacroAssembler::branchToComputedAddress(const BaseIndex& addr) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(addr, scratch);
++ branch(scratch);
++}
++
++void MacroAssembler::cmp32Move32(Condition cond, Register lhs, Imm32 rhs,
++ Register src, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32Move32(Condition cond, Register lhs, Register rhs,
++ Register src, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32Move32(Condition cond, Register lhs,
++ const Address& rhs, Register src,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(rhs, scratch);
++ Condition c = ma_cmp(lhs, scratch, cond, true);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32MovePtr(Condition cond, Register lhs, Imm32 rhs,
++ Register src, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs, Imm32 rhs,
++ Register src, Register dest) {
++ Condition c = ma_cmp(lhs, ImmWord(int64_t(rhs.value)), cond);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs, Register rhs,
++ Register src, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs,
++ const Address& rhs, Register src,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(rhs, scratch);
++ Condition c = ma_cmp(lhs, scratch, cond);
++ ma_cmp_move(dest, src, c);
++}
++
++void MacroAssembler::cmp32Load32(Condition cond, Register lhs,
++ const Address& rhs, const Address& src,
++ Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(rhs, scratch);
++ Condition c = ma_cmp(lhs, scratch, cond, true);
++ // Conditional load: load into scratch, then isel.
++ load32(src, scratch);
++ ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::cmp32Load32(Condition cond, Register lhs, Register rhs,
++ const Address& src, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(src, scratch);
++ ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::cmp32Load32(Condition cond, Register lhs, Imm32 rhs,
++ const Address& src, Register dest) {
++ Condition c = ma_cmp(lhs, rhs, cond, true);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(src, scratch);
++ ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::cmp32LoadPtr(Condition cond, const Address& lhs, Imm32 rhs,
++ const Address& src, Register dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ Condition c = ma_cmp(scratch, rhs, cond, true);
++ loadPtr(src, scratch);
++ ma_cmp_move(dest, scratch, c);
++}
++
++void MacroAssembler::test32LoadPtr(Condition cond, const Address& addr,
++ Imm32 mask, const Address& src,
++ Register dest) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(addr, scratch);
++ if (is_uintN(mask.value, 16)) {
++ as_andi_rc(scratch, scratch, mask.value);
++ } else {
++ // Use a nested scope so scratch2 is released before loadPtr below.
++ UseScratchRegisterScope temps2(asMasm());
++ Register scratch2 = temps2.Acquire();
++ move32(mask, scratch2);
++ as_and__rc(scratch, scratch, scratch2); // record form folds the cmpdi
++ }
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ loadPtr(src, scratch);
++ ma_cmp_move(dest, scratch, base);
++}
++
++void MacroAssembler::test32MovePtr(Condition cond, Register operand, Imm32 mask,
++ Register src, Register dest) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (is_uintN(mask.value, 16)) {
++ as_andi_rc(scratch, operand, mask.value);
++ } else {
++ move32(mask, scratch);
++ as_and__rc(scratch, operand, scratch); // record form folds the cmpdi
++ }
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_cmp_move(dest, src, base);
++}
++
++void MacroAssembler::test32MovePtr(Condition cond, const Address& addr,
++ Imm32 mask, Register src, Register dest) {
++ MOZ_ASSERT(cond == Zero || cond == NonZero);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(addr, scratch);
++ and32(mask, scratch);
++ as_cmpdi(scratch, 0);
++ Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
++ ma_cmp_move(dest, src, base);
++}
++
++// ===============================================================
++// Spectre mitigations
++
++void MacroAssembler::spectreMovePtr(Condition cond, Register src,
++ Register dest) {
++ // Assumes compare already issued.
++ Condition base = static_cast<Condition>(
++ cond & ~(Assembler::ConditionUnsigned | Assembler::ConditionZero));
++ ma_cmp_move(dest, src, base);
++}
++
++void MacroAssembler::spectreZeroRegister(Condition cond, Register scratch,
++ Register dest) {
++ // Assumes compare already issued. Zero dest if condition is true.
++ Condition origBase = static_cast<Condition>(
++ cond & ~(Assembler::ConditionUnsigned | Assembler::ConditionZero));
++ // If original condition is true, we want dest=0.
++ // isel: if condition true, select zero; else keep dest.
++ xs_li(scratch, 0);
++ ma_cmp_move(dest, scratch, origBase);
++}
++
++void MacroAssembler::spectreBoundsCheck32(Register index, Register length,
++ Register maybeScratch,
++ Label* failure) {
++ Condition c = ma_cmp(index, length, Below, true);
++ if (failure) {
++ ma_b(InvertCondition(c), failure);
++ }
++ if (maybeScratch != InvalidReg) {
++ xs_li(maybeScratch, 0);
++ ma_cmp_move(index, maybeScratch, InvertCondition(c));
++ }
++}
++
++void MacroAssembler::spectreBoundsCheck32(Register index, const Address& length,
++ Register maybeScratch,
++ Label* failure) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(length, scratch);
++ spectreBoundsCheck32(index, scratch, maybeScratch, failure);
++}
++
++void MacroAssembler::spectreBoundsCheckPtr(Register index, Register length,
++ Register maybeScratch,
++ Label* failure) {
++ Condition c = ma_cmp(index, length, Below);
++ if (failure) {
++ ma_b(InvertCondition(c), failure);
++ }
++ if (maybeScratch != InvalidReg) {
++ xs_li(maybeScratch, 0);
++ ma_cmp_move(index, maybeScratch, InvertCondition(c));
++ }
++}
++
++void MacroAssembler::spectreBoundsCheckPtr(Register index,
++ const Address& length,
++ Register maybeScratch,
++ Label* failure) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(length, scratch);
++ spectreBoundsCheckPtr(index, scratch, maybeScratch, failure);
++}
++
++// ===============================================================
++// Memory access primitives
++
++FaultingCodeOffset MacroAssembler::storeFloat32(FloatRegister src,
++ const Address& addr) {
++ MOZ_ASSERT(addr.base != r0);
++ if (is_intN(addr.offset, 16)) {
++ return FaultingCodeOffset(as_stfs(src, addr.base, addr.offset).getOffset());
++ }
++ if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++ return FaultingCodeOffset(
++ as_pstfs(src, addr.base, (int64_t)addr.offset, /*R=*/false)
++ .getOffset());
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(addr.offset), scratch);
++ return FaultingCodeOffset(as_stfsx(src, addr.base, scratch).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeFloat32(FloatRegister src,
++ const BaseIndex& addr) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ computeEffectiveAddress(addr, scratch);
++ return FaultingCodeOffset(as_stfs(src, scratch, 0).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeDouble(FloatRegister src,
++ const Address& addr) {
++ MOZ_ASSERT(addr.base != r0);
++ if (is_intN(addr.offset, 16)) {
++ return FaultingCodeOffset(as_stfd(src, addr.base, addr.offset).getOffset());
++ }
++ if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++ return FaultingCodeOffset(
++ as_pstfd(src, addr.base, (int64_t)addr.offset, /*R=*/false)
++ .getOffset());
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(addr.offset), scratch);
++ return FaultingCodeOffset(as_stfdx(src, addr.base, scratch).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeDouble(FloatRegister src,
++ const BaseIndex& addr) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ computeEffectiveAddress(addr, scratch);
++ return FaultingCodeOffset(as_stfd(src, scratch, 0).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeFloat16(FloatRegister src,
++ const Address& dest,
++ Register temp) {
++ MOZ_ASSERT(HasPOWER9());
++ if (dest.offset == 0) {
++ return FaultingCodeOffset(as_stxsihx(src, r0, dest.base).getOffset());
++ }
++ if (is_intN(dest.offset, 16)) {
++ as_addi(temp, dest.base, dest.offset);
++ return FaultingCodeOffset(as_stxsihx(src, r0, temp).getOffset());
++ }
++ movePtr(ImmWord(dest.offset), temp);
++ return FaultingCodeOffset(as_stxsihx(src, dest.base, temp).getOffset());
++}
++
++FaultingCodeOffset MacroAssembler::storeFloat16(FloatRegister src,
++ const BaseIndex& dest,
++ Register temp) {
++ MOZ_ASSERT(HasPOWER9());
++ computeEffectiveAddress(dest, temp);
++ return FaultingCodeOffset(as_stxsihx(src, r0, temp).getOffset());
++}
++
++void MacroAssembler::memoryBarrier(MemoryBarrier barrier) {
++ if (barrier.isNone()) {
++ return;
++ }
++ if (barrier.hasStoreLoad() || barrier.hasSync()) {
++ as_sync();
++ } else {
++ as_lwsync();
++ }
++}
++
++// ===============================================================
++// Clamping functions
++
++void MacroAssembler::clampIntToUint8(Register reg) {
++ // Clamp to [0, 255].
++ Label done;
++ as_cmpwi(reg, 255);
++ ma_b(LessThanOrEqual, &done);
++ move32(Imm32(255), reg);
++ bind(&done);
++ Label positive;
++ as_cmpwi(reg, 0);
++ ma_b(GreaterThanOrEqual, &positive);
++ move32(Imm32(0), reg);
++ bind(&positive);
++}
++
++// ===============================================================
++// Unboxing
++
++void MacroAssembler::fallibleUnboxPtr(const ValueOperand& src, Register dest,
++ JSValueType type, Label* fail) {
++ MOZ_ASSERT(type == JSVAL_TYPE_OBJECT || type == JSVAL_TYPE_STRING ||
++ type == JSVAL_TYPE_SYMBOL || type == JSVAL_TYPE_BIGINT);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ splitTag(src, scratch);
++ Condition c = ma_cmp(scratch, ImmTag(JSVAL_TYPE_TO_TAG(type)), NotEqual);
++ ma_b(c, fail);
++ unboxNonDouble(src, dest, type);
++}
++
++void MacroAssembler::fallibleUnboxPtr(const Address& src, Register dest,
++ JSValueType type, Label* fail) {
++ loadValue(src, ValueOperand(dest));
++ fallibleUnboxPtr(ValueOperand(dest), dest, type, fail);
++}
++
++void MacroAssembler::fallibleUnboxPtr(const BaseIndex& src, Register dest,
++ JSValueType type, Label* fail) {
++ loadValue(src, ValueOperand(dest));
++ fallibleUnboxPtr(ValueOperand(dest), dest, type, fail);
++}
++
++void MacroAssembler::wasmAddSubI128HI64(Register lhsLo, Register lhsHi,
++ Register rhsLo, Register rhsHi,
++ Register output, bool isAdd) {
++ MOZ_RELEASE_ASSERT(output != lhsLo && output != lhsHi && output != rhsLo &&
++ output != rhsHi);
++ if (isAdd) {
++ // addc sets CA (carry), adde uses it.
++ as_addc(output, lhsLo, rhsLo); // output = lhsLo + rhsLo, CA = carry
++ as_adde(output, lhsHi, rhsHi); // output = lhsHi + rhsHi + CA
++ } else {
++ // subfc: rd = rb - ra, sets CA (borrow complement).
++ // subfe: rd = rb + ~ra + CA.
++ as_subfc(output, rhsLo, lhsLo); // output = lhsLo - rhsLo, CA = ~borrow
++ as_subfe(output, rhsHi, lhsHi); // output = lhsHi - rhsHi - borrow
++ }
++}
++
++void MacroAssembler::wasmMulI64WideHI64(Register lhs, Register rhs,
++ Register output, bool isSigned) {
++ if (isSigned) {
++ as_mulhd(output, lhs, rhs);
++ } else {
++ as_mulhdu(output, lhs, rhs);
++ }
++}
++
++//}}} check_macroassembler_style
++
++void MacroAssemblerPPC64Compat::incrementInt32Value(const Address& addr) {
++ asMasm().add32(Imm32(1), addr);
++}
++
++void MacroAssemblerPPC64Compat::retn(Imm32 n) {
++ // Load return address from [SP,0] first, then adjust SP, then return.
++ // Must load RA before adjusting SP (like loong64), since the RA is at
++ // the current top of stack, not at SP+n.
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ as_ld(scratch, StackPointer, 0);
++ if (n.value != 0) {
++ asMasm().addPtr(Imm32(n.value), StackPointer);
++ }
++ xs_mtlr(scratch);
++ as_blr();
++}
++
++// ===============================================================
++// Template specializations (outside check_macroassembler_style)
++
++template <>
++inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Address lhs,
++ ImmPtr rhs, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Assembler::Condition c = ma_cmp(scratch, rhs, cond);
++ ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Register lhs,
++ Address rhs, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ loadPtr(rhs, scratch);
++ Assembler::Condition c = ma_cmp(lhs, scratch, cond);
++ ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Address lhs,
++ Register rhs, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ loadPtr(lhs, scratch);
++ Assembler::Condition c = ma_cmp(scratch, rhs, cond);
++ ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Register lhs,
++ Address rhs, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ load32(rhs, scratch);
++ Assembler::Condition c = ma_cmp(lhs, scratch, cond, true);
++ ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Address lhs,
++ Register rhs, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ Assembler::Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_cmp_set(dest, c);
++}
++
++template <>
++inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Address lhs,
++ Imm32 rhs, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ load32(lhs, scratch);
++ Assembler::Condition c = ma_cmp(scratch, rhs, cond, true);
++ ma_cmp_set(dest, c);
++}
++
++//{{{ check_macroassembler_style
++// ===============================================================
++// SIMD load/store (128-bit)
++
++FaultingCodeOffset MacroAssembler::loadUnalignedSimd128(const Address& src,
++ FloatRegister dest) {
++ UseScratchRegisterScope temps(asMasm());
++ if (HasPOWER10() && is_intN((intptr_t)src.offset, 34)) {
++ // POWER10 prefixed load — natural-LE byte order, no GPR scratch.
++ return FaultingCodeOffset(
++ as_plxv(dest.encoding(), src.base, (int64_t)src.offset, /*R=*/false)
++ .getOffset());
++ }
++ if (HasPOWER9()) {
++ // POWER9: lxvx (X-form, indexed) loads 128 bits in correct LE order.
++ Register scratch = temps.Acquire();
++ if (src.offset == 0) {
++ // RA=0 means "use 0 as base" in indexed forms, so use r0 encoding.
++ return FaultingCodeOffset(as_lxvx(dest, r0, src.base).getOffset());
++ }
++ movePtr(ImmWord(src.offset), scratch);
++ return FaultingCodeOffset(as_lxvx(dest, src.base, scratch).getOffset());
++ }
++ // POWER8: lxvd2x loads with doubleword swap on LE. Fix with xxpermdi.
++ Register scratch = temps.Acquire();
++ FaultingCodeOffset fco;
++ if (src.offset == 0) {
++ fco = FaultingCodeOffset(as_lxvd2x(dest, r0, src.base).getOffset());
++ } else {
++ movePtr(ImmWord(src.offset), scratch);
++ fco = FaultingCodeOffset(as_lxvd2x(dest, src.base, scratch).getOffset());
++ }
++ as_xxpermdi(dest, dest, dest, 2);
++ return fco;
++}
++
++FaultingCodeOffset MacroAssembler::loadUnalignedSimd128(const BaseIndex& src,
++ FloatRegister dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (src.offset != 0) {
++ // addPtr picks up POWER10 paddi (1 prefixed insn) when available;
++ // falls back to movePtr+add on P9/P8. Drops the explicit scratch2.
++ addPtr(ImmWord(src.offset), scratch);
++ }
++ if (HasPOWER9()) {
++ return FaultingCodeOffset(as_lxvx(dest, r0, scratch).getOffset());
++ }
++ FaultingCodeOffset fco(as_lxvd2x(dest, r0, scratch).getOffset());
++ as_xxpermdi(dest, dest, dest, 2);
++ return fco;
++}
++
++FaultingCodeOffset MacroAssembler::storeUnalignedSimd128(FloatRegister src,
++ const Address& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ if (HasPOWER10() && is_intN((intptr_t)dest.offset, 34)) {
++ // POWER10 prefixed store — natural-LE byte order, no GPR scratch.
++ return FaultingCodeOffset(
++ as_pstxv(src.encoding(), dest.base, (int64_t)dest.offset, /*R=*/false)
++ .getOffset());
++ }
++ if (HasPOWER9()) {
++ Register scratch = temps.Acquire();
++ if (dest.offset == 0) {
++ return FaultingCodeOffset(as_stxvx(src, r0, dest.base).getOffset());
++ }
++ movePtr(ImmWord(dest.offset), scratch);
++ return FaultingCodeOffset(as_stxvx(src, dest.base, scratch).getOffset());
++ }
++ // POWER8: stxvd2x stores with doubleword swap on LE.
++ // Swap before store, then swap back to restore the register.
++ ScratchSimd128Scope scratch128(*this);
++ as_xxpermdi(scratch128, src, src, 2);
++ Register scratch = temps.Acquire();
++ FaultingCodeOffset fco;
++ if (dest.offset == 0) {
++ fco = FaultingCodeOffset(as_stxvd2x(scratch128, r0, dest.base).getOffset());
++ } else {
++ movePtr(ImmWord(dest.offset), scratch);
++ fco = FaultingCodeOffset(
++ as_stxvd2x(scratch128, dest.base, scratch).getOffset());
++ }
++ return fco;
++}
++
++FaultingCodeOffset MacroAssembler::storeUnalignedSimd128(
++ FloatRegister src, const BaseIndex& dest) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ computeScaledAddress(dest, scratch);
++ if (dest.offset != 0) {
++ addPtr(ImmWord(dest.offset), scratch);
++ }
++ if (HasPOWER9()) {
++ return FaultingCodeOffset(as_stxvx(src, r0, scratch).getOffset());
++ }
++ ScratchSimd128Scope scratch128(*this);
++ as_xxpermdi(scratch128, src, src, 2);
++ return FaultingCodeOffset(as_stxvd2x(scratch128, r0, scratch).getOffset());
++}
++
++// ===============================================================
++// SIMD operations
++//
++// Scratch register conventions for SIMD helpers (read this before writing
++// a new one):
++//
++// 1. `ScratchSimd128Scope scratch(*this)` — acquires v0 (= VR0 = VSR32,
++// non-allocatable). Constructed as {FloatRegisters::f0, Simd128} so
++// encoding() = 0 + 32 = 32 (per Architecture-ppc64.h). Default temp.
++// One scope at a time per helper. Safe to pass to any VMX/VSX
++// instruction; the allocator never places a live v128 in v0.
++//
++// 2. **Do NOT** write to VR1..VR31 (= VSR33..VSR63) without a Lowering
++// temp. VR1..VR31 are allocatable; a live wasm v128 may be sitting in
++// any of them. Use `ScratchSimd128Scope` (rule 1) or a Lowering temp.
++//
++// 3. **Red-zone stash** — use `RedZoneStashSimd128` / `RedZoneRestoreSimd128`
++// (declared just below) when a helper genuinely needs >1 SIMD scratch
++// AND adding a Lowering temp would require LIR + MIR + CodeGen changes.
++// ELFv2 reserves 288 bytes below SP; we use at most 32 (two 16-byte
++// slots). Live users: `extAddPairwiseInt*` (2 slots), `swizzleInt8x16`
++// (1 slot), `dotInt8x16Int7x16ThenAdd` 4-arg (1 slot). If you find
++// yourself wanting a 3rd slot or nested save/restore, prefer a Lowering
++// temp instead — the red-zone approach is tolerable because it's
++// self-contained to a single helper. The `MOZ_ASSERT(slot < 2)` inside
++// the helpers enforces this at test time.
++//
++// Simd128 lives in VR-namespace (VSR32-63), so VMX ops address Simd128
++// FloatRegisters directly with no staging. Encoding is 32-63; the VMX
++// VR field is 5-bit (0-31), so we mask with `& 31`.
++
++// Two 16-byte Simd128 slots available in the ELFv2 red zone for short-lived
++// SIMD spills (see point 3 of the SIMD conventions preamble above).
++static constexpr int kRedZoneSimd128MaxSlots = 2;
++
++static inline void RedZoneStashSimd128(MacroAssembler& masm, FloatRegister src,
++ int slot) {
++ MOZ_ASSERT(slot >= 0 && slot < kRedZoneSimd128MaxSlots);
++ masm.storeUnalignedSimd128(src, Address(StackPointer, -16 * (slot + 1)));
++}
++
++static inline void RedZoneRestoreSimd128(MacroAssembler& masm, int slot,
++ FloatRegister dest) {
++ MOZ_ASSERT(slot >= 0 && slot < kRedZoneSimd128MaxSlots);
++ masm.loadUnalignedSimd128(Address(StackPointer, -16 * (slot + 1)), dest);
++}
++
++typedef void (*VmxBinaryFn)(Assembler&, uint8_t, uint8_t, uint8_t);
++
++static void EmitVmxBinary(MacroAssembler& masm, VmxBinaryFn vmxOp,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31,
++ lhs.encoding() & 31, rhs.encoding() & 31);
++}
++
++// Macro for defining VMX binary wrappers.
++#define VMX_BINARY_WRAPPER(vmxInst) \
++ [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb) { \
++ a.as_##vmxInst(vrt, vra, vrb); \
++ }
++
++// Emit op directly on Simd128 dest, then xxlnor in place.
++template <typename VmxBinaryFnT>
++static void EmitVmxBinaryNot(MacroAssembler& masm, VmxBinaryFnT vmxOp,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31,
++ lhs.encoding() & 31, rhs.encoding() & 31);
++ masm.as_xxlnor(dest, dest, dest);
++}
++
++// Integer SIMD compare helper. VMX compare instructions produce all-ones
++// for true, all-zeros for false per element.
++// Available VMX compares: vcmpequ* (eq), vcmpgts* (signed gt), vcmpgtu*
++// (unsigned gt). Other conditions derived by swapping operands or
++// complementing.
++template <typename EqFn, typename GtsFn, typename GtuFn>
++static void EmitVmxCompare(MacroAssembler& masm, Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, EqFn eqFn, GtsFn gtsFn,
++ GtuFn gtuFn) {
++ switch (cond) {
++ case Assembler::Equal:
++ EmitVmxBinary(masm, eqFn, lhs, rhs, dest);
++ break;
++ case Assembler::NotEqual:
++ EmitVmxBinaryNot(masm, eqFn, lhs, rhs, dest);
++ break;
++ case Assembler::GreaterThan:
++ EmitVmxBinary(masm, gtsFn, lhs, rhs, dest);
++ break;
++ case Assembler::GreaterThanOrEqual:
++ // !(rhs > lhs)
++ EmitVmxBinaryNot(masm, gtsFn, rhs, lhs, dest);
++ break;
++ case Assembler::LessThan:
++ // rhs > lhs (swap)
++ EmitVmxBinary(masm, gtsFn, rhs, lhs, dest);
++ break;
++ case Assembler::LessThanOrEqual:
++ // !(lhs > rhs)
++ EmitVmxBinaryNot(masm, gtsFn, lhs, rhs, dest);
++ break;
++ case Assembler::Above:
++ EmitVmxBinary(masm, gtuFn, lhs, rhs, dest);
++ break;
++ case Assembler::AboveOrEqual:
++ EmitVmxBinaryNot(masm, gtuFn, rhs, lhs, dest);
++ break;
++ case Assembler::Below:
++ EmitVmxBinary(masm, gtuFn, rhs, lhs, dest);
++ break;
++ case Assembler::BelowOrEqual:
++ EmitVmxBinaryNot(masm, gtuFn, lhs, rhs, dest);
++ break;
++ default:
++ MOZ_CRASH("Unexpected SIMD integer condition");
++ }
++}
++
++// Emit ternary VMX op directly on Simd128 regs, no staging.
++typedef void (*VmxTernaryFn)(Assembler&, uint8_t, uint8_t, uint8_t, uint8_t);
++
++static void EmitVmxTernary(MacroAssembler& masm, VmxTernaryFn vmxOp,
++ FloatRegister a, FloatRegister b, FloatRegister c,
++ FloatRegister dest) {
++ vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31, a.encoding() & 31,
++ b.encoding() & 31, c.encoding() & 31);
++}
++
++// Emit unary VMX op directly on Simd128 regs, no staging.
++typedef void (*VmxUnaryFn)(Assembler&, uint8_t, uint8_t);
++
++static void EmitVmxUnary(MacroAssembler& masm, VmxUnaryFn vmxOp,
++ FloatRegister src, FloatRegister dest) {
++ vmxOp(static_cast<Assembler&>(masm), dest.encoding() & 31,
++ src.encoding() & 31);
++}
++
++// Helper: create a zero SIMD register using xxlxor.
++static void ZeroSimd128(MacroAssembler& masm, FloatRegister dest) {
++ masm.as_xxlxor(dest, dest, dest);
++}
++
++void MacroAssembler::moveSimd128(FloatRegister src, FloatRegister dest) {
++ if (src != dest) {
++ as_xxlor(dest, src, src);
++ }
++}
++
++void MacroAssembler::loadConstantSimd128(const SimdConstant& v,
++ FloatRegister dest) {
++ // Load 128-bit constant from inline constant pool.
++ // Clobbers SecondScratchReg (r12).
++ loadFromPoolSimd128(dest, v);
++}
++
++// PPC64 LE lane mapping:
++// Wasm lane K = memory byte K = register byte (15-K).
++// mfvsrd extracts register bits[0:63] = BE dword 0 = Wasm lanes 8-15 (bytes).
++// For VMX byte ops, BE byte index = 15 - wasm_lane.
++// For VMX halfword ops, BE halfword index = 7 - wasm_halfword.
++// For VSX word ops (xxspltw), BE word index = 3 - wasm_word.
++// For doubleword ops, BE dword index = 1 - wasm_dword.
++
++void MacroAssembler::splatX16(Register src, FloatRegister dest) {
++ // mtvsrd writes src into BE 0..63 of dest (low byte at BE byte 7);
++ // vspltb then splats that byte over all 16 lanes. dest aliases as
++ // both source and destination — vspltb tolerates this. No extra
++ // scratch register required, so callers that already hold a
++ // ScratchSimd128Scope (extAddPairwise*, var-shift narrow forms) do
++ // not see a nested-acquire collision.
++ as_mtvsrd(dest, src);
++ as_vspltb(dest, dest, 7);
++}
++
++void MacroAssembler::splatX8(Register src, FloatRegister dest) {
++ // Same shape as splatX16 with halfword granularity. mtvsrd places
++ // the low 16 bits at BE halfword 3 (= BE bytes 6..7); vsplth picks
++ // it up and splats across 8 lanes. vsplth reads only the chosen
++ // halfword, so negative i32 inputs do not need a 16-bit pre-mask
++ // (which the previous GPR-replicate path required).
++ as_mtvsrd(dest, src);
++ as_vsplth(dest, dest, 3);
++}
++
++void MacroAssembler::splatX4(Register src, FloatRegister dest) {
++ if (HasPOWER9()) {
++ as_mtvsrws(dest, src);
++ } else {
++ as_mtvsrd(dest, src);
++ as_xxspltw(dest, dest, 1);
++ }
++}
++
++void MacroAssembler::splatX4(FloatRegister src, FloatRegister dest) {
++ // src is a double-precision FPR holding a float value (the JIT keeps
++ // FP32 in DP-equivalent form on PPC64). Convert DP→SP into BE word 0
++ // (xscvdpspn lays the single at bits[0:31] / BE word 0), then splat
++ // word 0 to all four lanes.
++ as_xscvdpspn(dest, src);
++ as_xxspltw(dest, dest, 0);
++}
++
++void MacroAssembler::splatX2(FloatRegister src, FloatRegister dest) {
++ // Splat scalar double to both doubleword lanes.
++ // Scalar value is in register bits[0:63] (BE dword 0).
++ // xxpermdi dm=0: dest = [src.dw0, src.dw0]
++ as_xxpermdi(dest, src, src, 0);
++}
++
++// Helpers: splat Imm32 into SIMD register at various element widths.
++// VMX shift instructions read the shift count from EACH element independently,
++// so the count must be replicated to every byte/halfword/word as appropriate.
++//
++// Fast path for small constants: vspltis{b,h,w} (POWER7+) splats a 5-bit
++// signed immediate to all lanes in 1 insn with no pool entry. For values
++// outside [-16, 15] we fall back to the inline-pool path.
++static void SplatImm8(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
++ int8_t val = (int8_t)imm.value;
++ if (val >= -16 && val <= 15) {
++ masm.as_vspltisb(dest.encoding() & 31, val);
++ return;
++ }
++ if (HasPOWER9()) {
++ // P9 xxspltib handles the full 8-bit range in 1 insn.
++ masm.as_xxspltib(dest, (uint8_t)val);
++ return;
++ }
++ int8_t bytes[16];
++ for (int i = 0; i < 16; i++) bytes[i] = val;
++ masm.loadConstantSimd128(SimdConstant::CreateX16(bytes), dest);
++}
++
++static void SplatImm16(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
++ int16_t val = (int16_t)imm.value;
++ if (val >= -16 && val <= 15) {
++ masm.as_vspltish(dest.encoding() & 31, (int8_t)val);
++ return;
++ }
++ int16_t halfs[8];
++ for (int i = 0; i < 8; i++) halfs[i] = val;
++ masm.loadConstantSimd128(SimdConstant::CreateX8(halfs), dest);
++}
++
++static void SplatImm32(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
++ int32_t val = imm.value;
++ if (val >= -16 && val <= 15) {
++ masm.as_vspltisw(dest.encoding() & 31, (int8_t)val);
++ return;
++ }
++ int32_t words[4] = {val, val, val, val};
++ masm.loadConstantSimd128(SimdConstant::CreateX4(words), dest);
++}
++
++// ===============================================================
++// Extract lane
++
++static void ExtractLaneToGPR(MacroAssembler& masm, uint32_t lane,
++ FloatRegister src, Register dest,
++ unsigned laneWidthBytes, unsigned laneWidthBits) {
++ // Extract Wasm lane from vector register to GPR.
++ // Wasm lane K → register byte offset (15 - K*laneWidthBytes).
++ //
++ // Strategy: use mfvsrd to get one 64-bit half of the register, then shift
++ // and mask to isolate the lane.
++ //
++ // mfvsrd gets register bits[0:63] (BE dword 0) = Wasm lanes in the high
++ // half of the register (high-numbered lanes in LE memory order).
++ // For an N-bit lane at Wasm index L:
++ // If L is in the high dword (L >= 8/laneWidthBytes):
++ // use mfvsrd; lane is at GPR bit offset laneWidthBits*(L -
++ // 8/laneWidthBytes) from LSB
++ // Else (L in low dword):
++ // swap dwords, then mfvsrd; lane is at GPR bit offset laneWidthBits*L
++ // from LSB
++
++ unsigned lanesPerDword = 8 / laneWidthBytes;
++
++ if (lane >= lanesPerDword) {
++ masm.as_mfvsrd(dest, src);
++ unsigned shift = laneWidthBits * (lane - lanesPerDword);
++ if (shift) {
++ masm.x_srdi(dest, dest, shift);
++ }
++ } else {
++ if (HasPOWER9()) {
++ masm.as_mfvsrld(dest, src);
++ } else {
++ // POWER8: swap dwords to get dw1 into scalar position.
++ // Avoid ScratchSimd128Scope — callers may already hold it.
++ // Use xxpermdi directly on ScratchSimd128Reg (v0/VSR32, non-allocatable).
++ masm.as_xxpermdi(ScratchSimd128Reg, src, src, 2);
++ masm.as_mfvsrd(dest, ScratchSimd128Reg);
++ }
++ unsigned shift = laneWidthBits * lane;
++ if (shift) {
++ masm.x_srdi(dest, dest, shift);
++ }
++ }
++}
++
++void MacroAssembler::unsignedExtractLaneInt8x16(uint32_t lane,
++ FloatRegister src,
++ Register dest) {
++ MOZ_ASSERT(lane < 16);
++ if (HasPOWER9()) {
++ // vextractub puts VRB.BE_byte[UIM] at VRT.BE_byte[7] with the rest
++ // zeroed; mfvsrd then reads BE bytes 0..7 → low byte of dest, high
++ // bytes already 0. No mask needed.
++ as_vextractub(ScratchSimd128Reg, src, 15 - lane);
++ as_mfvsrd(dest, ScratchSimd128Reg);
++ return;
++ }
++ ExtractLaneToGPR(*this, lane, src, dest, 1, 8);
++ as_rldicl(dest, dest, 0, 56);
++}
++
++void MacroAssembler::unsignedExtractLaneInt16x8(uint32_t lane,
++ FloatRegister src,
++ Register dest) {
++ MOZ_ASSERT(lane < 8);
++ if (HasPOWER9()) {
++ as_vextractuh(ScratchSimd128Reg, src, 14 - 2 * lane);
++ as_mfvsrd(dest, ScratchSimd128Reg);
++ return;
++ }
++ ExtractLaneToGPR(*this, lane, src, dest, 2, 16);
++ as_rldicl(dest, dest, 0, 48);
++}
++
++void MacroAssembler::extractLaneFloat32x4(uint32_t lane, FloatRegister src,
++ FloatRegister dest) {
++ MOZ_ASSERT(lane < 4);
++ // BE word index = 3 - lane. xxextractuw extracts a word by BE byte offset.
++ // BE byte offset of BE word W = W*4. So offset = (3-lane)*4.
++ // xxextractuw puts the extracted word into bits[32:63] of dest (the low
++ // word of the scalar doubleword), then xscvspdpn converts SP→DP.
++ // xxspltw replicates a word into all 4 positions. The scalar SP value
++ // is then at bits[0:31] where xscvspdpn expects it.
++ as_xxspltw(dest, src, 3 - lane);
++ as_xscvspdpn(dest, dest);
++}
++
++void MacroAssembler::extractLaneFloat64x2(uint32_t lane, FloatRegister src,
++ FloatRegister dest) {
++ MOZ_ASSERT(lane < 2);
++ if (lane == 0) {
++ // Lane 0 = LE low dword = BE dword 1. Need to swap to scalar position.
++ as_xxpermdi(dest, src, src, 2);
++ } else {
++ // Lane 1 = LE high dword = BE dword 0 = scalar position.
++ if (src != dest) {
++ as_xxlor(dest, src, src);
++ }
++ }
++}
++
++// ===============================================================
++// Replace lane
++
++void MacroAssembler::replaceLaneInt8x16(unsigned lane, Register rhs,
++ FloatRegister lhsDest) {
++ MOZ_ASSERT(lane < 16);
++ if (HasPOWER10()) {
++ // 2 insns + 1 GPR scratch: load lane index, vinsbrx (right-indexed
++ // = LE-natural). vinsbrx masks RA & 0xF, so the immediate fits.
++ UseScratchRegisterScope temps(asMasm());
++ Register idx = temps.Acquire();
++ xs_li(idx, int16_t(lane));
++ as_vinsbrx(lhsDest, idx, rhs);
++ return;
++ }
++ if (HasPOWER9()) {
++ // 2 insns + 1 VSR scratch: stage rhs in BE 0..63 of a scratch VSR
++ // (low byte of rhs lands at BE byte 7), then vinsertb copies that
++ // BE byte 7 into lhsDest's BE byte (15 - lane) = wasm lane L.
++ ScratchSimd128Scope scratch(*this);
++ as_mtvsrd(scratch, rhs);
++ as_vinsertb(lhsDest, scratch, 15 - lane);
++ return;
++ }
++ {
++ // POWER8: extract dword, use rldimi to insert byte, write back.
++ // Only needs 1 GPR scratch.
++ UseScratchRegisterScope temps(asMasm());
++ ScratchSimd128Scope scratch128(*this);
++ Register tmp = temps.Acquire();
++ unsigned dword = lane / 8;
++ unsigned byteInDword = lane % 8;
++ if (dword == 1) {
++ as_mfvsrd(tmp, lhsDest);
++ } else {
++ as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++ as_mfvsrd(tmp, scratch128);
++ }
++ // rldimi RT,RS,SH,MB: insert rotated RS bits into RT at positions
++ // MB..63-SH. Insert rhs byte at bit offset 8*byteInDword from LSB:
++ // SH = 8*byteInDword, MB = 56 - 8*byteInDword
++ as_rldimi(tmp, rhs, 8 * byteInDword, 56 - 8 * byteInDword);
++ as_mtvsrd(scratch128, tmp);
++ // mtvsrd writes scratch128.dw0 from `tmp` and leaves scratch128.dw1
++ // undefined. Both xxpermdi forms below select scratch128.dw0 only:
++ // DM=0b01 → [scratch.dw0, lhsDest.dw1]
++ // DM=0b00 → [lhsDest.dw0, scratch.dw0]
++ // So the undefined dw1 is never read. INVARIANT: any future change
++ // to either DM literal MUST first zero scratch128.dw1 via xxlxor or
++ // adopt a different staging scheme; otherwise reads of dw1 produce
++ // POWER9-zero / POWER8-undefined garbage in the output.
++ if (dword == 1) {
++ as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++ } else {
++ as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++ }
++ }
++}
++
++void MacroAssembler::replaceLaneInt16x8(unsigned lane, Register rhs,
++ FloatRegister lhsDest) {
++ MOZ_ASSERT(lane < 8);
++ if (HasPOWER10()) {
++ // 2 insns + 1 GPR scratch: lane*2 → byte position, then vinshrx.
++ UseScratchRegisterScope temps(asMasm());
++ Register idx = temps.Acquire();
++ xs_li(idx, int16_t(lane * 2));
++ as_vinshrx(lhsDest, idx, rhs);
++ return;
++ }
++ if (HasPOWER9()) {
++ // 2 insns + 1 VSR scratch: stage rhs in BE 0..63 (low 16 of rhs
++ // lands at BE bytes 6..7), then vinserth copies those two bytes
++ // into lhsDest's BE bytes (14 - 2L)..(15 - 2L) = wasm lane L.
++ ScratchSimd128Scope scratch(*this);
++ as_mtvsrd(scratch, rhs);
++ as_vinserth(lhsDest, scratch, 14 - 2 * lane);
++ return;
++ }
++ {
++ // POWER8: extract dword, rldimi to insert halfword, write back.
++ // Same dw1-undef invariant as replaceLaneInt8x16 above.
++ UseScratchRegisterScope temps(asMasm());
++ ScratchSimd128Scope scratch128(*this);
++ Register tmp = temps.Acquire();
++ unsigned dword = lane / 4;
++ unsigned hwInDword = lane % 4;
++ if (dword == 1) {
++ as_mfvsrd(tmp, lhsDest);
++ } else {
++ as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++ as_mfvsrd(tmp, scratch128);
++ }
++ as_rldimi(tmp, rhs, 16 * hwInDword, 48 - 16 * hwInDword);
++ as_mtvsrd(scratch128, tmp);
++ if (dword == 1) {
++ as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++ } else {
++ as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++ }
++ }
++}
++
++void MacroAssembler::replaceLaneInt32x4(unsigned lane, Register rhs,
++ FloatRegister lhsDest) {
++ MOZ_ASSERT(lane < 4);
++ if (HasPOWER10()) {
++ // 1 insn, no scratch VSR. UIM is the BE byte offset.
++ as_vinsw(lhsDest, rhs, (3 - lane) * 4);
++ return;
++ }
++ if (HasPOWER9()) {
++ // POWER9: xxinsertw inserts word from bits[32:63] of XB at BE byte
++ // offset UIM in XT. mtvsrd puts GPR into bits[0:63]; low 32 bits
++ // land at bits[32:63]. BE byte offset of Wasm word lane = (3-lane)*4.
++ ScratchSimd128Scope scratch(*this);
++ as_mtvsrd(scratch, rhs);
++ as_xxinsertw(lhsDest, scratch, (3 - lane) * 4);
++ return;
++ }
++ // POWER8: extract dword, rldimi to insert word, write back.
++ // Modeled on replaceLaneInt16x8 below.
++ UseScratchRegisterScope temps(asMasm());
++ ScratchSimd128Scope scratch128(*this);
++ Register tmp = temps.Acquire();
++ unsigned dword = lane / 2; // 0 = lanes 0,1; 1 = lanes 2,3.
++ unsigned wordInDword = lane % 2; // 0 = low LE word; 1 = high LE word.
++ if (dword == 1) {
++ as_mfvsrd(tmp, lhsDest);
++ } else {
++ as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++ as_mfvsrd(tmp, scratch128);
++ }
++ as_rldimi(tmp, rhs, 32 * wordInDword, 32 - 32 * wordInDword);
++ as_mtvsrd(scratch128, tmp);
++ if (dword == 1) {
++ as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++ } else {
++ as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++ }
++}
++
++void MacroAssembler::replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
++ FloatRegister lhsDest) {
++ MOZ_ASSERT(lane < 4);
++ if (HasPOWER9()) {
++ ScratchSimd128Scope scratch(*this);
++ as_xscvdpspn(scratch, rhs);
++ as_xxinsertw(lhsDest, scratch, (3 - lane) * 4);
++ return;
++ }
++ // POWER8: convert double rhs to single (lands in BE bits 0..31 of FPR),
++ // extract bits to a GPR, then route through the integer insert path.
++ UseScratchRegisterScope temps(asMasm());
++ Register rhsBits = temps.Acquire();
++ {
++ ScratchSimd128Scope scratch(*this);
++ as_xscvdpspn(scratch, rhs);
++ as_mfvsrd(rhsBits, scratch); // single is in high 32 bits of GPR
++ x_srdi(rhsBits, rhsBits, 32); // single → low 32 bits
++ }
++ // Inline the int-insert sequence (can't call replaceLaneInt32x4 from
++ // here because we're already inside a UseScratchRegisterScope and
++ // need to acquire a separate tmp).
++ ScratchSimd128Scope scratch128(*this);
++ Register tmp = temps.Acquire();
++ unsigned dword = lane / 2;
++ unsigned wordInDword = lane % 2;
++ if (dword == 1) {
++ as_mfvsrd(tmp, lhsDest);
++ } else {
++ as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
++ as_mfvsrd(tmp, scratch128);
++ }
++ as_rldimi(tmp, rhsBits, 32 * wordInDword, 32 - 32 * wordInDword);
++ as_mtvsrd(scratch128, tmp);
++ if (dword == 1) {
++ as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
++ } else {
++ as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
++ }
++}
++
++void MacroAssembler::replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
++ FloatRegister lhsDest) {
++ MOZ_ASSERT(lane < 2);
++ // xxpermdi to place the scalar double into the correct lane.
++ if (lane == 0) {
++ // Replace LE low dword (= dw1). Keep lhsDest dw0 (lane 1).
++ // rhs scalar is in dw0. dm=0b00: [lhsDest.dw0, rhs.dw0]
++ as_xxpermdi(lhsDest, lhsDest, rhs, 0);
++ } else {
++ // Replace LE high dword (= dw0). Keep lhsDest dw1 (lane 0).
++ // rhs scalar is in dw0. dm=0b01: [rhs.dw0, lhsDest.dw1]
++ as_xxpermdi(lhsDest, rhs, lhsDest, 1);
++ }
++}
++
++void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
++ FloatRegister lhsDest) {
++ shuffleInt8x16(lanes, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister lhs,
++ FloatRegister rhs, FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ // PPC64 vperm uses BE byte indices: VRA[0]=MSB, VRA[15]=LSB, VRB[16..31].
++ // Convert Wasm LE lane indices to vperm control: lhs lane N = BE index
++ // (15-N), rhs lane N = BE index (31-N) = (47 - (N+16)).
++ int8_t ctrl[16];
++ for (unsigned i = 0; i < 16; i++) {
++ uint8_t src = lanes[i];
++ if (src < 16) {
++ ctrl[i] = 15 - src;
++ } else {
++ ctrl[i] = 47 - src;
++ }
++ }
++ loadConstantSimd128(SimdConstant::CreateX16(ctrl), scratch);
++ // vperm directly on Simd128 regs.
++ as_vperm(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31,
++ scratch.encoding() & 31);
++}
++
++void MacroAssembler::laneSelectSimd128(FloatRegister mask, FloatRegister lhs,
++ FloatRegister rhs, FloatRegister dest) {
++ // xxsel: XC=0→XA, XC=1→XB → XT = (XA & ~XC) | (XB & XC)
++ // laneSelect: dest = (lhs & mask) | (rhs & ~mask)
++ // Need XA=rhs, XB=lhs, XC=mask.
++ as_xxsel(dest, rhs, lhs, mask);
++}
++
++void MacroAssembler::interleaveHighInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // On LE, vmrghb(rhs, lhs) gives Wasm interleave_high.
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghb), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveHighInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghh), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveHighInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghw), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveHighInt64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // xxpermdi DM=0: [XA.dw0, XB.dw0] = merge high dwords.
++ // On LE: dw0 = high Wasm lane (lane 1).
++ as_xxpermdi(dest, rhs, lhs, 0);
++}
++
++void MacroAssembler::interleaveLowInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglb), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveLowInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglh), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveLowInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglw), rhs, lhs, dest);
++}
++
++void MacroAssembler::interleaveLowInt64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // xxpermdi DM=3: [XA.dw1, XB.dw1] = merge low dwords.
++ as_xxpermdi(dest, rhs, lhs, 3);
++}
++
++void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest,
++ uint32_t shift) {
++ // vsldoi(VRA, VRB, SH) extracts 16 bytes starting at byte SH of the
++ // big-endian concatenation VRA||VRB. Endianness mapping for the Wasm
++ // `v128.shuffle` right-shift-concat semantic:
++ // Wasm: result[i] = (i + shift < 16) ? rhs[i + shift]
++ // : lhs[i + shift - 16]
++ // PPC LE: vsldoi(rhs, lhs, shift) produces exactly that — the LE byte
++ // layout reverses from BE, so passing (rhs, lhs, shift) here is the LE
++ // equivalent of (lhs, rhs, 16 - shift) on BE.
++ MOZ_ASSERT(shift < 16);
++ if (shift == 0) {
++ moveSimd128(rhs, dest);
++ return;
++ }
++ // vsldoi VRT,VRA,VRB,SH: result[i] = (VRA||VRB)[SH+i]
++ // Emit vsldoi directly on Simd128 regs (VRA = lhs = high part, VRB =
++ // rhs = low part). The VMX emitter masks `& 31` internally to extract
++ // the 5-bit VR field from the Simd128 encoding.
++ as_vsldoi(dest, lhs, rhs, shift);
++}
++
++void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ MOZ_ASSERT(count.value < 16);
++ if (count.value == 0) {
++ moveSimd128(src, dest);
++ return;
++ }
++ // vslo shifts left by bytes (count in bits 121-124 of VRB, i.e. byte 15 bits
++ // 1-4). vsl shifts left by bits (count in bits 125-127 of VRB, i.e. byte 15
++ // bits 5-7). For byte shift: splatX4(count*8, scratch), then vslo.
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, Imm32(count.value * 8), scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslo), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ MOZ_ASSERT(count.value < 16);
++ if (count.value == 0) {
++ moveSimd128(src, dest);
++ return;
++ }
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, Imm32(count.value * 8), scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsro), src, scratch, dest);
++}
++
++void MacroAssembler::zeroExtend8x16To16x8(FloatRegister src,
++ FloatRegister dest) {
++ // Unsigned widen low: interleave low bytes with zero bytes.
++ // On LE, vmrglb(zero, src) interleaves the low 8 bytes of src with zeros.
++ // Use ScratchSimd128Reg as the zero. Order matters: read src into the
++ // merge BEFORE writing dest (which might alias src). vmrglb reads
++ // vra+vrb, writes vrt — single-cycle issue.
++ ScratchSimd128Scope zero(*this);
++ as_xxlxor(zero, zero, zero);
++ as_vmrglb(dest.encoding() & 31, zero.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::zeroExtend8x16To32x4(FloatRegister src,
++ FloatRegister dest) {
++ zeroExtend8x16To16x8(src, dest);
++ zeroExtend16x8To32x4(dest, dest);
++}
++
++void MacroAssembler::zeroExtend8x16To64x2(FloatRegister src,
++ FloatRegister dest) {
++ zeroExtend8x16To32x4(src, dest);
++ zeroExtend32x4To64x2(dest, dest);
++}
++
++void MacroAssembler::zeroExtend16x8To32x4(FloatRegister src,
++ FloatRegister dest) {
++ // Unsigned widen low: interleave low halfwords with zero halfwords.
++ ScratchSimd128Scope zero(*this);
++ as_xxlxor(zero, zero, zero);
++ as_vmrglh(dest.encoding() & 31, zero.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::zeroExtend16x8To64x2(FloatRegister src,
++ FloatRegister dest) {
++ zeroExtend16x8To32x4(src, dest);
++ zeroExtend32x4To64x2(dest, dest);
++}
++
++void MacroAssembler::zeroExtend32x4To64x2(FloatRegister src,
++ FloatRegister dest) {
++ // Unsigned widen low: interleave low words with zero words.
++ ScratchSimd128Scope zero(*this);
++ as_xxlxor(zero, zero, zero);
++ as_vmrglw(dest.encoding() & 31, zero.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) {
++ const uint8_t lanes[] = {14, 15, 12, 13, 10, 11, 8, 9,
++ 6, 7, 4, 5, 2, 3, 0, 1};
++ shuffleInt8x16(lanes, src, src, dest);
++}
++
++void MacroAssembler::reverseInt32x4(FloatRegister src, FloatRegister dest) {
++ const uint8_t lanes[] = {12, 13, 14, 15, 8, 9, 10, 11,
++ 4, 5, 6, 7, 0, 1, 2, 3};
++ shuffleInt8x16(lanes, src, src, dest);
++}
++
++void MacroAssembler::reverseInt64x2(FloatRegister src, FloatRegister dest) {
++ as_xxpermdi(dest, src, src, 2);
++}
++
++void MacroAssembler::swizzleInt8x16Relaxed(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ swizzleInt8x16(lhs, rhs, dest);
++}
++
++// extMul{Low,High}Int{8x16,16x8} use POWER8+ widening multiplies
++// (vmul{e,o}{s,u}{b,h}) plus a halfword/word merge to map BE-indexed
++// even/odd products into Wasm lane order on PPC64 LE.
++//
++// Lane mapping:
++// For Low (Wasm lanes from LE bytes/HW 0..N/2-1 = BE 15..N/2):
++// vmrgl{h,w}(even_products, odd_products) places the right products
++// at BE result indices, which on LE map to Wasm lanes 0..N/2-1.
++// For High (Wasm lanes from LE indices N/2..N-1 = BE N/2-1..0):
++// vmrgh{h,w} takes the upper-half BE indices instead.
++//
++// Aliasing safety: vmul* reads both operands before writing, so
++// `dest = vmulo* lhs, rhs` is safe even when dest aliases lhs/rhs.
++// We use one scratch for the even-product half because vmrgl{h,w}
++// reads dest after the odd multiply.
++
++void MacroAssembler::extMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmulesb(s, l, r);
++ as_vmulosb(d, l, r);
++ as_vmrglh(d, s, d);
++}
++
++void MacroAssembler::extMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmulesb(s, l, r);
++ as_vmulosb(d, l, r);
++ as_vmrghh(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulLowInt8x16(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmuleub(s, l, r);
++ as_vmuloub(d, l, r);
++ as_vmrglh(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulHighInt8x16(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmuleub(s, l, r);
++ as_vmuloub(d, l, r);
++ as_vmrghh(d, s, d);
++}
++
++void MacroAssembler::extMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmulesh(s, l, r);
++ as_vmulosh(d, l, r);
++ as_vmrglw(d, s, d);
++}
++
++void MacroAssembler::extMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmulesh(s, l, r);
++ as_vmulosh(d, l, r);
++ as_vmrghw(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulLowInt16x8(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmuleuh(s, l, r);
++ as_vmulouh(d, l, r);
++ as_vmrglw(d, s, d);
++}
++
++void MacroAssembler::unsignedExtMulHighInt16x8(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31, r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31, s = scratch.encoding() & 31;
++ as_vmuleuh(s, l, r);
++ as_vmulouh(d, l, r);
++ as_vmrghw(d, s, d);
++}
++
++// ExtMul{Low,High}Int32x4 use vmul{e,o}{s,u}w (POWER8+) plus xxpermdi
++// to combine the two i64 partial products into Wasm lane order on PPC64
++// LE. xxpermdi accepts the full 6-bit VSR encoding so it works directly
++// on Simd128 regs (encoding 32-63) without any VR staging.
++//
++// Aliasing safe: both vmul* reads complete before the second one writes
++// dest, and xxpermdi reads both inputs before writing.
++
++static void EmitExtMulInt32x4(
++ MacroAssembler& masm, FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, void (*mulEven)(Assembler&, uint8_t, uint8_t, uint8_t),
++ void (*mulOdd)(Assembler&, uint8_t, uint8_t, uint8_t), uint8_t dm) {
++ ScratchSimd128Scope scratch(masm);
++ uint8_t l = lhs.encoding() & 31;
++ uint8_t r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31;
++ uint8_t s = scratch.encoding() & 31;
++ mulEven(static_cast<Assembler&>(masm), s, l, r);
++ mulOdd(static_cast<Assembler&>(masm), d, l, r);
++ masm.as_xxpermdi(dest, scratch, dest, dm);
++}
++
++void MacroAssembler::extMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitExtMulInt32x4(
++ *this, lhs, rhs, dest,
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmulesw(t, x, y);
++ },
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmulosw(t, x, y);
++ },
++ 3);
++}
++
++void MacroAssembler::extMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitExtMulInt32x4(
++ *this, lhs, rhs, dest,
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmulesw(t, x, y);
++ },
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmulosw(t, x, y);
++ },
++ 0);
++}
++
++void MacroAssembler::unsignedExtMulLowInt32x4(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ EmitExtMulInt32x4(
++ *this, lhs, rhs, dest,
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmuleuw(t, x, y);
++ },
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmulouw(t, x, y);
++ },
++ 3);
++}
++
++void MacroAssembler::unsignedExtMulHighInt32x4(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ EmitExtMulInt32x4(
++ *this, lhs, rhs, dest,
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmuleuw(t, x, y);
++ },
++ [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
++ a.as_vmulouw(t, x, y);
++ },
++ 0);
++}
++
++void MacroAssembler::q15MulrSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // Q15 multiply-round-saturate: vmhraddshs(a, b, zero) computes
++ // saturate((a[i]*b[i] + 0x4000) >> 15) for each halfword.
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ EmitVmxTernary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc) {
++ a.as_vmhraddshs(vrt, vra, vrb, vrc);
++ },
++ lhs, rhs, scratch, dest);
++}
++
++// neg = 0 - src. Use ScratchSimd128Reg (= VR0, non-allocatable) as the
++// zero source so the register allocator sees no clobbered VRs.
++// 2 insns: xxlxor scratch + vsubuXm dest, scratch, src. vneg{b,h}
++// doesn't exist in any POWER ISA, hence the subtract.
++void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ as_vsububm(dest.encoding() & 31, scratch.encoding() & 31,
++ src.encoding() & 31);
++}
++
++void MacroAssembler::negInt16x8(FloatRegister src, FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ as_vsubuhm(dest.encoding() & 31, scratch.encoding() & 31,
++ src.encoding() & 31);
++}
++
++void MacroAssembler::negInt32x4(FloatRegister src, FloatRegister dest) {
++ if (HasPOWER9()) {
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vnegw(vrt, vrb); },
++ src, dest);
++ return;
++ }
++ // POWER8 fallback: 0 - src via ScratchSimd128Reg (VR0).
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ as_vsubuwm(dest.encoding() & 31, scratch.encoding() & 31,
++ src.encoding() & 31);
++}
++
++void MacroAssembler::negInt64x2(FloatRegister src, FloatRegister dest) {
++ if (HasPOWER9()) {
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vnegd(vrt, vrb); },
++ src, dest);
++ return;
++ }
++ // POWER8 fallback: 0 - src via ScratchSimd128Reg (VR0).
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ as_vsubudm(dest.encoding() & 31, scratch.encoding() & 31,
++ src.encoding() & 31);
++}
++#undef DEF_NEG_INTNxM_VSUB
++
++void MacroAssembler::unsignedAddSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddubs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedAddSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduhs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedSubSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsububs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedSubSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuhs), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMinInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminub), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMinInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminuh), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMinInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminuw), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMaxInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxub), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMaxInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxuh), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedMaxInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxuw), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedAverageInt8x16(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vavgub), lhs, rhs, dest);
++}
++
++void MacroAssembler::unsignedAverageInt16x8(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vavguh), lhs, rhs, dest);
++}
++
++// abs(x) = max(x, -x) per signed lane. No vabs{b,h,w,d} exists in any ISA.
++// vneg{w,d} exists only on POWER9.
++// We use ScratchSimd128Reg as a temp for -src. Order matters: compute
++// -src into temp first (reads src), then max(src, temp) into dest (reads
++// src + temp, writes dest). Safe even when dest == src because src is
++// read before dest is written by vmaxsX.
++
++void MacroAssembler::absInt8x16(FloatRegister src, FloatRegister dest) {
++ ScratchSimd128Scope tmp(*this);
++ as_xxlxor(tmp, tmp, tmp); // tmp = 0
++ as_vsububm(tmp.encoding() & 31, tmp.encoding() & 31,
++ src.encoding() & 31); // tmp = -src
++ as_vmaxsb(dest.encoding() & 31, src.encoding() & 31,
++ tmp.encoding() & 31); // dest = max(src, -src)
++}
++
++void MacroAssembler::absInt16x8(FloatRegister src, FloatRegister dest) {
++ ScratchSimd128Scope tmp(*this);
++ as_xxlxor(tmp, tmp, tmp);
++ as_vsubuhm(tmp.encoding() & 31, tmp.encoding() & 31, src.encoding() & 31);
++ as_vmaxsh(dest.encoding() & 31, src.encoding() & 31, tmp.encoding() & 31);
++}
++
++void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {
++ ScratchSimd128Scope tmp(*this);
++ if (HasPOWER9()) {
++ as_vnegw(tmp.encoding() & 31, src.encoding() & 31); // tmp = -src
++ } else {
++ as_xxlxor(tmp, tmp, tmp);
++ as_vsubuwm(tmp.encoding() & 31, tmp.encoding() & 31, src.encoding() & 31);
++ }
++ as_vmaxsw(dest.encoding() & 31, src.encoding() & 31, tmp.encoding() & 31);
++}
++
++void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
++ ScratchSimd128Scope tmp(*this);
++ if (HasPOWER9()) {
++ as_vnegd(tmp.encoding() & 31, src.encoding() & 31); // tmp = -src
++ } else {
++ as_xxlxor(tmp, tmp, tmp);
++ as_vsubudm(tmp.encoding() & 31, tmp.encoding() & 31, src.encoding() & 31);
++ }
++ as_vmaxsd(dest.encoding() & 31, src.encoding() & 31, tmp.encoding() & 31);
++}
++
++void MacroAssembler::leftShiftInt8x16(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm8(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslb), src, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm16(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslh), src, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslw), src, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsld), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt8x16(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm8(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrab), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm8(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrb), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm16(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrah), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm16(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrh), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsraw), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrw), src, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt64x2(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrad), src, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ SplatImm32(*this, count, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrd), src, scratch, dest);
++}
++
++void MacroAssembler::bitwiseAndSimd128(FloatRegister rhs,
++ FloatRegister lhsDest) {
++ as_xxland(lhsDest, lhsDest, rhs);
++}
++
++void MacroAssembler::bitwiseAndSimd128(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xxland(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseOrSimd128(FloatRegister rhs,
++ FloatRegister lhsDest) {
++ as_xxlor(lhsDest, lhsDest, rhs);
++}
++
++void MacroAssembler::bitwiseOrSimd128(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xxlor(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseXorSimd128(FloatRegister rhs,
++ FloatRegister lhsDest) {
++ as_xxlxor(lhsDest, lhsDest, rhs);
++}
++
++void MacroAssembler::bitwiseXorSimd128(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xxlxor(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseNotSimd128(FloatRegister src, FloatRegister dest) {
++ as_xxlnor(dest, src, src);
++}
++
++void MacroAssembler::bitwiseNotAndSimd128(FloatRegister rhs,
++ FloatRegister lhsDest) {
++ // notand(lhs, rhs) = ~lhs & rhs = xxlandc(rhs, lhs)
++ as_xxlandc(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
++ // vcmpequd. (POWER8+) against zero sets CR6:
++ // - CR6.LT (BE bit 24) = 1 iff the per-lane result is all-1s, i.e.
++ // every doubleword of src equals zero (= src is all-zero).
++ // - CR6.EQ (BE bit 26) = 1 iff no lane was equal (= any nonzero).
++ // any-true = !all-zero = !CR6.LT.
++ ScratchSimd128Scope scratch(*this);
++ uint8_t s = scratch.encoding() & 31;
++ as_xxlxor(scratch, scratch, scratch);
++ as_vcmpequd_rc(s, src.encoding() & 31, s);
++ if (HasPOWER10()) {
++ // setbcr materialises (CR[BI] == 0) ? 1 : 0 directly into dest.
++ // dest = (CR6.LT == 0) = "not all-zero" = any-true.
++ as_setbcr(dest, Assembler::LessThan, cr6);
++ return;
++ }
++ as_mfocrf(dest, cr6);
++ // CR6.LT is at BE bit 24 of the GPR. rlwinm sh=25 rotates left 25:
++ // bit (24 - 25) mod 32 = 31 (LSB). Mask 31..31 keeps just bit 31.
++ as_rlwinm(dest, dest, 25, 31, 31);
++ as_xori(dest, dest, 1);
++}
++
++// vcmpequX. against zero sets CR6: LT = all input lanes were zero,
++// EQ = no input lane was zero. The latter is exactly "all-true".
++// mfocrf places CR6 at bits 24-27 of the low 32-bit half (LT=24, EQ=26).
++// rlwinm rd,rd,27,31,31 extracts bit 26 (CR6.EQ) right-justified.
++template <typename VmxCmpRcFn>
++static void EmitAllTrueInt(MacroAssembler& masm, FloatRegister src,
++ Register dest, VmxCmpRcFn vmxCmpRc) {
++ ScratchSimd128Scope scratch(masm);
++ ZeroSimd128(masm, scratch);
++ uint8_t s = scratch.encoding() & 31;
++ vmxCmpRc(static_cast<Assembler&>(masm), s, src.encoding() & 31, s);
++ if (HasPOWER10()) {
++ // setbc materialises CR6.EQ directly into dest (1 insn vs the 2-insn
++ // mfocrf + rlwinm extract). Already wired in ma_cmp_set.
++ masm.as_setbc(dest, Assembler::Equal, cr6);
++ return;
++ }
++ masm.as_mfocrf(dest, cr6);
++ masm.as_rlwinm(dest, dest, 27, 31, 31);
++}
++
++void MacroAssembler::allTrueInt8x16(FloatRegister src, Register dest) {
++ EmitAllTrueInt(*this, src, dest,
++ [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++ a.as_vcmpequb_rc(t, r, b);
++ });
++}
++
++void MacroAssembler::allTrueInt16x8(FloatRegister src, Register dest) {
++ EmitAllTrueInt(*this, src, dest,
++ [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++ a.as_vcmpequh_rc(t, r, b);
++ });
++}
++
++void MacroAssembler::allTrueInt32x4(FloatRegister src, Register dest) {
++ EmitAllTrueInt(*this, src, dest,
++ [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++ a.as_vcmpequw_rc(t, r, b);
++ });
++}
++
++void MacroAssembler::allTrueInt64x2(FloatRegister src, Register dest) {
++ EmitAllTrueInt(*this, src, dest,
++ [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
++ a.as_vcmpequd_rc(t, r, b);
++ });
++}
++
++void MacroAssembler::compareInt8x16(Assembler::Condition cond,
++ FloatRegister rhs, FloatRegister lhsDest) {
++ compareInt8x16(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt8x16(Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ if (cond == Assembler::NotEqual && HasPOWER9()) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpneb), lhs, rhs, dest);
++ return;
++ }
++ EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequb),
++ VMX_BINARY_WRAPPER(vcmpgtsb), VMX_BINARY_WRAPPER(vcmpgtub));
++}
++
++void MacroAssembler::compareInt16x8(Assembler::Condition cond,
++ FloatRegister rhs, FloatRegister lhsDest) {
++ compareInt16x8(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt16x8(Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ if (cond == Assembler::NotEqual && HasPOWER9()) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpneh), lhs, rhs, dest);
++ return;
++ }
++ EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequh),
++ VMX_BINARY_WRAPPER(vcmpgtsh), VMX_BINARY_WRAPPER(vcmpgtuh));
++}
++
++void MacroAssembler::compareInt32x4(Assembler::Condition cond,
++ FloatRegister rhs, FloatRegister lhsDest) {
++ compareInt32x4(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt32x4(Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ if (cond == Assembler::NotEqual && HasPOWER9()) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpnew), lhs, rhs, dest);
++ return;
++ }
++ EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequw),
++ VMX_BINARY_WRAPPER(vcmpgtsw), VMX_BINARY_WRAPPER(vcmpgtuw));
++}
++
++void MacroAssembler::compareFloat32x4(Assembler::Condition cond,
++ FloatRegister rhs,
++ FloatRegister lhsDest) {
++ compareFloat32x4(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareFloat32x4(Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ switch (cond) {
++ case Assembler::Equal:
++ as_xvcmpeqsp(dest, lhs, rhs);
++ break;
++ case Assembler::NotEqual:
++ as_xvcmpeqsp(dest, lhs, rhs);
++ bitwiseNotSimd128(dest, dest);
++ break;
++ case Assembler::GreaterThan:
++ as_xvcmpgtsp(dest, lhs, rhs);
++ break;
++ case Assembler::GreaterThanOrEqual:
++ as_xvcmpgesp(dest, lhs, rhs);
++ break;
++ case Assembler::LessThan:
++ as_xvcmpgtsp(dest, rhs, lhs);
++ break;
++ case Assembler::LessThanOrEqual:
++ as_xvcmpgesp(dest, rhs, lhs);
++ break;
++ default:
++ MOZ_CRASH("Unexpected SIMD float condition");
++ }
++}
++
++void MacroAssembler::compareFloat64x2(Assembler::Condition cond,
++ FloatRegister rhs,
++ FloatRegister lhsDest) {
++ compareFloat64x2(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareFloat64x2(Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ switch (cond) {
++ case Assembler::Equal:
++ as_xvcmpeqdp(dest, lhs, rhs);
++ break;
++ case Assembler::NotEqual:
++ as_xvcmpeqdp(dest, lhs, rhs);
++ bitwiseNotSimd128(dest, dest);
++ break;
++ case Assembler::GreaterThan:
++ as_xvcmpgtdp(dest, lhs, rhs);
++ break;
++ case Assembler::GreaterThanOrEqual:
++ as_xvcmpgedp(dest, lhs, rhs);
++ break;
++ case Assembler::LessThan:
++ as_xvcmpgtdp(dest, rhs, lhs);
++ break;
++ case Assembler::LessThanOrEqual:
++ as_xvcmpgedp(dest, rhs, lhs);
++ break;
++ default:
++ MOZ_CRASH("Unexpected SIMD float condition");
++ }
++}
++
++void MacroAssembler::negFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvnegsp(dest, src);
++}
++
++void MacroAssembler::negFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvnegdp(dest, src);
++}
++
++void MacroAssembler::absFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvabssp(dest, src);
++}
++
++void MacroAssembler::absFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvabsdp(dest, src);
++}
++
++// Per spec:
++// result[k] = (s|u)ext_widen(src[2k]) + (s|u)ext_widen(src[2k+1])
++// POWER lacks pairwise multiply-add. Emulate via vmulX{e,o}X(src, splat(1))
++// + vadd. Both vmuls need `src` AND `splat(1)` available simultaneously.
++//
++// Available SIMD slots without involving Lowering:
++// - ScratchSimd128Reg (VR0, non-allocatable)
++// - dest, src
++// That's 3 regs when dest != src — enough for {src, splat, intermediate}.
++// When dest == src we stash src and the even product to the 288-byte ELFv2
++// red zone and rebuild splat(1).
++//
++// (Earlier implementations of these helpers routed through hardcoded
++// VR1/VR2/VR3 via xxlor_vsr — faster but stomped allocator-managed VRs
++// and silently corrupted any live wasm v128 the allocator had placed
++// there. ScratchSimd128Reg + red-zone stash is the safe contract.)
++// Always-safe pattern: stash src to red zone so dest can be freely overwritten,
++// stash even to red zone after first vmul so we can rebuild splat(1) for the
++// second vmul. The splat-of-1 is now `vspltis{b,h}` (5-bit signed immediate
++// splat) — 1 insn vs the 3-insn movePtr+mtvsrd+vsplt sequence the previous
++// path used.
++// Pattern: stash src to red zone slot 0 so dest can be freely overwritten;
++// vmul-even (signed/unsigned) of src with splat(1) produces sign/zero-extended
++// even-lane products into dest; stash that to slot 1 and rebuild scratch=src
++// (slot 0) and dest=splat(1); vmul-odd produces the odd products; restore
++// even from slot 1 and pairwise-add.
++void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t s = scratch.encoding() & 31;
++ uint8_t srcEnc = src.encoding() & 31;
++ uint8_t destEnc = dest.encoding() & 31;
++ RedZoneStashSimd128(*this, src, 0);
++ as_vspltisb(s, 1);
++ as_vmulesb(destEnc, srcEnc, s);
++ RedZoneStashSimd128(*this, dest, 1);
++ RedZoneRestoreSimd128(*this, 0, scratch);
++ as_vspltisb(destEnc, 1);
++ as_vmulosb(destEnc, s, destEnc);
++ RedZoneRestoreSimd128(*this, 1, scratch);
++ as_vadduhm(destEnc, destEnc, s);
++}
++
++void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ uint8_t s = scratch.encoding() & 31;
++ uint8_t srcEnc = src.encoding() & 31;
++ uint8_t destEnc = dest.encoding() & 31;
++ RedZoneStashSimd128(*this, src, 0);
++ as_vspltisb(s, 1);
++ as_vmuleub(destEnc, srcEnc, s);
++ RedZoneStashSimd128(*this, dest, 1);
++ RedZoneRestoreSimd128(*this, 0, scratch);
++ as_vspltisb(destEnc, 1);
++ as_vmuloub(destEnc, s, destEnc);
++ RedZoneRestoreSimd128(*this, 1, scratch);
++ as_vadduhm(destEnc, destEnc, s);
++}
++
++// vmsumshm/vmsumuhm collapse the i16x8 → i32x4 pairwise-add into a single
++// multiply-sum: VT.i32[k] = VRA.i16[2k]*VRB.i16[2k] +
++// VRA.i16[2k+1]*VRB.i16[2k+1]
++// + VRC.i32[k]. With VRB = splat(1) and VRC = 0 this is exactly the wasm
++// i32x4.extadd_pairwise_i16x8_{s,u} contract. 3 insns when dest != src;
++// LWasmUnarySimd128 uses useRegisterAtStart so dest may alias src — in that
++// case we put splat(1) into scratch (preserving src in dest) and use a
++// red-zone slot for the zero VRC operand.
++void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ if (dest != src) {
++ as_xxlxor(scratch, scratch, scratch); // scratch = 0 (VRC addend)
++ as_vspltish(dest.encoding() & 31, 1); // dest = splat(1) (VRB multiplier)
++ as_vmsumshm(dest.encoding() & 31, src.encoding() & 31, dest.encoding() & 31,
++ scratch.encoding() & 31);
++ return;
++ }
++ // dest == src: load splat(1) into scratch instead, stash zero to the red
++ // zone, restore zero into scratch after the splat is consumed... actually
++ // simpler: use vmule/vmulo + vadd trio with red zone. Same shape as the
++ // pre-vmsumshm fallback for i8x16.
++ uint8_t s = scratch.encoding() & 31;
++ uint8_t srcEnc = src.encoding() & 31;
++ uint8_t destEnc = dest.encoding() & 31;
++ RedZoneStashSimd128(*this, src, 0);
++ as_vspltish(s, 1);
++ as_vmulesh(destEnc, srcEnc, s);
++ RedZoneStashSimd128(*this, dest, 1);
++ RedZoneRestoreSimd128(*this, 0, scratch);
++ as_vspltish(destEnc, 1);
++ as_vmulosh(destEnc, s, destEnc);
++ RedZoneRestoreSimd128(*this, 1, scratch);
++ as_vadduwm(destEnc, destEnc, s);
++}
++
++void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ if (dest != src) {
++ as_xxlxor(scratch, scratch, scratch);
++ as_vspltish(dest.encoding() & 31, 1);
++ as_vmsumuhm(dest.encoding() & 31, src.encoding() & 31, dest.encoding() & 31,
++ scratch.encoding() & 31);
++ return;
++ }
++ uint8_t s = scratch.encoding() & 31;
++ uint8_t srcEnc = src.encoding() & 31;
++ uint8_t destEnc = dest.encoding() & 31;
++ RedZoneStashSimd128(*this, src, 0);
++ as_vspltish(s, 1);
++ as_vmuleuh(destEnc, srcEnc, s);
++ RedZoneStashSimd128(*this, dest, 1);
++ RedZoneRestoreSimd128(*this, 0, scratch);
++ as_vspltish(destEnc, 1);
++ as_vmulouh(destEnc, s, destEnc);
++ RedZoneRestoreSimd128(*this, 1, scratch);
++ as_vadduwm(destEnc, destEnc, s);
++}
++
++void MacroAssembler::sqrtFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvsqrtsp(dest, src);
++}
++
++void MacroAssembler::sqrtFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvsqrtdp(dest, src);
++}
++
++void MacroAssembler::convertInt32x4ToFloat32x4(FloatRegister src,
++ FloatRegister dest) {
++ as_xvcvsxwsp(dest, src);
++}
++
++void MacroAssembler::unsignedConvertInt32x4ToFloat32x4(FloatRegister src,
++ FloatRegister dest) {
++ as_xvcvuxwsp(dest, src);
++}
++
++// i32x4 (low 2 lanes) → f64x2. Wasm `f64x2.convert_low_i32x4_{s,u}`.
++// xvcv{s,u}xwdp converts BE word 0 and BE word 2 of source to doubles in
++// BE dwords 0 and 1. vmrglw places src.word_BE[2,3] at the read positions,
++// matching the f32→f64 promote shape:
++// vmrglw scratch, src, src ; BE words 2,3 of src → BE words 0,2 of
++// scratch xvcv*xwdp dest, scratch ; convert both, place in BE dwords
++// 0,1
++// Output BE dwords land as [convert(input lane 1), convert(input lane 0)],
++// which on PPC64LE storage IS the wasm output layout.
++//
++// 2 insns each, single ScratchSimd128 scope, no GPR or FPR scratch.
++// All ops POWER7+. dest==src aliasing safe (vmrglw consumes src into
++// scratch before dest is written).
++void MacroAssembler::convertInt32x4ToFloat64x2(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
++ as_xvcvsxwdp(dest, scratch);
++}
++
++void MacroAssembler::unsignedConvertInt32x4ToFloat64x2(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
++ as_xvcvuxwdp(dest, scratch);
++}
++
++void MacroAssembler::truncSatFloat32x4ToInt32x4(FloatRegister src,
++ FloatRegister dest) {
++ // xvcvspsxws gives INT32_MIN for NaN, but Wasm requires 0.
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpeqsp(scratch, src, src); // ~0 for non-NaN, 0 for NaN
++ as_xvcvspsxws(dest, src);
++ as_xxland(dest, dest, scratch); // zero NaN lanes
++}
++
++// Pack the two "interesting" 32-bit results that xvcv*xws / xvcvdpsp leaves
++// at scratch.word_BE[0] (= A) and scratch.word_BE[2] (= B) into a zeroed dest
++// as dest.word_BE = [0, 0, A, B]. This is the layout wasm requires for
++// f64x2 → {i32x4 trunc_sat, f32x4 demote}. Writes dest, consumes scratch.
++//
++// POWER9 path (4 insns) uses xxinsertw/xxextractuw. POWER8 path (7 insns)
++// goes via two GPR round-trips: extract A and B with mfvsrd, splice them
++// into a single dword with rldimi, mtvsrd back into a SIMD reg, and
++// xxpermdi the result into dest.dw1 while keeping dest.dw0 zero.
++static inline void PackTwoWordsToLowHalf(MacroAssembler& masm,
++ FloatRegister scratch,
++ FloatRegister dest) {
++ if (HasPOWER9()) {
++ masm.as_xxinsertw(dest, scratch,
++ 8); // dest.word_BE[2] ← scratch.word_BE[1] (= A)
++ masm.as_xxextractuw(scratch, scratch,
++ 8); // scratch.word_BE[1] ← scratch.word_BE[2] (= B)
++ masm.as_xxinsertw(dest, scratch,
++ 12); // dest.word_BE[3] ← scratch.word_BE[1] (= B)
++ return;
++ }
++ // POWER8: xxinsertw/xxextractuw are ISA 3.0. Take a GPR detour instead.
++ // scratch.dw_BE[0] = (A << 32) | A, scratch.dw_BE[1] = (B << 32) | B.
++ UseScratchRegisterScope temps(masm);
++ Register tmpA = temps.Acquire();
++ Register tmpB = temps.Acquire();
++ masm.as_mfvsrd(tmpA, scratch); // tmpA = (A << 32) | A
++ masm.as_xxpermdi(scratch, scratch, scratch,
++ 2); // swap dwords: now dw0 = (B<<32)|B
++ masm.as_mfvsrd(tmpB, scratch); // tmpB = (B << 32) | B
++ masm.x_srdi(tmpA, tmpA, 32); // tmpA = 0x00000000_AAAAAAAA
++ masm.as_rldimi(tmpB, tmpA, 32,
++ 0); // tmpB[0..31] = A; tmpB[32..63] = B (kept)
++ masm.as_mtvsrd(scratch, tmpB); // scratch.dw_BE[0] = (A << 32) | B; dw1 = 0
++ masm.as_xxpermdi(dest, dest, scratch,
++ 0); // dest = {dest.dw0=0, scratch.dw0} = [0, 0, A, B]
++}
++
++// fctiwz / fcmpu / fctiduz are X-form scalar FP instructions that only
++// encode 5-bit FRT/FRB fields, so emitting them on a Simd128 reg
++// (encoding 32+) would corrupt the opcode. Bridge through
++// ScratchDoubleReg (FPR f0) for the conversion. Extract both lanes' GPR
++// results before writing dest so that dest == src is safe.
++//
++// Avoid replaceLaneInt32x4 on the tail: on POWER8 it needs an extra
++// GPR scratch, but r11 and r12 are already held as a/b here. Pack both
++// int32s into `a` with rldimi, transfer via mtvsrd, then xxpermdi the
++// DWs into the low half so wasm lane 0 (BE W3) holds a, lane 1 (W2) b.
++void MacroAssembler::truncSatFloat64x2ToInt32x4(FloatRegister src,
++ FloatRegister dest,
++ FloatRegister temp) {
++ // Wasm `i32x4.trunc_sat_f64x2_s_zero`. xvcvdpsxws saturates to INT32_MIN
++ // on overflow/NaN (per ISA); wasm requires NaN → 0, so a per-dword NaN
++ // mask via xvcmpeqdp clamps NaN lanes to 0 before laying out the result.
++ // Output BE word positions need wasm lane order: lane 1 → BE word 2,
++ // lane 0 → BE word 3. xvcvdpsxws lands its results at BE words 0 and 2
++ // (with replication into 1/3); PackTwoWordsToLowHalf moves them into
++ // the right positions while zeroing the rest.
++ // dest==src safe: src is consumed by xvcvdpsxws and xvcmpeqdp before
++ // dest is zeroed.
++ ScratchSimd128Scope scratch(*this);
++ as_xvcvdpsxws(scratch, src);
++ as_xvcmpeqdp(dest, src,
++ src); // NaN-mask: 0xFF...F per dword for non-NaN, 0 for NaN
++ as_xxland(scratch, scratch, dest);
++ as_xxlxor(dest, dest, dest);
++ PackTwoWordsToLowHalf(*this, scratch, dest);
++}
++
++void MacroAssembler::unsignedTruncSatFloat64x2ToInt32x4(FloatRegister src,
++ FloatRegister dest,
++ FloatRegister temp) {
++ // Wasm `i32x4.trunc_sat_f64x2_u_zero`. xvcvdpuxws semantics already
++ // match the wasm spec without any masking: NaN → 0, negative → 0,
++ // positive overflow → UINT32_MAX. So no NaN mask needed; just position
++ // the saturated results into BE words 2,3 with zeros at words 0,1.
++ // dest==src safe: src consumed by xvcvdpuxws before dest is zeroed.
++ ScratchSimd128Scope scratch(*this);
++ as_xvcvdpuxws(scratch, src);
++ as_xxlxor(dest, dest, dest);
++ PackTwoWordsToLowHalf(*this, scratch, dest);
++}
++
++void MacroAssembler::truncFloat32x4ToInt32x4Relaxed(FloatRegister src,
++ FloatRegister dest) {
++ truncSatFloat32x4ToInt32x4(src, dest);
++}
++
++void MacroAssembler::unsignedTruncFloat32x4ToInt32x4Relaxed(
++ FloatRegister src, FloatRegister dest) {
++ unsignedTruncSatFloat32x4ToInt32x4(src, dest);
++}
++
++void MacroAssembler::truncFloat64x2ToInt32x4Relaxed(FloatRegister src,
++ FloatRegister dest) {
++ truncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++}
++
++void MacroAssembler::unsignedTruncFloat64x2ToInt32x4Relaxed(
++ FloatRegister src, FloatRegister dest) {
++ unsignedTruncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
++}
++
++// f64x2 → f32x4 (low 2 lanes; high lanes zero). Wasm `f32x4.demote_f64x2_zero`.
++// xvcvdpsp converts both doubles in one shot, replicating each result across
++// its dword: BE word lanes = [s(in.dw0), s(in.dw0), s(in.dw1), s(in.dw1)].
++// On PPC64LE wasm storage (lxvx-loaded), input.dw_BE[0] = wasm lane 1 and
++// input.dw_BE[1] = wasm lane 0, so we get [s(l1), s(l1), s(l0), s(l0)] in
++// BE word order. We then zero dest and pack s(l1) into BE word 2 (wasm
++// output lane 1) and s(l0) into BE word 3 (wasm output lane 0) via the
++// shared PackTwoWordsToLowHalf helper, which has POWER9 and POWER8 paths.
++//
++// dest==src aliasing safe: src is consumed by xvcvdpsp before dest is zeroed.
++void MacroAssembler::convertFloat64x2ToFloat32x4(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xvcvdpsp(scratch, src);
++ ZeroSimd128(*this, dest);
++ PackTwoWordsToLowHalf(*this, scratch, dest);
++}
++
++// f32x4 (low 2 lanes) → f64x2. Wasm `f64x2.promote_low_f32x4`. xvcvspdp
++// converts both BE word 0 and BE word 2 of its source to doubles in BE
++// dwords 0 and 1 respectively. To get wasm lanes 0 and 1 (= input BE
++// words 3 and 2) into those source positions, vmrglw merges low words:
++// VRT.word[0] = VRA.word[2] = wasm lane 1, VRT.word[2] = VRA.word[3] =
++// wasm lane 0 (with replicated copies in odd word slots that xvcvspdp
++// ignores). Output BE dwords land as [double(lane1), double(lane0)],
++// which on PPC64LE storage is exactly the wasm f64x2 output layout.
++//
++// dest==src aliasing safe: vmrglw consumes src into a separate scratch
++// before dest is written.
++//
++// 2 insns, single ScratchSimd128 scope. All ops POWER7+.
++void MacroAssembler::convertFloat32x4ToFloat64x2(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
++ as_xvcvspdp(dest, scratch);
++}
++
++void MacroAssembler::unsignedNarrowInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // On LE, VMX pack swaps operand order vs Wasm convention.
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkshus), rhs, lhs, dest);
++}
++
++void MacroAssembler::unsignedNarrowInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // On LE, VMX pack swaps operand order vs Wasm convention.
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkswus), rhs, lhs, dest);
++}
++
++void MacroAssembler::widenLowInt8x16(FloatRegister src, FloatRegister dest) {
++ // On PPC64 LE, raw vupklsb unpacks the LOW Wasm lanes (not vupkhsb).
++ // GCC vec_unpackh maps to vupklsb on LE (swapped from BE naming).
++ // Raw vupklsb([1..8,-1..-8]) = [1,2,3,4,5,6,7,8].
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsb(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::widenHighInt8x16(FloatRegister src, FloatRegister dest) {
++ // On PPC64 LE, raw vupkhsb unpacks the HIGH Wasm lanes.
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsb(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::unsignedWidenLowInt8x16(FloatRegister src,
++ FloatRegister dest) {
++ zeroExtend8x16To16x8(src, dest);
++}
++
++void MacroAssembler::unsignedWidenHighInt8x16(FloatRegister src,
++ FloatRegister dest) {
++ // vmrghb(zero, src) interleaves zero bytes with the BE-high half of src,
++ // producing zero-extended halfwords of the LE-high (Wasm-high) lanes.
++ ScratchSimd128Scope scratch(*this);
++ as_xxlxor(scratch, scratch, scratch);
++ as_vmrghb(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::widenLowInt16x8(FloatRegister src, FloatRegister dest) {
++ // On PPC64 LE, raw vupklsh unpacks LOW Wasm lanes (GCC swaps h/l on LE).
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsh(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::widenHighInt16x8(FloatRegister src, FloatRegister dest) {
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsh(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::unsignedWidenLowInt16x8(FloatRegister src,
++ FloatRegister dest) {
++ zeroExtend16x8To32x4(src, dest);
++}
++
++void MacroAssembler::unsignedWidenHighInt16x8(FloatRegister src,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xxlxor(scratch, scratch, scratch);
++ as_vmrghh(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::widenLowInt32x4(FloatRegister src, FloatRegister dest) {
++ // On PPC64 LE, raw vupklsw unpacks LOW Wasm lanes.
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsw(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src,
++ FloatRegister dest) {
++ zeroExtend32x4To64x2(src, dest);
++}
++
++void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsw(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
++ FloatRegister dest) {
++ // i64x2.extend_high_i32x4_u: take high 2 i32 lanes of src, zero-extend
++ // to i64 each. Use vmrghw to interleave a zero VR with src — same shape
++ // as the (already-correct) unsignedWidenHighInt16x8 sibling above.
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ as_vmrghw(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
++}
++
++void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
++ FloatRegister lhsOrLhsDest) {
++ // pmin: result[i] = rhs[i] < lhs[i] ? rhs[i] : lhs[i]
++ // xvcmpgtsp(mask, lhs, rhs) → 1 where lhs > rhs (i.e., rhs < lhs)
++ // xxsel: mask=1 → XB=rhs. mask=0 → XA=lhs.
++ // Result goes to lhsOrLhsDest (second param).
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtsp(scratch, lhsOrLhsDest, rhsOrRhsDest);
++ as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMinFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // pmin(lhs, rhs) = rhs < lhs ? rhs : lhs
++ // Inline to handle dest aliasing with either operand.
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtsp(scratch, lhs, rhs);
++ // mask=1 where lhs > rhs. XC=1 → select XB=rhs. XC=0 → select XA=lhs.
++ as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
++ FloatRegister lhsOrLhsDest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtdp(scratch, lhsOrLhsDest, rhsOrRhsDest);
++ as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMinFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtdp(scratch, lhs, rhs);
++ as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
++ FloatRegister lhsOrLhsDest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtsp(scratch, rhsOrRhsDest, lhsOrLhsDest);
++ as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // pmax(lhs, rhs) = lhs < rhs ? rhs : lhs
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtsp(scratch, rhs, lhs);
++ // mask=1 where rhs > lhs (lhs < rhs). XC=1 → select XB=rhs. XC=0 → select
++ // XA=lhs.
++ as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
++ FloatRegister lhsOrLhsDest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtdp(scratch, rhsOrRhsDest, lhsOrLhsDest);
++ as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
++}
++
++void MacroAssembler::pseudoMaxFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ as_xvcmpgtdp(scratch, rhs, lhs);
++ as_xxsel(dest, lhs, rhs, scratch);
++}
++
++void MacroAssembler::dotInt8x16Int7x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // result[k] = lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1] for k=0..7.
++ // vmulesb/vmulosb produce even/odd byte products as i16 in matching
++ // halfword lanes; vadduhm sums them pairwise.
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31;
++ uint8_t r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31;
++ uint8_t s = scratch.encoding() & 31;
++ as_vmulesb(s, l, r);
++ as_vmulosb(d, l, r);
++ as_vadduhm(d, s, d);
++}
++
++void MacroAssembler::ceilFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvrspip(dest, src);
++}
++
++void MacroAssembler::ceilFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvrdpip(dest, src);
++}
++
++void MacroAssembler::floorFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvrspim(dest, src);
++}
++
++void MacroAssembler::floorFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvrdpim(dest, src);
++}
++
++void MacroAssembler::truncFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvrspiz(dest, src);
++}
++
++void MacroAssembler::truncFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvrdpiz(dest, src);
++}
++
++void MacroAssembler::nearestFloat32x4(FloatRegister src, FloatRegister dest) {
++ as_xvrspic(dest, src);
++}
++
++void MacroAssembler::nearestFloat64x2(FloatRegister src, FloatRegister dest) {
++ as_xvrdpic(dest, src);
++}
++
++void MacroAssembler::fnmaFloat32x4(FloatRegister src1, FloatRegister src2,
++ FloatRegister srcDest) {
++ as_xvnmsubasp(srcDest, src1, src2);
++}
++
++void MacroAssembler::fnmaFloat64x2(FloatRegister src1, FloatRegister src2,
++ FloatRegister srcDest) {
++ as_xvnmsubadp(srcDest, src1, src2);
++}
++
++void MacroAssembler::minFloat32x4Relaxed(FloatRegister src,
++ FloatRegister srcDest) {
++ as_xvminsp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::minFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvminsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat32x4Relaxed(FloatRegister src,
++ FloatRegister srcDest) {
++ as_xvmaxsp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::maxFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmaxsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minFloat64x2Relaxed(FloatRegister src,
++ FloatRegister srcDest) {
++ as_xvmindp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::minFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmindp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat64x2Relaxed(FloatRegister src,
++ FloatRegister srcDest) {
++ as_xvmaxdp(srcDest, srcDest, src);
++}
++
++void MacroAssembler::maxFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmaxdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::q15MulrInt16x8Relaxed(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ q15MulrSatInt16x8(lhs, rhs, dest);
++}
++
++// SIMD overloads accepting an extra FloatRegister temp (shared-header signature
++// used by x86; on PPC64 the temp is unused for most of these).
++void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest,
++ FloatRegister temp) {
++ popcntInt8x16(src, dest);
++}
++
++void MacroAssembler::unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
++ FloatRegister dest,
++ FloatRegister temp) {
++ unsignedTruncSatFloat32x4ToInt32x4(src, dest);
++}
++
++void MacroAssembler::dotInt8x16Int7x16ThenAdd(FloatRegister lhs,
++ FloatRegister rhs,
++ FloatRegister dest,
++ FloatRegister temp) {
++ // dest += pairwise_widen_i16_to_i32(dot_i8x16(lhs, rhs)).
++ //
++ // Step 1: i16x8 dot of i8 byte pairs (vmulesb/vmulosb/vadduhm). Keeps
++ // the existing signed-byte multiply semantics that match ARM64 sdot
++ // and x86 vpdpbssd (vmsummbm would be signed×unsigned and diverge for
++ // i7 lanes that bit-pattern as negative).
++ //
++ // Step 2: vmsumshm dest, dot, splat_hw(1), dest computes
++ // dest.i32[k] = dest.i32[k] + dot.i16[2k]*1 + dot.i16[2k+1]*1
++ // which is exactly pairwise widen + accumulate in a single insn.
++ // splat_hw(1) is a single vspltish (5-bit SIMM splat to all 8 halfwords).
++ ScratchSimd128Scope scratch(*this);
++ uint8_t l = lhs.encoding() & 31;
++ uint8_t r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31;
++ uint8_t s = scratch.encoding() & 31;
++ uint8_t t = temp.encoding() & 31;
++
++ as_vmulesb(s, l, r);
++ as_vmulosb(t, l, r);
++ as_vadduhm(t, s, t);
++ as_vspltish(s, 1);
++ as_vmsumshm(d, t, s, d);
++}
++
++// SIMD ops ported from arm64- and x86/x64-shaped signatures.
++
++void MacroAssembler::permuteInt16x8(const uint16_t lanes[8], FloatRegister src,
++ FloatRegister dest) {
++ uint8_t shuffleLanes[16];
++ for (unsigned i = 0; i < 8; i++) {
++ shuffleLanes[i * 2] = lanes[i] * 2;
++ shuffleLanes[i * 2 + 1] = lanes[i] * 2 + 1;
++ }
++ shuffleInt8x16(shuffleLanes, src, src, dest);
++}
++
++void MacroAssembler::rotateRightSimd128(FloatRegister src, FloatRegister dest,
++ uint32_t shift) {
++ MOZ_ASSERT(shift < 16);
++ if (shift == 0) {
++ moveSimd128(src, dest);
++ return;
++ }
++ // vsldoi VRT,VRA,VRB,SH: concatenate VRA||VRB, take bytes [SH..SH+15].
++ // Rotate right by N = vsldoi(src, src, 16-N).
++ as_vsldoi(dest, src, src, 16 - shift);
++}
++
++void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, FloatRegister temp1,
++ FloatRegister temp2) {
++ // POWER10 collapses the entire i64x2 multiply to a single vmulld.
++ // POWER9/POWER8 fall back to the GPR round-trip path: extract each
++ // lane pair into GPRs (mfvsrld for LE-dw0/Wasm-lane-0, mfvsrd for
++ // LE-dw1/lane-1), multiply, and reassemble via mtvsrd + xxpermdi.
++ if (HasPOWER10()) {
++ as_vmulld(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31);
++ return;
++ }
++ // Aliasing safety: stash the lane-0 product in ScratchSimd128 (which
++ // is non-allocatable, so cannot alias lhs/rhs) and only write dest at
++ // the very end, after both lhs and rhs have been fully consumed.
++ ScratchSimd128Scope scratch(*this);
++ UseScratchRegisterScope temps(asMasm());
++ Register a = temps.Acquire();
++ Register b = temps.Acquire();
++
++ if (HasPOWER9()) {
++ as_mfvsrld(a, lhs);
++ as_mfvsrld(b, rhs);
++ } else {
++ as_xxpermdi(scratch, lhs, lhs, 2);
++ as_mfvsrd(a, scratch);
++ as_xxpermdi(scratch, rhs, rhs, 2);
++ as_mfvsrd(b, scratch);
++ }
++ as_mulld(a, a, b);
++ as_mtvsrd(scratch, a);
++
++ as_mfvsrd(a, lhs);
++ as_mfvsrd(b, rhs);
++ as_mulld(a, a, b);
++ as_mtvsrd(dest, a);
++ as_xxpermdi(dest, dest, scratch, 0);
++}
++
++void MacroAssembler::bitwiseAndNotSimd128(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // andnot(lhs, rhs) = lhs & ~rhs = xxlandc(lhs, rhs)
++ as_xxlandc(dest, lhs, rhs);
++}
++
++void MacroAssembler::bitwiseSelectSimd128(FloatRegister onTrue,
++ FloatRegister onFalse,
++ FloatRegister maskDest) {
++ // result = (onTrue & mask) | (onFalse & ~mask)
++ // xxsel: XC=0→XA, XC=1→XB → XT = (XA & ~XC) | (XB & XC)
++ // Need XA=onFalse, XB=onTrue, XC=mask.
++ as_xxsel(maskDest, onFalse, onTrue, maskDest);
++}
++
++void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest) {
++ EmitVmxUnary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vpopcntb(vrt, vrb); },
++ src, dest);
++}
++
++void MacroAssembler::bitmaskInt8x16(FloatRegister src, Register dest,
++ FloatRegister temp) {
++ if (HasPOWER10()) {
++ // Single-instruction collapse on POWER10.
++ as_vextractbm(dest, src);
++ return;
++ }
++ // POWER8+ vbpermq-based bitmask: ctl[i] = (15-i)*8 produces the wasm-spec
++ // bitmap (bit i = MSB of LE lane i) in dw0 low 16 bits.
++ int8_t ctl[16] = {120, 112, 104, 96, 88, 80, 72, 64,
++ 56, 48, 40, 32, 24, 16, 8, 0};
++ loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++ as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++ as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest,
++ FloatRegister temp) {
++ if (HasPOWER10()) {
++ as_vextracthm(dest, src);
++ return;
++ }
++ // Same recipe as bitmaskInt8x16 but ctl picks halfword MSBs:
++ // BE bit (14-2i)*8 for lane i, plus 8 ignore-bytes (high bit set).
++ int8_t ctl[16] = {112, 96, 80, 64, 48, 32, 16, 0,
++ -128, -128, -128, -128, -128, -128, -128, -128};
++ loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++ as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++ as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::bitmaskInt32x4(FloatRegister src, Register dest,
++ FloatRegister temp) {
++ if (HasPOWER10()) {
++ as_vextractwm(dest, src);
++ return;
++ }
++ // Same recipe as bitmaskInt8x16 but ctl picks word MSBs:
++ // BE bit (12-4i)*8 for lane i, plus 12 ignore-bytes (high bit set).
++ int8_t ctl[16] = {96, 64, 32, 0, -128, -128, -128, -128,
++ -128, -128, -128, -128, -128, -128, -128, -128};
++ loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++ as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++ as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::bitmaskInt64x2(FloatRegister src, Register dest,
++ FloatRegister temp) {
++ if (HasPOWER10()) {
++ as_vextractdm(dest, src);
++ return;
++ }
++ // Same recipe as the other bitmask variants. ctl picks dword MSBs:
++ // BE bit 64 for lane 0, BE bit 0 for lane 1, plus 14 ignore-bytes.
++ int8_t ctl[16] = {64, 0, -128, -128, -128, -128, -128, -128,
++ -128, -128, -128, -128, -128, -128, -128, -128};
++ loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
++ as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
++ as_mfvsrd(dest, temp);
++}
++
++void MacroAssembler::compareInt64x2(Assembler::Condition cond,
++ FloatRegister rhs, FloatRegister lhsDest) {
++ compareInt64x2(cond, lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::compareInt64x2(Assembler::Condition cond,
++ FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequd),
++ VMX_BINARY_WRAPPER(vcmpgtsd), VMX_BINARY_WRAPPER(vcmpgtud));
++}
++
++void MacroAssembler::minFloat32x4(FloatRegister rhs, FloatRegister lhsDest) {
++ minFloat32x4(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvminsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, FloatRegister temp1,
++ FloatRegister temp2) {
++ // Wasm min with NaN propagation.
++ // Detect NaN in either operand (not via add which falsely flags inf+(-inf)).
++ // Compute mask and add BEFORE min (min may clobber lhs via dest aliasing).
++ as_xvcmpeqsp(temp1, lhs, lhs);
++ as_xvcmpeqsp(temp2, rhs, rhs);
++ as_xxland(temp1, temp1, temp2);
++ as_xvaddsp(temp2, lhs, rhs);
++ as_xvminsp(dest, lhs, rhs);
++ as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::minFloat64x2(FloatRegister rhs, FloatRegister lhsDest) {
++ minFloat64x2(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmindp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, FloatRegister temp1,
++ FloatRegister temp2) {
++ // NaN mask and add must be computed BEFORE min (which may clobber lhs via
++ // dest).
++ as_xvcmpeqdp(temp1, lhs, lhs);
++ as_xvcmpeqdp(temp2, rhs, rhs);
++ as_xxland(temp1, temp1, temp2); // temp1 = ~0 when both non-NaN
++ as_xvadddp(temp2, lhs, rhs); // temp2 = add (NaN source)
++ as_xvmindp(dest, lhs, rhs); // dest = min (may clobber lhs)
++ as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest) {
++ maxFloat32x4(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmaxsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, FloatRegister temp1,
++ FloatRegister temp2) {
++ // Wasm max with NaN propagation, using temp registers.
++ as_xvcmpeqsp(temp1, lhs, lhs);
++ as_xvcmpeqsp(temp2, rhs, rhs);
++ as_xxland(temp1, temp1, temp2);
++ as_xvaddsp(temp2, lhs, rhs);
++ as_xvmaxsp(dest, lhs, rhs);
++ as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest) {
++ maxFloat64x2(lhsDest, rhs, lhsDest);
++}
++
++void MacroAssembler::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmaxdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest, FloatRegister temp1,
++ FloatRegister temp2) {
++ as_xvcmpeqdp(temp1, lhs, lhs);
++ as_xvcmpeqdp(temp2, rhs, rhs);
++ as_xxland(temp1, temp1, temp2);
++ as_xvadddp(temp2, lhs, rhs);
++ as_xvmaxdp(dest, lhs, rhs);
++ as_xxsel(dest, temp2, dest, temp1);
++}
++
++void MacroAssembler::unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
++ FloatRegister dest) {
++ as_xvcvspuxws(dest, src);
++}
++
++void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
++ Register64 dest) {
++ MOZ_ASSERT(lane < 2);
++ if (lane == 1) {
++ // Lane 1 = BE dword 0 = register bits[0:63].
++ as_mfvsrd(dest.reg, src);
++ } else {
++ // Lane 0 = BE dword 1.
++ if (HasPOWER9()) {
++ as_mfvsrld(dest.reg, src);
++ } else {
++ ScratchSimd128Scope scratch(*this);
++ as_xxpermdi(scratch, src, src, 2);
++ as_mfvsrd(dest.reg, scratch);
++ }
++ }
++}
++
++void MacroAssembler::replaceLaneInt64x2(unsigned lane, Register64 rhs,
++ FloatRegister lhsDest) {
++ MOZ_ASSERT(lane < 2);
++ if (HasPOWER10()) {
++ // 1 insn, no scratch VSR. UIM byte offset: lane 0 → 8, lane 1 → 0.
++ as_vinsd(lhsDest, rhs.reg, (1 - lane) * 8);
++ return;
++ }
++ ScratchSimd128Scope scratch(*this);
++ as_mtvsrd(scratch, rhs.reg);
++ if (lane == 0) {
++ // Replace dw1 (LE low = lane 0). Keep dw0 (lane 1).
++ // dm=0b00: [lhsDest.dw0, scratch.dw0]
++ as_xxpermdi(lhsDest, lhsDest, scratch, 0);
++ } else {
++ // Replace dw0 (LE high = lane 1). Keep dw1 (lane 0).
++ // dm=0b01: [scratch.dw0, lhsDest.dw1]
++ as_xxpermdi(lhsDest, scratch, lhsDest, 1);
++ }
++}
++
++// SIMD 3-operand arithmetic (x86_shared-style signatures).
++
++void MacroAssembler::addFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvaddsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::addFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvadddp(dest, lhs, rhs);
++}
++
++void MacroAssembler::addInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduhm), lhs, rhs, dest);
++}
++
++void MacroAssembler::addInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddubm), lhs, rhs, dest);
++}
++
++void MacroAssembler::divFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvdivsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::extractLaneInt16x8(uint32_t lane, FloatRegister src,
++ Register dest) {
++ MOZ_ASSERT(lane < 8);
++ if (HasPOWER9()) {
++ as_vextractuh(ScratchSimd128Reg, src, 14 - 2 * lane);
++ as_mfvsrd(dest, ScratchSimd128Reg);
++ as_extsh(dest, dest);
++ return;
++ }
++ ExtractLaneToGPR(*this, lane, src, dest, 2, 16);
++ as_extsh(dest, dest);
++}
++
++void MacroAssembler::extractLaneInt32x4(uint32_t lane, FloatRegister src,
++ Register dest) {
++ MOZ_ASSERT(lane < 4);
++ ExtractLaneToGPR(*this, lane, src, dest, 4, 32);
++ // ExtractLaneToGPR leaves the adjacent lane in the high 32 bits for the
++ // unshifted lanes (0 and 2); canonicalize to a sign-extended i32, as the
++ // i8x16/i16x8 extracts do with extsb/extsh. A consumer that reads the full
++ // 64-bit register -- e.g. the POWER8 i32.ctz emulation, whose 64-bit neg/and.
++ // with a 32-bit cntlzw otherwise mis-handles a zero low word over nonzero
++ // high garbage and returns -1 -- requires this.
++ as_extsw(dest, dest);
++}
++
++void MacroAssembler::extractLaneInt8x16(uint32_t lane, FloatRegister src,
++ Register dest) {
++ MOZ_ASSERT(lane < 16);
++ if (HasPOWER9()) {
++ as_vextractub(ScratchSimd128Reg, src, 15 - lane);
++ as_mfvsrd(dest, ScratchSimd128Reg);
++ as_extsb(dest, dest);
++ return;
++ }
++ ExtractLaneToGPR(*this, lane, src, dest, 1, 8);
++ as_extsb(dest, dest);
++}
++
++void MacroAssembler::maxInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsh), lhs, rhs, dest);
++}
++
++void MacroAssembler::maxInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsw), lhs, rhs, dest);
++}
++
++void MacroAssembler::maxInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsb), lhs, rhs, dest);
++}
++
++void MacroAssembler::minInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsb), lhs, rhs, dest);
++}
++
++void MacroAssembler::mulInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmuluwm), lhs, rhs, dest);
++}
++
++void MacroAssembler::narrowInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // On LE, VMX pack swaps operand order vs Wasm convention.
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkshss), rhs, lhs, dest);
++}
++
++void MacroAssembler::splatX2(Register64 src, FloatRegister dest) {
++ if (HasPOWER9()) {
++ as_mtvsrdd(dest, src.reg, src.reg);
++ } else {
++ as_mtvsrd(dest, src.reg);
++ as_xxpermdi(dest, dest, dest, 0);
++ }
++}
++
++void MacroAssembler::subInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuwm), lhs, rhs, dest);
++}
++
++void MacroAssembler::swizzleInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // Wasm i8x16.swizzle: result[i] = (rhs[i] < 16) ? lhs[rhs[i]] : 0.
++ //
++ // Strategy: build ctrl in ScratchSimd128 (which can't alias inputs
++ // because v0 is non-allocatable). Use vsububs(splat(15), rhs) to
++ // produce ctrl = max(0, 15 - rhs); the saturation clamps out-of-range
++ // indices to 0, and those positions get masked off below.
++ //
++ // The mask is computed via vcmpgtub(rhs, splat(15)) + xxlnor — 0xFF
++ // where rhs <= 15. Reformulating "rhs < 16" as "!(rhs > 15)" lets us
++ // use vspltisb with a 5-bit signed immediate (P7+, 1 insn, no GPR
++ // scratch) for both splat-of-15 sites, replacing the previous
++ // movePtr(0x0F0F0F0F)/movePtr(0x10101010) + splatX4 dance.
++ //
++ // Aliasing: dest may equal lhs (wasm baseline calls swizzleInt8x16(
++ // rsd, rs, rsd); Ion's useRegisterAtStart permits the same). When
++ // dest != rhs, ctrl can be built in scratch and the mask computed
++ // after the permute (rhs is still alive). When dest == rhs, the
++ // permute would clobber rhs before we could compute the mask, so the
++ // mask goes to the red zone first.
++ ScratchSimd128Scope scratch(*this);
++ uint8_t s = scratch.encoding() & 31;
++ uint8_t l = lhs.encoding() & 31;
++ uint8_t r = rhs.encoding() & 31;
++ uint8_t d = dest.encoding() & 31;
++
++ if (dest != rhs) {
++ as_vspltisb(s, 15);
++ as_vsububs(s, s, r); // scratch = ctrl
++ as_vperm(d, l, l, s); // dest = vperm(lhs, lhs, ctrl)
++ as_vspltisb(s, 15);
++ as_vcmpgtub(s, r, s); // scratch = 0xFF where rhs > 15
++ as_xxlandc(dest, dest, scratch); // dest &= ~scratch (= bytes-to-keep)
++ return;
++ }
++
++ // dest == rhs: vperm clobbers rhs, so build the bytes-to-zero mask first
++ // and stash it. The xxlandc at the end consumes the un-inverted form.
++ as_vspltisb(s, 15);
++ as_vcmpgtub(s, r, s); // scratch = 0xFF where rhs > 15
++ RedZoneStashSimd128(*this, scratch, 0);
++ as_vspltisb(s, 15);
++ as_vsububs(s, s, r); // scratch = ctrl
++ as_vperm(d, l, l, s); // dest = vperm(lhs, lhs, ctrl)
++ RedZoneRestoreSimd128(*this, 0, scratch);
++ as_xxlandc(dest, dest, scratch); // dest &= ~scratch (= bytes-to-keep)
++}
++// SIMD 3-operand arithmetic (continued).
++
++void MacroAssembler::addInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduwm), lhs, rhs, dest);
++}
++
++void MacroAssembler::addInt64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddudm), lhs, rhs, dest);
++}
++
++void MacroAssembler::addSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddshs), lhs, rhs, dest);
++}
++
++void MacroAssembler::addSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddsbs), lhs, rhs, dest);
++}
++
++void MacroAssembler::divFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvdivdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::minInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsh), lhs, rhs, dest);
++}
++
++void MacroAssembler::minInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsw), lhs, rhs, dest);
++}
++
++void MacroAssembler::mulFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmulsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::mulFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvmuldp(dest, lhs, rhs);
++}
++
++void MacroAssembler::mulInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ ZeroSimd128(*this, scratch);
++ EmitVmxTernary(
++ *this,
++ [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc) {
++ a.as_vmladduhm(vrt, vra, vrb, vrc);
++ },
++ lhs, rhs, scratch, dest);
++}
++
++void MacroAssembler::narrowInt32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkswss), rhs, lhs, dest);
++}
++
++void MacroAssembler::subFloat32x4(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvsubsp(dest, lhs, rhs);
++}
++
++void MacroAssembler::subFloat64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ as_xvsubdp(dest, lhs, rhs);
++}
++
++void MacroAssembler::subInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuhm), lhs, rhs, dest);
++}
++
++void MacroAssembler::subInt64x2(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubudm), lhs, rhs, dest);
++}
++
++void MacroAssembler::subInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsububm), lhs, rhs, dest);
++}
++
++void MacroAssembler::subSatInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubshs), lhs, rhs, dest);
++}
++
++void MacroAssembler::subSatInt8x16(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubsbs), lhs, rhs, dest);
++}
++
++void MacroAssembler::widenDotInt16x8(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister dest) {
++ // i32x4.dot_i16x8_s: result[k] = lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1].
++ // vmsumshm computes exactly that for each i32 lane plus an addend (VRC).
++ // With VRC = 0, the addend disappears and we get the wasm spec result in
++ // a single instruction. xxlxor zeros the scratch in 1 insn, so total is
++ // 2 insns vs the old vmulesh/vmulosh/vadduwm trio.
++ ScratchSimd128Scope scratch(*this);
++ as_xxlxor(scratch, scratch, scratch);
++ as_vmsumshm(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31,
++ scratch.encoding() & 31);
++}
++
++// SIMD variable-shift and FMA helpers.
++// Pattern: splat the GPR shift count across all lanes of a scratch VSR,
++// then issue a vector-shift on lhs and the splat. vsl{b,h} / vsr{b,h} /
++// vsra{b,h} use the low 3-or-4 bits of each lane's shift count, exactly
++// matching wasm modulo-N shift semantics.
++
++void MacroAssembler::leftShiftInt8x16(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX16(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslb), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt8x16(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX16(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrab), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt8x16(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX16(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrb), lhs, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt16x8(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX8(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslh), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt16x8(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX8(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrah), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt16x8(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX8(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrh), lhs, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt32x4(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX4(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslw), lhs, scratch, dest);
++}
++
++void MacroAssembler::leftShiftInt64x2(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX4(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsld), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt32x4(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX4(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsraw), lhs, scratch, dest);
++}
++
++void MacroAssembler::rightShiftInt64x2(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX4(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrad), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt32x4(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX4(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrw), lhs, scratch, dest);
++}
++
++void MacroAssembler::unsignedRightShiftInt64x2(FloatRegister lhs, Register rhs,
++ FloatRegister dest) {
++ ScratchSimd128Scope scratch(*this);
++ splatX4(rhs, scratch);
++ EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrd), lhs, scratch, dest);
++}
++
++void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
++ FloatRegister srcDest) {
++ as_xvmaddasp(srcDest, src1, src2);
++}
++
++void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
++ FloatRegister srcDest) {
++ as_xvmaddadp(srcDest, src1, src2);
++}
++
++//}}} check_macroassembler_style
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_MacroAssembler_ppc64_inl_h */
+diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64.cpp b/js/src/jit/ppc64/MacroAssembler-ppc64.cpp
+new file mode 100644
+index 000000000000..702fb3cd4cba
+--- /dev/null
++++ b/js/src/jit/ppc64/MacroAssembler-ppc64.cpp
+@@ -0,0 +1,3467 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/MacroAssembler-ppc64.h"
++
++#include "jit/Bailouts.h"
++#include "jit/BaselineFrame.h"
++#include "jit/FlushICache.h"
++#include "jit/JitFrames.h"
++#include "jit/JitRuntime.h"
++#include "jit/MacroAssembler.h"
++#include "jit/MoveEmitter.h"
++#include "jit/ppc64/SharedICRegisters-ppc64.h"
++#include "vm/JitActivation.h"
++#include "vm/JSContext.h"
++#include "wasm/WasmStubs.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++namespace js {
++namespace jit {
++
++MacroAssembler& MacroAssemblerPPC64::asMasm() {
++ return *static_cast<MacroAssembler*>(this);
++}
++
++const MacroAssembler& MacroAssemblerPPC64::asMasm() const {
++ return *static_cast<const MacroAssembler*>(this);
++}
++
++// ===============================================================
++// Out-of-line fake exit frame
++
++bool MacroAssemblerPPC64Compat::buildOOLFakeExitFrame(void* fakeReturnAddr) {
++ asMasm().Push(FrameDescriptor(FrameType::IonJS));
++ asMasm().Push(ImmPtr(fakeReturnAddr));
++ asMasm().Push(FramePointer);
++ return true;
++}
++
++// ===============================================================
++// Load int32 or double from memory
++
++void MacroAssemblerPPC64Compat::loadInt32OrDouble(const Address& src,
++ FloatRegister dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ Label end;
++
++ // Load the boxed value and stash in the FPR immediately, then reuse the
++ // GPR for the tag test. Only one scratch GPR is held here so that
++ // branchTestInt32 can acquire the second one for the ImmTag constant.
++ loadPtr(Address(src.base, src.offset), scratch);
++ as_mtvsrd(dest, scratch);
++ x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++ asMasm().branchTestInt32(Assembler::NotEqual, scratch, &end);
++ // It was an int32. Recover the boxed value from the FPR, sign-extend
++ // the low 32 bits, and convert to double.
++ as_mfvsrd(scratch, dest);
++ as_extsw(scratch, scratch);
++ as_mtvsrd(dest, scratch);
++ as_fcfid(dest, dest);
++
++ bind(&end);
++}
++
++void MacroAssemblerPPC64Compat::loadInt32OrDouble(const BaseIndex& addr,
++ FloatRegister dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ Label end;
++
++ computeScaledAddress(addr, scratch);
++ loadPtr(Address(scratch, addr.offset), scratch);
++ as_mtvsrd(dest, scratch);
++ x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++ asMasm().branchTestInt32(Assembler::NotEqual, scratch, &end);
++ as_mfvsrd(scratch, dest);
++ as_extsw(scratch, scratch);
++ as_mtvsrd(dest, scratch);
++ as_fcfid(dest, dest);
++
++ bind(&end);
++}
++
++// ===============================================================
++// Conversion functions
++
++void MacroAssemblerPPC64Compat::convertUInt32ToDouble(Register src,
++ FloatRegister dest) {
++ // mtvsrwz: VSR[dest].dw0 = zero_ext_64(src[32:63]); P8+ (ISA 2.07).
++ // Replaces rldicl + mtvsrd (2 insns + scratch) with 1 insn.
++ as_mtvsrwz(dest, src);
++ as_fcfid(dest, dest);
++}
++
++void MacroAssemblerPPC64Compat::convertUInt32ToFloat32(Register src,
++ FloatRegister dest) {
++ // mtvsrwz + fcfids; same recipe as convertUInt32ToDouble.
++ as_mtvsrwz(dest, src);
++ as_fcfids(dest, dest);
++}
++
++// Helper for the negative-zero check after a successful round-trip.
++// Precondition: `dest` holds the integer round-trip result; if it equals
++// zero, then `src` was either +0.0 or -0.0 (those are the only doubles
++// that round-trip to int 0). Distinguish them by inspecting src's sign
++// bit: -0.0 has its MSB set, so an mfvsrd-then-signed-cmp-against-zero
++// branches to `fail` only for -0.0. Non-zero `dest` values (including
++// every negative integer) skip the check entirely.
++static void EmitNegativeZeroCheck(MacroAssemblerPPC64Compat& masm,
++ FloatRegister src, Register dest,
++ Label* fail) {
++ Label notZero;
++ masm.as_cmpdi(dest, 0);
++ masm.ma_b(Assembler::NotEqual, ¬Zero);
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.as_mfvsrd(scratch, src);
++ masm.as_cmpdi(scratch, 0);
++ masm.ma_b(Assembler::LessThan, fail);
++ masm.bind(¬Zero);
++}
++
++void MacroAssemblerPPC64Compat::convertDoubleToInt32(FloatRegister src,
++ Register dest, Label* fail,
++ bool negativeZeroCheck) {
++ // Truncate to int32 (round toward zero), sign-extend, and verify
++ // exactness via round-trip compare. fctiwz writes the int32 to BE
++ // bits 32..63 of the FPR; mfvsrd extracts and extsw sign-extends.
++ // The compare also catches NaN (unordered) and Inf (saturated to
++ // INT32_{MIN,MAX}, won't round-trip equal).
++ as_fctiwz(ScratchDoubleReg, src);
++ as_mfvsrd(dest, ScratchDoubleReg);
++ as_extsw(dest, dest);
++ as_mtvsrd(ScratchDoubleReg, dest);
++ as_fcfid(ScratchDoubleReg, ScratchDoubleReg);
++ as_fcmpu(ScratchDoubleReg, src);
++ ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
++
++ if (negativeZeroCheck) {
++ EmitNegativeZeroCheck(*this, src, dest, fail);
++ }
++}
++
++void MacroAssemblerPPC64Compat::convertDoubleToPtr(FloatRegister src,
++ Register dest, Label* fail,
++ bool negativeZeroCheck) {
++ // Same pattern as convertDoubleToInt32 but to int64 (no sign-extend
++ // needed since fctidz already produces a 64-bit result).
++ as_fctidz(ScratchDoubleReg, src);
++ as_mfvsrd(dest, ScratchDoubleReg);
++ as_mtvsrd(ScratchDoubleReg, dest);
++ as_fcfid(ScratchDoubleReg, ScratchDoubleReg);
++ as_fcmpu(ScratchDoubleReg, src);
++ ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
++
++ if (negativeZeroCheck) {
++ EmitNegativeZeroCheck(*this, src, dest, fail);
++ }
++}
++
++void MacroAssemblerPPC64Compat::convertFloat32ToInt32(FloatRegister src,
++ Register dest,
++ Label* fail,
++ bool negativeZeroCheck) {
++ // Same as convertDoubleToInt32 but the round-trip uses fcfids so the
++ // comparison happens at single precision (matches src's actual width).
++ as_fctiwz(ScratchDoubleReg, src);
++ as_mfvsrd(dest, ScratchDoubleReg);
++ as_extsw(dest, dest);
++ as_mtvsrd(ScratchDoubleReg, dest);
++ as_fcfids(ScratchDoubleReg, ScratchDoubleReg);
++ as_fcmpu(ScratchDoubleReg, src);
++ ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
++
++ if (negativeZeroCheck) {
++ EmitNegativeZeroCheck(*this, src, dest, fail);
++ }
++}
++
++CodeOffset MacroAssemblerPPC64Compat::toggledCall(JitCode* target,
++ bool enabled) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // stanza(8) + mtctr/bctrl(2) = 10 instructions.
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ BufferOffset boLoad =
++ emitLoad64Stanza(scratch, (uint64_t)uintptr_t(target->raw()));
++ CodeOffset offset(boLoad.getOffset());
++ addPendingJump(boLoad, ImmPtr(target->raw()), RelocationKind::JITCODE);
++ if (enabled) {
++ xs_mtctr(scratch);
++ as_bctr(LinkBit::LinkB);
++ } else {
++ writeInst(NopInst);
++ writeInst(NopInst);
++ }
++ m_buffer.leaveNoPool();
++ MOZ_ASSERT_IF(!oom(), nextOffset().getOffset() - offset.offset() ==
++ ToggledCallSize(nullptr));
++ return offset;
++}
++
++// ===============================================================
++// Exception handling
++
++void MacroAssemblerPPC64Compat::handleFailureWithHandlerTail(
++ Label* profilerExitTail, Label* bailoutTail,
++ uint32_t* returnValueCheckOffset) {
++ // Round sizeof(ResumeFromException) up to ABIStackAlignment. The
++ // canonical (sz + align - 1) & ~(align - 1) form is exact: when sz
++ // is already a multiple of `align` the rounding is a no-op. The
++ // previous (sz + align) & ~(align - 1) over-allocated by `align`
++ // bytes whenever sz was already aligned.
++ int size = (sizeof(ResumeFromException) + ABIStackAlignment - 1) &
++ ~(ABIStackAlignment - 1);
++ asMasm().subPtr(Imm32(size), StackPointer);
++ // Use r3 (first argument register).
++ mov(StackPointer, r3);
++
++ using Fn = void (*)(ResumeFromException* rfe);
++ asMasm().setupUnalignedABICall(r4);
++ asMasm().passABIArg(r3);
++ asMasm().callWithABI<Fn, HandleException>(
++ ABIType::General, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
++
++ *returnValueCheckOffset = asMasm().currentOffset();
++
++ Label entryFrame;
++ Label catch_;
++ Label finally;
++ Label returnBaseline;
++ Label returnIon;
++ Label bailout;
++ Label wasmInterpEntry;
++ Label wasmCatch;
++
++ load32(Address(StackPointer, ResumeFromException::offsetOfKind()), r3);
++ asMasm().branch32(Assembler::Equal, r3,
++ Imm32(ExceptionResumeKind::EntryFrame), &entryFrame);
++ asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Catch),
++ &catch_);
++ asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Finally),
++ &finally);
++ asMasm().branch32(Assembler::Equal, r3,
++ Imm32(ExceptionResumeKind::ForcedReturnBaseline),
++ &returnBaseline);
++ asMasm().branch32(Assembler::Equal, r3,
++ Imm32(ExceptionResumeKind::ForcedReturnIon), &returnIon);
++ asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Bailout),
++ &bailout);
++ asMasm().branch32(Assembler::Equal, r3,
++ Imm32(ExceptionResumeKind::WasmInterpEntry),
++ &wasmInterpEntry);
++ asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::WasmCatch),
++ &wasmCatch);
++
++ breakpoint(); // Invalid kind.
++
++ // No exception handler. Return error from entry frame.
++ bind(&entryFrame);
++ asMasm().moveValue(MagicValue(JS_ION_ERROR), JSReturnOperand);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++ FramePointer);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++ ret();
++
++ // Catch handler.
++ bind(&catch_);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfTarget()), r3);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++ FramePointer);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++ jump(r3);
++
++ // Finally block.
++ bind(&finally);
++ ValueOperand exception = ValueOperand(r4);
++ loadValue(Address(StackPointer, ResumeFromException::offsetOfException()),
++ exception);
++
++ ValueOperand exceptionStack = ValueOperand(r5);
++ loadValue(
++ Address(StackPointer, ResumeFromException::offsetOfExceptionStack()),
++ exceptionStack);
++
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfTarget()), r3);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++ FramePointer);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++
++ pushValue(exception);
++ pushValue(exceptionStack);
++ pushValue(BooleanValue(true));
++ jump(r3);
++
++ // Forced return from baseline.
++ Label profilingInstrumentation;
++ bind(&returnBaseline);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++ FramePointer);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++ loadValue(Address(FramePointer, BaselineFrame::reverseOffsetOfReturnValue()),
++ JSReturnOperand);
++ jump(&profilingInstrumentation);
++
++ // Forced return from Ion.
++ bind(&returnIon);
++ loadValue(Address(StackPointer, ResumeFromException::offsetOfException()),
++ JSReturnOperand);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++ FramePointer);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++
++ bind(&profilingInstrumentation);
++ {
++ Label skipProfilingInstrumentation;
++ AbsoluteAddress addressOfEnabled(
++ asMasm().runtime()->geckoProfiler().addressOfEnabled());
++ asMasm().branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
++ &skipProfilingInstrumentation);
++ jump(profilerExitTail);
++ bind(&skipProfilingInstrumentation);
++ }
++
++ xs_mr(StackPointer, FramePointer);
++ // Pop FP from stack, then return (pop LR + blr).
++ loadPtr(Address(StackPointer, 0), FramePointer);
++ asMasm().addPtr(Imm32(sizeof(void*)), StackPointer);
++ ret();
++
++ // Bailout.
++ bind(&bailout);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfBailoutInfo()),
++ r5);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++ xs_li(ReturnReg, 1);
++ jump(bailoutTail);
++
++ // Wasm interp entry.
++ bind(&wasmInterpEntry);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
++ FramePointer);
++ loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
++ StackPointer);
++ movePtr(ImmWord(wasm::InterpFailInstanceReg), InstanceReg);
++ ret();
++
++ // Wasm catch.
++ bind(&wasmCatch);
++ wasm::GenerateJumpToCatchHandler(asMasm(), StackPointer, r4, r5, r6);
++}
++
++void MacroAssembler::clampDoubleToUint8(FloatRegister input, Register output) {
++ ScratchDoubleScope fpscratch(asMasm());
++
++ if (HasPOWER9()) {
++ // P9 xsmaxjdp uses Java/JS semantics (ISA v3.0B): any NaN
++ // is treated as "less than any number that is not a NaN", so
++ // xsmaxjdp(input, 0) collapses {NaN, -Inf, ≤ 0} to 0 in one insn —
++ // the "≤ 0 or NaN → 0" branch dance disappears.
++ //
++ // After the max, fctid (round-to-nearest-even per FPSCR default,
++ // matches ECMA Uint8ClampedArray's round-half-to-even) saturates
++ // out-of-int64 values to INT64_MAX. Remaining upper clamp
++ // (output > 255 → 255) is one cmpdi + isel.
++ zeroDouble(fpscratch);
++ as_xsmaxjdp(fpscratch, input, fpscratch);
++ as_fctid(fpscratch, fpscratch);
++ as_mfvsrd(output, fpscratch);
++ UseScratchRegisterScope temps(asMasm());
++ Register max255 = temps.Acquire();
++ xs_li(max255, 255);
++ as_cmpdi(output, 255);
++ as_isel(output, max255, output, GreaterThan);
++ return;
++ }
++
++ // POWER8 fallback: xsmaxjdp is unavailable, so filter NaN explicitly
++ // before fctid. Per Power ISA, fctid maps NaN to INT64_MAX, which
++ // would clamp to 255 instead of the spec-required 0.
++ Label positive, below255, done;
++ zeroDouble(fpscratch);
++ branchDouble(DoubleGreaterThan, input, fpscratch, &positive);
++ {
++ move32(Imm32(0), output);
++ jump(&done);
++ }
++
++ bind(&positive);
++
++ loadConstantDouble(255.0, fpscratch);
++ branchDouble(DoubleLessThan, input, fpscratch, &below255);
++ {
++ move32(Imm32(255), output);
++ jump(&done);
++ }
++
++ bind(&below255);
++
++ as_fctid(fpscratch, input);
++ as_mfvsrd(output, fpscratch);
++ bind(&done);
++}
++
++void MacroAssembler::subFromStackPtr(Imm32 imm32) {
++ if (imm32.value) {
++ asMasm().subPtr(imm32, StackPointer);
++ }
++}
++
++//{{{ check_macroassembler_style
++
++void MacroAssembler::widenInt32(Register r) {
++ move32To64SignExtend(r, Register64(r));
++}
++
++// Stack operations.
++void MacroAssembler::Push(Register reg) {
++ push(reg);
++ adjustFrame(int32_t(sizeof(intptr_t)));
++}
++void MacroAssembler::Push(const Imm32 imm) {
++ push(imm);
++ adjustFrame(int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::Push(const ImmWord imm) {
++ push(imm);
++ adjustFrame(int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::Push(const ImmPtr imm) {
++ Push(ImmWord(uintptr_t(imm.value)));
++}
++
++void MacroAssembler::Push(const ImmGCPtr ptr) {
++ push(ptr);
++ adjustFrame(int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::PushBoxed(FloatRegister reg) {
++ subFromStackPtr(Imm32(sizeof(double)));
++ boxDouble(reg, Address(getStackPointer(), 0));
++ adjustFrame(sizeof(double));
++}
++
++void MacroAssembler::Pop(Register reg) {
++ pop(reg);
++ adjustFrame(-int32_t(sizeof(intptr_t)));
++}
++void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
++ int32_t diff =
++ set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
++ const int32_t reserved = diff;
++
++ reserveStack(reserved);
++ for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
++ diff -= sizeof(intptr_t);
++ storePtr(*iter, Address(StackPointer, diff));
++ }
++
++ // Natural per-kind slot — 8 bytes for Single/Double via stfd, 16 bytes
++ // for Simd128 via stxvx. RegisterDump::FPUArray is sized 32 × 8 = 256
++ // bytes (sizeof(RegisterContent) is 8 — no v128 in the union), so
++ // f_K's stfd slot lands at the right offset. Bailout AllRegs excludes
++ // Simd128 (Ion has no SIMD live), so the FP region in bailout frames
++ // is strictly Float-only.
++ for (FloatRegisterBackwardIterator iter(set.fpus().reduceSetForPush());
++ iter.more(); ++iter) {
++ FloatRegister reg = *iter;
++ diff -= reg.size();
++ if (reg.isSimd128()) {
++ storeUnalignedSimd128(reg, Address(StackPointer, diff));
++ } else {
++ storeDouble(reg.asDouble(), Address(StackPointer, diff));
++ }
++ }
++ MOZ_ASSERT(diff == 0);
++}
++void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
++ LiveRegisterSet ignore) {
++ int32_t diff =
++ set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
++ const int32_t reserved = diff;
++
++ for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
++ diff -= sizeof(intptr_t);
++ if (!ignore.has(*iter)) {
++ loadPtr(Address(StackPointer, diff), *iter);
++ }
++ }
++
++ // Natural per-kind slot. See PushRegsInMask comment.
++ for (FloatRegisterBackwardIterator iter(set.fpus().reduceSetForPush());
++ iter.more(); ++iter) {
++ FloatRegister reg = *iter;
++ diff -= reg.size();
++ if (!ignore.has(reg)) {
++ if (reg.isSimd128()) {
++ loadUnalignedSimd128(Address(StackPointer, diff), reg);
++ } else {
++ loadDouble(Address(StackPointer, diff), reg.asDouble());
++ }
++ }
++ }
++ MOZ_ASSERT(diff == 0);
++ freeStack(reserved);
++}
++
++// Call operations.
++CodeOffset MacroAssembler::call(Register reg) {
++ // ELFv2 ABI: r12 must hold the target address at function entry
++ // so the callee can compute its TOC pointer from r12.
++ if (reg != CallReg) {
++ movePtr(reg, CallReg);
++ }
++ xs_mtctr(CallReg);
++ as_bctr(LinkB);
++ return CodeOffset(currentOffset());
++}
++CodeOffset MacroAssembler::call(Label* label) {
++ if (label->bound()) {
++ // Open the no-pool window BEFORE computing the displacement.
++ // enterNoPool() can itself trigger a pending pool flush, advancing
++ // currentOffset(). A pre-flush displacement emitted at the post-flush
++ // position would overshoot the target by poolSize bytes.
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ int32_t offset = label->offset() - currentOffset();
++ // Call instruction goes at inst[9] in the 10-word stanza.
++ int32_t callOffset = offset - 9 * (int32_t)sizeof(uint32_t);
++ if (JOffImm26::IsInRange(callOffset)) {
++ // Short: 9 nops + bl = 10 instructions.
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ as_b(JOffImm26(callOffset), RelativeBranch, LinkB);
++ m_buffer.leaveNoPool();
++ return CodeOffset(currentOffset());
++ }
++ // Long call to bound label: stanza(8) + mtctr + bctrl = 10 instructions.
++ BufferOffset bo =
++ emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
++ xs_mtctr(SecondScratchReg);
++ as_bctr(LinkB);
++ m_buffer.leaveNoPool();
++ addLongJump(bo, BufferOffset(label->offset()));
++ return CodeOffset(currentOffset());
++ }
++ // Emit a CallTag stanza: trap + chain + 8 nops (10 instructions total).
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ BufferOffset bo = xs_trap_tagged(CallTag);
++ writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ if (!oom()) {
++ label->use(bo.getOffset());
++ }
++ return CodeOffset(currentOffset());
++}
++CodeOffset MacroAssembler::call(const Address& addr) {
++ loadPtr(addr, CallReg);
++ return call(CallReg);
++}
++
++void MacroAssembler::call(ImmPtr target) {
++ uint64_t addr = uintptr_t(target.value);
++ // stanza(8) + mtctr + bctrl = 10 instructions.
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ BufferOffset bo = emitLoad64Stanza(CallReg, addr);
++ addPendingJump(bo, target, RelocationKind::HARDCODED);
++ xs_mtctr(CallReg);
++ as_bctr(LinkB);
++ m_buffer.leaveNoPool();
++}
++
++CodeOffset MacroAssembler::call(wasm::SymbolicAddress target) {
++ movePtr(target, CallReg);
++ return call(CallReg);
++}
++
++void MacroAssembler::callWithABINoProfiler(const Address& fun, ABIType result) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(fun, scratch);
++
++ uint32_t stackAdjust;
++ callWithABIPre(&stackAdjust);
++ call(scratch);
++ callWithABIPost(stackAdjust, result);
++}
++
++void MacroAssembler::callWithABIPre(uint32_t* stackAdjust, bool callFromWasm) {
++ MOZ_ASSERT(inCall_);
++ uint32_t stackForCall = abiArgs_.stackBytesConsumedSoFar();
++
++ // Reserve place for LR save.
++ stackForCall += sizeof(intptr_t);
++
++ if (dynamicAlignment_) {
++ stackForCall += ComputeByteAlignment(stackForCall, ABIStackAlignment);
++ } else {
++ uint32_t alignmentAtPrologue = callFromWasm ? sizeof(wasm::Frame) : 0;
++ stackForCall += ComputeByteAlignment(
++ stackForCall + framePushed() + alignmentAtPrologue, ABIStackAlignment);
++ }
++
++ *stackAdjust = stackForCall;
++ reserveStack(stackForCall);
++
++ // Save LR. Restore it in callWithABIPost.
++ {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ xs_mflr(scratch);
++ storePtr(scratch, Address(StackPointer, stackForCall - sizeof(intptr_t)));
++ }
++
++ // Position all arguments.
++ {
++ enoughMemory_ &= moveResolver_.resolve();
++ if (!enoughMemory_) {
++ return;
++ }
++
++ MoveEmitter emitter(*this);
++ emitter.emit(moveResolver_);
++ emitter.finish();
++ }
++
++ assertStackAlignment(ABIStackAlignment);
++}
++
++void MacroAssembler::callWithABIPost(uint32_t stackAdjust, ABIType result) {
++ {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(Address(StackPointer, stackAdjust - sizeof(intptr_t)), scratch);
++ xs_mtlr(scratch);
++ }
++
++ if (dynamicAlignment_) {
++ // Restore SP from stack (as stored in setupUnalignedABICall).
++ loadPtr(Address(StackPointer, stackAdjust), StackPointer);
++ adjustFrame(-stackAdjust);
++ } else {
++ freeStack(stackAdjust);
++ }
++
++#ifdef DEBUG
++ MOZ_ASSERT(inCall_);
++ inCall_ = false;
++#endif
++}
++
++// Value operations.
++void MacroAssembler::moveValue(const ValueOperand& src,
++ const ValueOperand& dest) {
++ if (src.valueReg() != dest.valueReg()) {
++ movePtr(src.valueReg(), dest.valueReg());
++ }
++}
++void MacroAssembler::moveValue(const Value& src, const ValueOperand& dest) {
++ if (!src.isGCThing()) {
++ movePtr(ImmWord(src.asRawBits()), dest.valueReg());
++ return;
++ }
++ CodeOffset off = movWithPatch(ImmWord(src.asRawBits()), dest.valueReg());
++ writeDataRelocation(off, src);
++}
++
++// Branch operations.
++void MacroAssembler::branchTestValue(Condition cond, const ValueOperand& lhs,
++ const Value& rhs, Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ MOZ_ASSERT(!rhs.isNaN());
++
++ if (!rhs.isGCThing()) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(lhs.valueReg() != scratch);
++ movePtr(ImmWord(rhs.asRawBits()), scratch);
++ branchPtr(cond, lhs.valueReg(), scratch, label);
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(lhs.valueReg() != scratch);
++ moveValue(rhs, ValueOperand(scratch));
++ branchPtr(cond, lhs.valueReg(), scratch, label);
++ }
++}
++void MacroAssembler::branchTestNaNValue(Condition cond, const ValueOperand& val,
++ Register temp, Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(val.valueReg() != scratch);
++
++ // Strip the IEEE sign bit (LSB-numbering bit 63 = PPC-numbering bit 0)
++ // with rldicl SH=0, MB=1: rotate by zero (no-op) then keep bits 1..63 of
++ // PPC-numbering, clearing bit 0. Rotating by 1 instead would also shift
++ // the quiet-NaN bit out of position and cause 1.5 (0x3FF8...) and NaN
++ // (0x7FF8...) to collide after masking — bug 1943704 PPC64 regression.
++ as_rldicl(temp, val.valueReg(), 0, 1);
++
++ // Load canonical NaN (with sign bit 0) and strip its sign bit too.
++ static_assert(JS::detail::CanonicalizedNaNSignBit == 0);
++ moveValue(DoubleValue(JS::GenericNaN()), ValueOperand(scratch));
++ as_rldicl(scratch, scratch, 0, 1);
++
++ branchPtr(cond, temp, scratch, label);
++}
++
++void MacroAssembler::branchPtrInNurseryChunk(Condition cond, Register ptr,
++ Register temp, Label* label) {
++ MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
++ MOZ_ASSERT(ptr != temp);
++ MOZ_ASSERT(temp != InvalidReg);
++
++ andPtr(Imm32(int32_t(~gc::ChunkMask)), ptr, temp);
++ branchPtr(InvertCondition(cond), Address(temp, gc::ChunkStoreBufferOffset),
++ ImmWord(0), label);
++}
++void MacroAssembler::branchValueIsNurseryCell(Condition cond,
++ ValueOperand value, Register temp,
++ Label* label) {
++ branchValueIsNurseryCellImpl(cond, value, temp, label);
++}
++
++// Patching / near address operations.
++CodeOffset MacroAssembler::nopPatchableToCall() {
++ // Emit 10 nops that can be patched to a call stanza:
++ // 8 load64 nops + mtctr nop + bctrl nop
++ // Return offset AFTER the stanza (= the return address).
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ return CodeOffset(currentOffset());
++}
++CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
++ CodeOffset offset(currentOffset());
++ emitLoad64Stanza(dest, 0);
++ return offset;
++}
++// static
++void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
++ CodeLocationLabel target) {
++ Instruction* inst = (Instruction*)loc.raw();
++ UpdateLoad64Value(inst, (uint64_t)target.raw());
++}
++
++// Return address operations (link register architectures).
++//
++// Note: these MUST decrement SP by exactly 8 bytes. wasm::Frame is 16 bytes
++// (callerFP_ + returnAddress_) and GenerateCallablePrologue pairs this with
++// push(FramePointer) to match that layout exactly — a 16-byte decrement here
++// would insert 8 bytes of padding and break FP-chain unwinding. The 8-byte
++// intermediate misalignment between this save and the following push(FP) is
++// never observed by a C call (no intervening transition), and any caller that
++// does make a C call after pushReturnAddress routes through
++// setupUnalignedABICall which re-aligns.
++void MacroAssembler::pushReturnAddress() {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ xs_mflr(scratch);
++ push(scratch);
++}
++void MacroAssembler::popReturnAddress() {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ pop(scratch);
++ xs_mtlr(scratch);
++}
++
++// ABI setup.
++void MacroAssembler::setupUnalignedABICall(Register scratch) {
++ MOZ_ASSERT(!IsCompilingWasm(), "wasm should only use aligned ABI calls");
++ setupNativeABICall();
++ dynamicAlignment_ = true;
++
++ movePtr(StackPointer, scratch);
++
++ // Force sp to be aligned.
++ subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
++ andPtr(Imm32(~(ABIStackAlignment - 1)), StackPointer);
++ storePtr(scratch, Address(StackPointer, 0));
++}
++
++// ===============================================================
++// Arithmetic helpers.
++
++void MacroAssembler::flexibleDivMod32(Register lhs, Register rhs,
++ Register divOutput, Register remOutput,
++ bool isUnsigned, const LiveRegisterSet&) {
++ MOZ_ASSERT(lhs != divOutput && lhs != remOutput, "lhs is preserved");
++ MOZ_ASSERT(rhs != divOutput && rhs != remOutput, "rhs is preserved");
++
++ // PPC64 has no modulus instruction. Compute: rem = lhs - (lhs/rhs)*rhs
++ // PPC64 divw(INT32_MIN, -1) is undefined; quotient=INT32_MIN, remainder=0.
++ Label done;
++ if (!isUnsigned) {
++ Label notMinOverflow;
++ branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), ¬MinOverflow);
++ branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinOverflow);
++ move32(Imm32(INT32_MIN), divOutput);
++ move32(Imm32(0), remOutput);
++ jump(&done);
++ bind(¬MinOverflow);
++ }
++ if (isUnsigned) {
++ as_divwu(divOutput, lhs, rhs);
++ } else {
++ as_divw(divOutput, lhs, rhs);
++ }
++ as_extsw(divOutput, divOutput);
++ if (HasPOWER9()) {
++ if (isUnsigned) {
++ as_moduw(remOutput, lhs, rhs);
++ } else {
++ as_modsw(remOutput, lhs, rhs);
++ }
++ } else {
++ as_mullw(remOutput, divOutput, rhs);
++ as_subf(remOutput, remOutput, lhs);
++ }
++ as_extsw(remOutput, remOutput);
++ bind(&done);
++}
++
++void MacroAssembler::shiftIndex32AndAdd(Register indexTemp32, int shift,
++ Register pointer) {
++ if (IsShiftInScaleRange(shift)) {
++ computeEffectiveAddress(
++ BaseIndex(pointer, indexTemp32, ShiftToScale(shift)), pointer);
++ return;
++ }
++ lshift32(Imm32(shift), indexTemp32);
++ addPtr(indexTemp32, pointer);
++}
++
++void MacroAssembler::convertInt64ToDouble(Register64 src, FloatRegister dest) {
++ as_mtvsrd(dest, src.reg);
++ as_fcfid(dest, dest);
++}
++
++void MacroAssembler::nearbyIntDouble(RoundingMode mode, FloatRegister src,
++ FloatRegister dest) {
++ switch (mode) {
++ case RoundingMode::NearestTiesToEven: {
++ // PPC64's frin rounds ties away from zero, NOT to even (ISA v3.1).
++ // Use fctid+fcfid which uses FPSCR RN (default = round-to-nearest-even).
++ // Guard: if |src| >= 2^52, value is already integral (or NaN/Inf) —
++ // just copy src. This preserves NaN, Inf, and -0.
++ // Check via integer exponent extraction to avoid FP temp conflicts.
++ Label done;
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ moveDouble(src, ScratchDoubleReg);
++ if (src != dest) {
++ moveDouble(src, dest);
++ }
++ if (HasPOWER9()) {
++ // xsxexpdp lays the 11-bit biased exponent in XT.dw0 with the
++ // rest zeroed, so mfvsrd reads it directly — drops the
++ // srdi+andi. masking pair.
++ ScratchSimd128Scope expScratch(*this);
++ as_xsxexpdp(expScratch, ScratchDoubleReg);
++ as_mfvsrd(scratch, expScratch);
++ } else {
++ as_mfvsrd(scratch, ScratchDoubleReg);
++ x_srdi(scratch, scratch, 52);
++ as_andi_rc(scratch, scratch, 0x7FF);
++ }
++ // Biased exponent >= 1075 (= 1023+52) means |val| >= 2^52.
++ // Also catches Inf (exp=2047) and NaN (exp=2047).
++ ma_cmp(scratch, Imm32(1075), Assembler::GreaterThanOrEqual);
++ ma_b(Assembler::GreaterThanOrEqual, &done);
++ as_fctid(dest, ScratchDoubleReg);
++ as_fcfid(dest, dest);
++ as_fcpsgn(dest, ScratchDoubleReg, dest);
++ bind(&done);
++ break;
++ }
++ case RoundingMode::TowardsZero:
++ as_friz(dest, src);
++ break;
++ case RoundingMode::Up:
++ as_frip(dest, src);
++ break;
++ case RoundingMode::Down:
++ as_frim(dest, src);
++ break;
++ default:
++ MOZ_CRASH("Unexpected rounding mode");
++ }
++}
++
++void MacroAssembler::nearbyIntFloat32(RoundingMode mode, FloatRegister src,
++ FloatRegister dest) {
++ // PPC FP rounding instructions operate on double-precision.
++ // For single-precision, we round as double then round back to single.
++ // The frsp instruction handles the double->single conversion.
++ nearbyIntDouble(mode, src, dest);
++ as_frsp(dest, dest);
++}
++
++// ===============================================================
++// Far jump support.
++
++CodeOffset MacroAssembler::farJumpWithPatch() {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ // stanza(8) + mtctr + bctr = 10 instructions.
++ CodeOffset loadOffset(currentOffset());
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ emitLoad64Stanza(scratch, 0);
++ xs_mtctr(scratch);
++ as_bctr();
++ m_buffer.leaveNoPool();
++
++ return loadOffset;
++}
++
++// ===============================================================
++void MacroAssembler::flush() { Assembler::flush(); }
++
++// Wasm support.
++
++FaultingCodeOffset MacroAssembler::wasmTrapInstruction() {
++ m_buffer.flushPool(); // see comment in wasmLoadImpl
++ FaultingCodeOffset fco = FaultingCodeOffset(currentOffset());
++ xs_trap();
++ return fco;
++}
++
++// PPC64 SlowCallMarker: `ori r0, r0, 0` -- a NOP-like instruction
++// that won't appear in normal code generation.
++// ori r0, r0, 0 = 0x60000000 -- that's actually PPC_nop.
++// Use a distinguishable encoding: `ori r12, r12, 0` = 0x618C0000
++static const int32_t SlowCallMarker = 0x618C0000;
++
++void MacroAssembler::wasmMarkCallAsSlow() {
++ // Emit: ori r12, r12, 0
++ as_ori(CallReg, CallReg, 0);
++}
++
++void MacroAssembler::wasmCheckSlowCallsite(Register ra_, Label* notSlow,
++ Register temp1, Register temp2) {
++ MOZ_ASSERT(ra_ != temp2);
++ load32(Address(ra_, 0), temp2);
++ branch32(Assembler::NotEqual, temp2, Imm32(SlowCallMarker), notSlow);
++}
++
++CodeOffset MacroAssembler::wasmMarkedSlowCall(const wasm::CallSiteDesc& desc,
++ const Register reg) {
++ CodeOffset offset = call(desc, reg);
++ wasmMarkCallAsSlow();
++ return offset;
++}
++
++// ===============================================================
++// Additional stack operations.
++
++void MacroAssembler::Push(FloatRegister f) {
++ push(f);
++ adjustFrame(int32_t(sizeof(double)));
++}
++void MacroAssembler::Pop(FloatRegister f) {
++ pop(f);
++ adjustFrame(-int32_t(sizeof(double)));
++}
++void MacroAssembler::Pop(const ValueOperand& val) {
++ popValue(val);
++ adjustFrame(-int32_t(sizeof(Value)));
++}
++
++// static
++size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
++ return set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
++}
++
++void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
++ Register scratch) {
++ FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
++ mozilla::DebugOnly<unsigned> numFpu = fpuSet.size();
++ mozilla::DebugOnly<int32_t> diffF = fpuSet.getPushSizeInBytes();
++ mozilla::DebugOnly<int32_t> diffG = set.gprs().size() * sizeof(intptr_t);
++
++ MOZ_ASSERT(dest.offset >= diffG + diffF);
++
++ for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
++ diffG -= sizeof(intptr_t);
++ dest.offset -= sizeof(intptr_t);
++ storePtr(*iter, dest);
++ }
++ MOZ_ASSERT(diffG == 0);
++
++ // Natural per-kind slot. See PushRegsInMask comment.
++ for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
++ FloatRegister reg = *iter;
++ diffF -= reg.size();
++ numFpu -= 1;
++ dest.offset -= reg.size();
++ if (reg.isSimd128()) {
++ storeUnalignedSimd128(reg, dest);
++ } else {
++ storeDouble(reg.asDouble(), dest);
++ }
++ }
++ MOZ_ASSERT(diffF == 0);
++}
++
++void MacroAssembler::freeStackTo(uint32_t framePushed) {
++ MOZ_ASSERT(framePushed <= framePushed_);
++ // SP = FP - framePushed
++ movePtr(FramePointer, StackPointer);
++ if (framePushed) {
++ subPtr(Imm32(framePushed), StackPointer);
++ }
++ framePushed_ = framePushed;
++}
++
++// ===============================================================
++// Additional call / patch operations.
++
++void MacroAssembler::call(JitCode* c) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ uint64_t addr = uintptr_t(c->raw());
++ BufferOffset bo = emitLoad64Stanza(scratch, addr);
++ addPendingJump(bo, ImmPtr(c->raw()), RelocationKind::JITCODE);
++
++ callJitNoProfiler(scratch);
++}
++
++CodeOffset MacroAssembler::callWithPatch() {
++ // Emit a CallTag-sized stanza of nops. Will be patched by patchCall.
++ // Return offset AFTER the stanza (= the return address when bl executes).
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ return CodeOffset(currentOffset());
++}
++
++void MacroAssembler::patchCall(uint32_t callerOffset, uint32_t calleeOffset) {
++ // callerOffset points AFTER the 10-instruction stanza (the return address).
++ // Subtract to find the stanza start. The `bl` goes at inst[9].
++ uint32_t stanzaStart = callerOffset - 10 * sizeof(uint32_t);
++ Instruction* i0 = (Instruction*)(m_buffer.getInst(BufferOffset(stanzaStart)));
++ // bl offset is relative to inst[9], which is at stanzaStart + 36.
++ intptr_t blAddr = (intptr_t)stanzaStart + 9 * (intptr_t)sizeof(uint32_t);
++ intptr_t callOffset = (intptr_t)calleeOffset - blAddr;
++ if (JOffImm26::IsInRange(callOffset)) {
++ i0[0].makeNop();
++ i0[1].makeNop();
++ i0[2].makeNop();
++ i0[3].makeNop();
++ i0[4].makeNop();
++ i0[5].makeNop();
++ i0[6].makeNop();
++ i0[7].makeNop();
++ i0[8].makeNop();
++ i0[9].setData(PPC_b | JOffImm26(callOffset).encode() | LinkB);
++ } else {
++ addLongJump(BufferOffset(stanzaStart), BufferOffset(calleeOffset));
++ WriteLoad64Instructions(i0, SecondScratchReg, LabelBase::INVALID_OFFSET);
++ i0[8].makeOp_mtctr(SecondScratchReg);
++ i0[9].makeOp_bctr(LinkB);
++ }
++}
++
++void MacroAssembler::patchFarJump(CodeOffset farJump, uint32_t targetOffset) {
++ Instruction* inst =
++ (Instruction*)m_buffer.getInst(BufferOffset(farJump.offset()));
++ // Extract the destination register from the existing stanza. Both shapes
++ // encode rD at LE bits [21..25] of their first "register-touching" slot:
++ // P8 = mflr rD at [2], P9+ = addpcis rD at [0]. Major opcode of slot [0]
++ // distinguishes (31 = mfspr, 19 = addpcis).
++ uint32_t i0 = inst[0].encode();
++ uint32_t regCode = (((i0 >> 26) & 0x3f) == 19)
++ ? ((i0 >> 21) & 0x1f)
++ : ((inst[2].encode() >> 21) & 0x1f);
++ Register reg = Register::FromCode(regCode);
++ WriteLoad64Instructions(inst, reg, LabelBase::INVALID_OFFSET);
++ addLongJump(BufferOffset(farJump.offset()), BufferOffset(targetOffset));
++}
++
++// static
++void MacroAssembler::patchFarJump(uint8_t* farJump, uint8_t* target) {
++ UpdateLoad64Value((Instruction*)farJump, (uint64_t)(uintptr_t)target);
++ FlushICache(farJump, 8 * sizeof(Instruction));
++}
++
++// static
++void MacroAssembler::patchNopToCall(uint8_t* callsite, uint8_t* target) {
++ // callsite points AFTER the 10-instruction stanza. Subtract to find start.
++ Instruction* inst = (Instruction*)callsite - 10;
++ WriteLoad64Instructions(inst, SecondScratchReg, (uint64_t)(uintptr_t)target);
++ inst[8].makeOp_mtctr(SecondScratchReg);
++ inst[9].makeOp_bctr(LinkB);
++ FlushICache(inst, 10 * sizeof(Instruction));
++}
++
++// static
++void MacroAssembler::patchCallToNop(uint8_t* callsite) {
++ // callsite points AFTER the 10-instruction stanza. Subtract to find start.
++ Instruction* inst = (Instruction*)callsite - 10;
++ for (int i = 0; i < 10; i++) {
++ inst[i].makeNop();
++ }
++ FlushICache(inst, 10 * sizeof(Instruction));
++}
++
++void MacroAssembler::patchMove32(CodeOffset offset, Imm32 n) {
++ // Patch an 8-instruction load64 sequence with a 32-bit value.
++ Instruction* inst =
++ (Instruction*)m_buffer.getInst(BufferOffset(offset.offset()));
++ UpdateLoad64Value(inst, uint64_t(int64_t(n.value)));
++}
++
++uint32_t MacroAssembler::pushFakeReturnAddress(Register scratch) {
++ CodeLabel cl;
++
++ // Use mov(CodeLabel*, Register) which always emits a full 8-instruction
++ // load64 sequence (via NOPs + WriteLoad64Instructions). This is critical
++ // because movePtr(ImmWord(0)) would optimize to a single li instruction,
++ // but processCodeLabels->Bind->UpdateLoad64Value expects the full
++ // 8-instruction literal pool sequence at the patchAt offset.
++ mov(&cl, scratch);
++
++ Push(scratch);
++
++ bind(&cl);
++ uint32_t retAddr = currentOffset();
++
++ addCodeLabel(cl);
++ return retAddr;
++}
++
++void MacroAssembler::callWithABINoProfiler(Register fun, ABIType result) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // Save fun to scratch since fun might be clobbered by callWithABIPre.
++ movePtr(fun, scratch);
++
++ uint32_t stackAdjust;
++ callWithABIPre(&stackAdjust);
++ call(scratch);
++ callWithABIPost(stackAdjust, result);
++}
++
++// ===============================================================
++// Additional arithmetic helpers.
++
++void MacroAssembler::flexibleRemainder32(Register lhs, Register rhs,
++ Register dest, bool isUnsigned,
++ const LiveRegisterSet&) {
++ // rem = lhs - (lhs/rhs)*rhs
++ // PPC64 divw(INT32_MIN, -1) is undefined; result is 0.
++ Label done;
++ if (!isUnsigned) {
++ Label notMinOverflow;
++ branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), ¬MinOverflow);
++ branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinOverflow);
++ move32(Imm32(0), dest);
++ jump(&done);
++ bind(¬MinOverflow);
++ }
++ if (HasPOWER9()) {
++ if (isUnsigned) {
++ as_moduw(dest, lhs, rhs);
++ } else {
++ as_modsw(dest, lhs, rhs);
++ }
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (isUnsigned) {
++ as_divwu(scratch, lhs, rhs);
++ } else {
++ as_divw(scratch, lhs, rhs);
++ }
++ as_mullw(scratch, scratch, rhs);
++ as_subf(dest, scratch, lhs);
++ }
++ as_extsw(dest, dest);
++ bind(&done);
++}
++
++void MacroAssembler::flexibleQuotientPtr(Register lhs, Register rhs,
++ Register dest, bool isUnsigned,
++ const LiveRegisterSet&) {
++ // PPC64 divd(INT64_MIN, -1) is undefined; return INT64_MIN to match
++ // ARM64/LoongArch64 hardware sdiv behavior.
++ Label done;
++ if (!isUnsigned) {
++ Label notMinOverflow;
++ branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), ¬MinOverflow);
++ branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinOverflow);
++ movePtr(ImmWord(INT64_MIN), dest);
++ jump(&done);
++ bind(¬MinOverflow);
++ }
++ if (isUnsigned) {
++ as_divdu(dest, lhs, rhs);
++ } else {
++ as_divd(dest, lhs, rhs);
++ }
++ bind(&done);
++}
++
++void MacroAssembler::flexibleRemainderPtr(Register lhs, Register rhs,
++ Register dest, bool isUnsigned,
++ const LiveRegisterSet&) {
++ // rem = lhs - (lhs/rhs)*rhs
++ // PPC64 divd(INT64_MIN, -1) is undefined; result is 0.
++ Label done;
++ if (!isUnsigned) {
++ Label notMinOverflow;
++ branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), ¬MinOverflow);
++ branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinOverflow);
++ movePtr(ImmWord(0), dest);
++ jump(&done);
++ bind(¬MinOverflow);
++ }
++ if (HasPOWER9()) {
++ if (isUnsigned) {
++ as_modud(dest, lhs, rhs);
++ } else {
++ as_modsd(dest, lhs, rhs);
++ }
++ } else {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ if (isUnsigned) {
++ as_divdu(scratch, lhs, rhs);
++ } else {
++ as_divd(scratch, lhs, rhs);
++ }
++ as_mulld(scratch, scratch, rhs);
++ as_subf(dest, scratch, lhs);
++ }
++ bind(&done);
++}
++
++// ===============================================================
++// Rounding helpers.
++
++void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest,
++ Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ // Round toward negative infinity, then convert to int64.
++ as_frim(fpscratch, src);
++ as_fctidz(fpscratch, fpscratch);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++
++ // Check for -0 and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ // If top 2 bits of src are set, it's negative or NaN.
++ as_mfvsrd(dest, src);
++ // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++ // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++ as_rldicl_rc(dest, dest, 2, 62);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest,
++ Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ // PPC FP rounding works on doubles. Single-precision FPRs are
++ // already in double-width registers, so frim works fine.
++ as_frim(fpscratch, src);
++ as_fctidz(fpscratch, fpscratch);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++
++ // Check for -0 and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ // src is held in the FPR as a 64-bit double (lfs widens float32 to
++ // double on load), so the same top-2-bits check used for doubles
++ // applies: bit 63 = sign, bit 62 = exponent MSB. Nonzero means -0,
++ // ±Inf, NaN, or a large magnitude — none of which is +0.
++ as_mfvsrd(dest, src);
++ // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++ // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++ as_rldicl_rc(dest, dest, 2, 62);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest,
++ Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ as_frip(fpscratch, src);
++ as_fctidz(fpscratch, fpscratch);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++
++ // Check for (-1, -0] and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ // If binary value is not zero, input was not 0 (could be -0 or NaN).
++ as_mfvsrd(dest, src);
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest,
++ Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ as_frip(fpscratch, src);
++ as_fctidz(fpscratch, fpscratch);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++
++ // Check for (-1, -0] and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ as_mfvsrd(dest, src);
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::truncDoubleToInt32(FloatRegister src, Register dest,
++ Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ as_fctidz(fpscratch, src);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++
++ // Check for -0 and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ as_mfvsrd(dest, src);
++ // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++ // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++ as_rldicl_rc(dest, dest, 2, 62);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::truncFloat32ToInt32(FloatRegister src, Register dest,
++ Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ as_fctidz(fpscratch, src);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++
++ // Check for -0 and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ as_mfvsrd(dest, src);
++ // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
++ // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
++ as_rldicl_rc(dest, dest, 2, 62);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest,
++ FloatRegister temp, Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ Label negative, end, performRound;
++
++ // Branch for negative inputs.
++ zeroDouble(fpscratch);
++ branchDouble(DoubleGreaterThanOrEqual, src, fpscratch, &performRound);
++
++ // Input is negative.
++ loadConstantDouble(-0.5, fpscratch);
++ branchDouble(DoubleGreaterThanOrEqual, src, fpscratch, fail);
++ jump(&performRound);
++
++ bind(&performRound);
++ {
++ loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
++ as_fadd(fpscratch, src, temp);
++ as_frim(fpscratch, fpscratch);
++ as_fctidz(fpscratch, fpscratch);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++ }
++ bind(&end);
++
++ // Check for -0 and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ as_mfvsrd(dest, src);
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest,
++ FloatRegister temp, Label* fail) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ Label negative, end, performRound;
++
++ // Branch for non-negative inputs.
++ loadConstantFloat32(0.0f, fpscratch);
++ branchFloat(DoubleGreaterThanOrEqual, src, fpscratch, &performRound);
++
++ // Input is negative.
++ loadConstantFloat32(-0.5f, fpscratch);
++ branchFloat(DoubleGreaterThanOrEqual, src, fpscratch, fail);
++ jump(&performRound);
++
++ bind(&performRound);
++ {
++ loadConstantFloat32(float(GetBiggestNumberLessThan(0.5)), temp);
++ as_fadds(fpscratch, src, temp);
++ as_frim(fpscratch, fpscratch);
++ as_fctidz(fpscratch, fpscratch);
++ as_mfvsrd(dest, fpscratch);
++
++ // Check if result fits in int32.
++ as_extsw(scratch, dest);
++ as_cmpd(dest, scratch);
++ ma_b(NotEqual, fail);
++ }
++ bind(&end);
++
++ // Check for -0 and NaN when result is zero.
++ Label notZero;
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, ¬Zero);
++ {
++ as_mfvsrd(dest, src);
++ as_cmpdi(dest, 0);
++ ma_b(NotEqual, fail);
++ }
++ bind(¬Zero);
++}
++
++// ===============================================================
++// FP conversion / copy-sign.
++
++void MacroAssembler::convertIntPtrToDouble(Register src, FloatRegister dest) {
++ convertInt64ToDouble(Register64(src), dest);
++}
++
++void MacroAssembler::copySignDouble(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister output) {
++ // fcpsgn frt, fra, frb: copies sign of fra to magnitude of frb.
++ // lhs = magnitude source, rhs = sign source.
++ as_fcpsgn(output, rhs, lhs);
++}
++
++void MacroAssembler::copySignFloat32(FloatRegister lhs, FloatRegister rhs,
++ FloatRegister output) {
++ as_fcpsgn(output, rhs, lhs);
++}
++
++// ===============================================================
++// GC / nursery helpers.
++
++void MacroAssembler::loadStoreBuffer(Register ptr, Register buffer) {
++ andPtr(Imm32(int32_t(~gc::ChunkMask)), ptr, buffer);
++ loadPtr(Address(buffer, gc::ChunkStoreBufferOffset), buffer);
++}
++
++void MacroAssembler::branchValueIsNurseryCell(Condition cond,
++ const Address& address,
++ Register temp, Label* label) {
++ branchValueIsNurseryCellImpl(cond, address, temp, label);
++}
++
++template <typename T>
++void MacroAssembler::branchValueIsNurseryCellImpl(Condition cond,
++ const T& value, Register temp,
++ Label* label) {
++ MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
++ MOZ_ASSERT(temp != InvalidReg);
++ Label done;
++ branchTestGCThing(Assembler::NotEqual, value,
++ cond == Assembler::Equal ? &done : label);
++
++ getGCThingValueChunk(value, temp);
++ loadPtr(Address(temp, gc::ChunkStoreBufferOffset), temp);
++ branchPtr(InvertCondition(cond), temp, ImmWord(0), label);
++
++ bind(&done);
++}
++
++// ===============================================================
++// Template instantiations.
++
++template <typename T>
++void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
++ MIRType valueType, const T& dest) {
++ MOZ_ASSERT(valueType < MIRType::Value);
++
++ if (valueType == MIRType::Double) {
++ boxDouble(value.reg().typedReg().fpu(), dest);
++ return;
++ }
++
++ if (value.constant()) {
++ storeValue(value.value(), dest);
++ } else {
++ storeValue(ValueTypeFromMIRType(valueType), value.reg().typedReg().gpr(),
++ dest);
++ }
++}
++
++template void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
++ MIRType valueType,
++ const Address& dest);
++template void MacroAssembler::storeUnboxedValue(
++ const ConstantOrRegister& value, MIRType valueType,
++ const BaseObjectElementIndex& dest);
++
++// ===============================================================
++// Misc stubs.
++
++void MacroAssembler::comment(const char* msg) {}
++
++void MacroAssembler::speculationBarrier() {
++ // isync provides execution synchronization: discards prefetched
++ // instructions and forces a refetch+reexecute past the barrier.
++ // No instruction following isync may begin (architecturally) until
++ // isync completes, blocking speculative bypass — exactly the
++ // Spectre v1 guarantee needed after a C call returns a value that
++ // may influence subsequent loads. Reachable from shared
++ // CodeGenerator under JitOptions.spectreJitToCxxCalls.
++ as_isync();
++}
++
++void MacroAssembler::atomicPause() { nop(); }
++
++void MacroAssembler::enterFakeExitFrameForWasm(Register cxreg, Register scratch,
++ ExitFrameType type) {
++ enterFakeExitFrame(cxreg, scratch, type);
++}
++
++void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
++ Register boundsCheckLimit,
++ Label* label) {
++ ma_cmp(index, boundsCheckLimit, cond);
++ ma_b(cond, label);
++}
++
++void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
++ Address boundsCheckLimit, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ load32(boundsCheckLimit, scratch);
++ ma_cmp(index, scratch, cond);
++ ma_b(cond, label);
++}
++
++void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
++ Register64 boundsCheckLimit,
++ Label* label) {
++ ma_cmp(index.reg, boundsCheckLimit.reg, cond);
++ ma_b(cond, label);
++}
++
++void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
++ Address boundsCheckLimit, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ loadPtr(boundsCheckLimit, scratch);
++ ma_cmp(index.reg, scratch, cond);
++ ma_b(cond, label);
++}
++
++CodeOffset MacroAssembler::move32WithPatch(Register dest) {
++ CodeOffset offset(currentOffset());
++ emitLoad64Stanza(dest, 0);
++ return offset;
++}
++
++CodeOffset MacroAssembler::sub32FromMemAndBranchIfNegativeWithPatch(
++ Address address, Label* label) {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != address.base);
++ load32(address, scratch);
++ // Subtract a placeholder value (will be patched).
++ // Use addi with positive placeholder (128), which will be patched to
++ // addi with negative value. The immediate is in the addi instruction.
++ as_addi(scratch, scratch, 128);
++ CodeOffset patchPoint = CodeOffset(currentOffset());
++ store32(scratch, address);
++ // Branch if result is negative (signed).
++ as_cmpwi(scratch, 0);
++ ma_b(LessThan, label);
++ return patchPoint;
++}
++
++bool MacroAssembler::convertUInt64ToDoubleNeedsTemp() { return false; }
++
++void MacroAssembler::call(ImmWord imm) { call(ImmPtr((void*)imm.value)); }
++
++void MacroAssembler::convertUInt64ToDouble(Register64 src, FloatRegister dest,
++ Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ // POWER7+ has fcfidu (unsigned i64 → f64) as a single instruction; no
++ // sign-split / branch / GPR scratch needed.
++ as_mtvsrd(dest, src.reg);
++ as_fcfidu(dest, dest);
++}
++
++void MacroAssembler::convertInt64ToFloat32(Register64 src, FloatRegister dest) {
++ as_mtvsrd(dest, src.reg);
++ as_fcfids(dest, dest);
++}
++
++void MacroAssembler::convertUInt64ToFloat32(Register64 src, FloatRegister dest,
++ Register temp) {
++ MOZ_ASSERT(temp == Register::Invalid());
++ // POWER7+ has fcfidus (unsigned i64 → f32) as a single instruction.
++ as_mtvsrd(dest, src.reg);
++ as_fcfidus(dest, dest);
++}
++
++void MacroAssembler::flexibleQuotient32(
++ Register lhs, Register rhs, Register dest, bool isUnsigned,
++ const LiveRegisterSet& volatileLiveRegs) {
++ // PPC64 divw(INT32_MIN, -1) is undefined; return INT32_MIN to match
++ // ARM64/LoongArch64 hardware sdiv behavior.
++ Label done;
++ if (!isUnsigned) {
++ Label notMinOverflow;
++ branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), ¬MinOverflow);
++ branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), ¬MinOverflow);
++ move32(Imm32(INT32_MIN), dest);
++ jump(&done);
++ bind(¬MinOverflow);
++ }
++ if (isUnsigned) {
++ as_divwu(dest, lhs, rhs);
++ } else {
++ as_divw(dest, lhs, rhs);
++ }
++ as_extsw(dest, dest);
++ bind(&done);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF32ToI32(
++ FloatRegister input, Register output, TruncFlags flags,
++ const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++ outOfLineWasmTruncateToInt32Check(input, output, MIRType::Float32, flags,
++ rejoin, trapSiteDesc);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF32ToI64(
++ FloatRegister input, Register64 output, TruncFlags flags,
++ const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++ outOfLineWasmTruncateToInt64Check(input, output, MIRType::Float32, flags,
++ rejoin, trapSiteDesc);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF64ToI32(
++ FloatRegister input, Register output, TruncFlags flags,
++ const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++ outOfLineWasmTruncateToInt32Check(input, output, MIRType::Double, flags,
++ rejoin, trapSiteDesc);
++}
++
++void MacroAssembler::oolWasmTruncateCheckF64ToI64(
++ FloatRegister input, Register64 output, TruncFlags flags,
++ const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
++ outOfLineWasmTruncateToInt64Check(input, output, MIRType::Double, flags,
++ rejoin, trapSiteDesc);
++}
++
++void MacroAssemblerPPC64Compat::outOfLineWasmTruncateToInt32Check(
++ FloatRegister input, Register output, MIRType fromType, TruncFlags flags,
++ Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc) {
++ bool isUnsigned = flags & TRUNC_UNSIGNED;
++ bool isSaturating = flags & TRUNC_SATURATING;
++
++ if (isSaturating) {
++ ScratchDoubleScope fpscratch(asMasm());
++ if (fromType == MIRType::Double) {
++ asMasm().loadConstantDouble(0.0, fpscratch);
++ } else {
++ asMasm().loadConstantFloat32(0.0f, fpscratch);
++ }
++
++ if (isUnsigned) {
++ // If input < 0 or NaN, output = 0; else output = UINT32_MAX.
++ Label notNegOrNaN;
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, ¬NegOrNaN);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, ¬NegOrNaN);
++ }
++ asMasm().move32(Imm32(0), output);
++ asMasm().jump(rejoin);
++ asMasm().bind(¬NegOrNaN);
++ asMasm().move32(Imm32(UINT32_MAX), output);
++ } else {
++ // Signed: NaN -> 0, negative overflow -> INT32_MIN,
++ // positive overflow already saturated to INT32_MAX.
++ Label notNaN, done;
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleOrdered, input, input, ¬NaN);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleOrdered, input, input, ¬NaN);
++ }
++ asMasm().move32(Imm32(0), output);
++ asMasm().jump(rejoin);
++
++ asMasm().bind(¬NaN);
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, rejoin);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, rejoin);
++ }
++ asMasm().move32(Imm32(INT32_MIN), output);
++ }
++
++ MOZ_ASSERT(rejoin->bound());
++ asMasm().jump(rejoin);
++ return;
++ }
++
++ Label inputIsNaN;
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleUnordered, input, input,
++ &inputIsNaN);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleUnordered, input, input, &inputIsNaN);
++ }
++
++ asMasm().wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);
++ asMasm().bind(&inputIsNaN);
++ asMasm().wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
++}
++
++void MacroAssemblerPPC64Compat::outOfLineWasmTruncateToInt64Check(
++ FloatRegister input, Register64 output_, MIRType fromType, TruncFlags flags,
++ Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc) {
++ bool isUnsigned = flags & TRUNC_UNSIGNED;
++ bool isSaturating = flags & TRUNC_SATURATING;
++
++ if (isSaturating) {
++ ScratchDoubleScope fpscratch(asMasm());
++ Register output = output_.reg;
++
++ if (fromType == MIRType::Double) {
++ asMasm().loadConstantDouble(0.0, fpscratch);
++ } else {
++ asMasm().loadConstantFloat32(0.0f, fpscratch);
++ }
++
++ if (isUnsigned) {
++ Label notNegOrNaN;
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, ¬NegOrNaN);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, ¬NegOrNaN);
++ }
++ asMasm().movePtr(ImmWord(0), output);
++ asMasm().jump(rejoin);
++ asMasm().bind(¬NegOrNaN);
++ asMasm().movePtr(ImmWord(UINT64_MAX), output);
++ } else {
++ Label notNaN;
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleOrdered, input, input, ¬NaN);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleOrdered, input, input, ¬NaN);
++ }
++ asMasm().movePtr(ImmWord(0), output);
++ asMasm().jump(rejoin);
++
++ asMasm().bind(¬NaN);
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, rejoin);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
++ fpscratch, rejoin);
++ }
++ asMasm().movePtr(ImmWord(INT64_MIN), output);
++ }
++
++ MOZ_ASSERT(rejoin->bound());
++ asMasm().jump(rejoin);
++ return;
++ }
++
++ Label inputIsNaN;
++ if (fromType == MIRType::Double) {
++ asMasm().branchDouble(Assembler::DoubleUnordered, input, input,
++ &inputIsNaN);
++ } else {
++ asMasm().branchFloat(Assembler::DoubleUnordered, input, input, &inputIsNaN);
++ }
++
++ asMasm().wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);
++ asMasm().bind(&inputIsNaN);
++ asMasm().wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
++}
++
++void MacroAssembler::PopStackPtr() {
++ loadPtr(Address(StackPointer, 0), StackPointer);
++ adjustFrame(-int32_t(sizeof(intptr_t)));
++}
++
++void MacroAssembler::patchSub32FromMemAndBranchIfNegative(CodeOffset offset,
++ Imm32 imm) {
++ int32_t val = imm.value;
++ MOZ_RELEASE_ASSERT(val >= 1 && val <= 127);
++ // Patch the addi instruction that's right before patchPoint.
++ // addi is 1 instruction before the CodeOffset (which is after the addi).
++ Instruction* inst =
++ (Instruction*)m_buffer.getInst(BufferOffset(offset.offset() - 4));
++ // Rewrite the immediate field to -val.
++ // PPC addi: opcode(6) | RT(5) | RA(5) | SI(16)
++ uint32_t instWord = inst->encode();
++ uint32_t base = instWord & 0xffff0000;
++ inst->setData(base | (uint16_t)(-val & 0xffff));
++}
++
++void MacroAssembler::wasmTruncateDoubleToInt32(FloatRegister input,
++ Register output,
++ bool isSaturating,
++ Label* oolEntry) {
++ ScratchDoubleScope fpscratch(asMasm());
++ // Clear VXCVI (bit 23) before the conversion so we can detect overflow.
++ as_mtfsb0(23);
++ as_fctiwz(fpscratch, input);
++ as_mfvsrd(output, fpscratch);
++ as_extsw(output, output);
++ // Move FPSCR field 5 (which contains VXCVI) to CR0.
++ // If the conversion was invalid (NaN or out-of-range), VXCVI=1 → SO set.
++ as_mcrfs(cr0, 5);
++ ma_b(SOBit, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateDoubleToUInt32(FloatRegister input,
++ Register output,
++ bool isSaturating,
++ Label* oolEntry) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ // Always check for NaN — the ool handler clamps for saturating mode.
++ as_fcmpu(input, input);
++ ma_b(DoubleUnordered, oolEntry);
++ as_fctidz(fpscratch, input);
++ as_mfvsrd(output, fpscratch);
++ x_srdi(scratch, output, 32);
++ as_extsw(output, output);
++ as_cmpdi(scratch, 0);
++ ma_b(NotEqual, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateFloat32ToInt32(FloatRegister input,
++ Register output,
++ bool isSaturating,
++ Label* oolEntry) {
++ ScratchDoubleScope fpscratch(asMasm());
++ as_mtfsb0(23);
++ as_fctiwz(fpscratch, input);
++ as_mfvsrd(output, fpscratch);
++ as_extsw(output, output);
++ as_mcrfs(cr0, 5);
++ ma_b(SOBit, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateFloat32ToUInt32(FloatRegister input,
++ Register output,
++ bool isSaturating,
++ Label* oolEntry) {
++ ScratchDoubleScope fpscratch(asMasm());
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++ as_fcmpu(input, input);
++ ma_b(DoubleUnordered, oolEntry);
++ as_fctidz(fpscratch, input);
++ as_mfvsrd(output, fpscratch);
++ x_srdi(scratch, output, 32);
++ as_extsw(output, output);
++ as_cmpdi(scratch, 0);
++ ma_b(NotEqual, oolEntry);
++}
++
++void MacroAssembler::wasmTruncateDoubleToInt64(
++ FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++ Label* oolRejoin, FloatRegister tempDouble) {
++ MOZ_ASSERT(tempDouble.isInvalid());
++ ScratchDoubleScope fpscratch(asMasm());
++ as_mtfsb0(23);
++ as_fctidz(fpscratch, input);
++ as_mfvsrd(output.reg, fpscratch);
++ as_mcrfs(cr0, 5);
++ ma_b(SOBit, oolEntry);
++ if (isSaturating) {
++ bind(oolRejoin);
++ }
++}
++
++void MacroAssembler::wasmTruncateFloat32ToInt64(
++ FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++ Label* oolRejoin, FloatRegister tempFloat) {
++ MOZ_ASSERT(tempFloat.isInvalid());
++ ScratchDoubleScope fpscratch(asMasm());
++ as_mtfsb0(23);
++ as_fctidz(fpscratch, input);
++ as_mfvsrd(output.reg, fpscratch);
++ as_mcrfs(cr0, 5);
++ ma_b(SOBit, oolEntry);
++ if (isSaturating) {
++ bind(oolRejoin);
++ }
++}
++
++void MacroAssembler::wasmTruncateDoubleToUInt64(
++ FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++ Label* oolRejoin, FloatRegister tempDouble) {
++ MOZ_ASSERT(tempDouble.isInvalid());
++ ScratchDoubleScope fpscratch(asMasm());
++ as_mtfsb0(23);
++ as_fctiduz(fpscratch, input);
++ as_mfvsrd(output.reg, fpscratch);
++ as_mcrfs(cr0, 5);
++ ma_b(SOBit, oolEntry);
++ if (isSaturating) {
++ bind(oolRejoin);
++ }
++}
++
++void MacroAssembler::wasmTruncateFloat32ToUInt64(
++ FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
++ Label* oolRejoin, FloatRegister tempFloat) {
++ MOZ_ASSERT(tempFloat.isInvalid());
++ ScratchDoubleScope fpscratch(asMasm());
++ as_mtfsb0(23);
++ as_fctiduz(fpscratch, input);
++ as_mfvsrd(output.reg, fpscratch);
++ as_mcrfs(cr0, 5);
++ ma_b(SOBit, oolEntry);
++ if (isSaturating) {
++ bind(oolRejoin);
++ }
++}
++
++void MacroAssemblerPPC64Compat::profilerEnterFrame(Register framePtr,
++ Register scratch) {
++ asMasm().loadJSContext(scratch);
++ loadPtr(Address(scratch, offsetof(JSContext, profilingActivation_)), scratch);
++ storePtr(framePtr,
++ Address(scratch, JitActivation::offsetOfLastProfilingFrame()));
++ storePtr(ImmPtr(nullptr),
++ Address(scratch, JitActivation::offsetOfLastProfilingCallSite()));
++}
++
++void MacroAssemblerPPC64Compat::profilerExitFrame() {
++ jump(asMasm().runtime()->jitRuntime()->getProfilerExitFrameTail());
++}
++
++void MacroAssemblerPPC64Compat::ma_mod_mask(Register src, Register dest,
++ Register hold, Register remain,
++ int32_t shift, Label* negZero) {
++ // Compute x % ((1<<shift) - 1) by digit-summing in base b = 1<<shift.
++ // Since b % (b-1) == 1, x % (b-1) == sum of base-b digits of x, mod (b-1).
++ int32_t mask = (1 << shift) - 1;
++ Label head, negative, sumSigned, done;
++
++ as_or_(remain, src, src); // move src -> remain
++ xs_li(dest, 0);
++
++ // Check sign (32-bit signed comparison)
++ as_cmpwi(remain, 0);
++ ma_b(Assembler::LessThan, &negative);
++ xs_li(hold, 1);
++ jump(&head);
++
++ bind(&negative);
++ xs_li(hold, -1);
++ as_neg(remain, remain);
++ as_rldicl(remain, remain, 0, 32);
++
++ bind(&head);
++ {
++ UseScratchRegisterScope temps(asMasm());
++ Register scratch = temps.Acquire();
++
++ // Extract bottom 'shift' bits: scratch = remain & mask
++ move32(Imm32(mask), scratch);
++ as_and_(scratch, remain, scratch);
++
++ // Add to accumulator
++ as_add(dest, dest, scratch);
++
++ // Trial subtraction: scratch = dest - mask
++ move32(Imm32(mask), scratch);
++ as_subf(scratch, scratch, dest); // scratch = dest - scratch
++
++ // If (dest - mask) > 0, keep the subtracted value
++ as_cmpwi(scratch, 0);
++ ma_b(Assembler::LessThan, &sumSigned);
++ as_or_(dest, scratch, scratch); // dest = scratch
++ bind(&sumSigned);
++
++ // Shift out the bits we just processed
++ x_srwi(remain, remain, shift);
++
++ // Continue if remain != 0
++ as_cmpwi(remain, 0);
++ ma_b(Assembler::NotEqual, &head);
++ }
++
++ // If input was negative, negate result
++ as_cmpwi(hold, 0);
++ ma_b(Assembler::GreaterThanOrEqual, &done);
++
++ if (negZero != nullptr) {
++ as_cmpwi(dest, 0);
++ ma_b(Assembler::Equal, negZero);
++ }
++
++ as_neg(dest, dest);
++ as_extsw(dest, dest);
++
++ bind(&done);
++}
++
++// ========================================================================
++// Atomic operations.
++
++template <typename T>
++static void CompareExchange(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Scalar::Type type, Synchronization sync,
++ const T& mem, Register oldval, Register newval,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register output) {
++ UseScratchRegisterScope temps(masm);
++ bool signExtend = Scalar::isSignedIntType(type);
++ unsigned nbytes = Scalar::byteSize(type);
++
++ switch (nbytes) {
++ case 1:
++ case 2:
++ break;
++ case 4:
++ MOZ_ASSERT(valueTemp == InvalidReg);
++ MOZ_ASSERT(offsetTemp == InvalidReg);
++ MOZ_ASSERT(maskTemp == InvalidReg);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ Label again, end;
++
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(mem, scratch);
++
++ if (nbytes == 4) {
++ masm.memoryBarrierBefore(sync);
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_lwarx(output, r0, scratch);
++ // ma_cmp(..., is32bit=true) emits cmpw, which compares only bits
++ // 32:63 (low 32) of both operands per ISA v3.0B. The upper
++ // 32 bits of oldval are ignored, so no canonicalising extsw needed.
++ masm.ma_cmp(output, oldval, Assembler::NotEqual, /* is32bit */ true);
++ masm.ma_b(Assembler::NotEqual, &end);
++ masm.as_stwcx(newval, r0, scratch);
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ masm.memoryBarrierAfter(sync);
++ masm.bind(&end);
++ // lwarx zero-extends; sign-extend for 32-bit canonical form.
++ masm.as_extsw(output, output);
++
++ return;
++ }
++
++ // Sub-word (1 or 2 byte) compare-exchange via native lbarx/lharx +
++ // stbcx./sthcx. POWER7+ (well below our POWER8 baseline). Replaces the prior
++ // round-down-to-word
++ // + mask + RMW dance. lXarx zero-extends the loaded byte/half; stXcx. stores
++ // only the low 8/16 bits of RS, so no pre-masking is needed on the store
++ // side. offsetTemp / maskTemp are still allocated by the lowering but unused
++ // here.
++ (void)offsetTemp;
++ (void)maskTemp;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ switch (nbytes) {
++ case 1:
++ masm.as_lbarx(output, r0, scratch);
++ if (signExtend) {
++ masm.as_extsb(valueTemp, oldval);
++ masm.as_extsb(output, output);
++ } else {
++ masm.as_andi_rc(valueTemp, oldval, 0xff);
++ }
++ break;
++ case 2:
++ masm.as_lharx(output, r0, scratch);
++ if (signExtend) {
++ masm.as_extsh(valueTemp, oldval);
++ masm.as_extsh(output, output);
++ } else {
++ masm.as_rlwinm(valueTemp, oldval, 0, 16, 31);
++ }
++ break;
++ }
++
++ masm.ma_cmp(output, valueTemp, Assembler::NotEqual, /* is32bit */ true);
++ masm.ma_b(Assembler::NotEqual, &end);
++
++ if (nbytes == 1) {
++ masm.as_stbcx(newval, r0, scratch);
++ } else {
++ masm.as_sthcx(newval, r0, scratch);
++ }
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ masm.memoryBarrierAfter(sync);
++
++ masm.bind(&end);
++}
++
++template <typename T>
++static void CompareExchange64(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Synchronization sync, const T& mem,
++ Register64 expect, Register64 replace,
++ Register64 output) {
++ MOZ_ASSERT(expect != output && replace != output);
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(mem, scratch);
++
++ Label tryAgain;
++ Label exit;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&tryAgain);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_ldarx(output.reg, r0, scratch);
++
++ masm.ma_cmp(output.reg, expect.reg, Assembler::NotEqual);
++ masm.ma_b(Assembler::NotEqual, &exit);
++ masm.as_stdcx(replace.reg, r0, scratch);
++ masm.ma_b(Assembler::NotEqual, &tryAgain);
++
++ masm.memoryBarrierAfter(sync);
++
++ masm.bind(&exit);
++}
++
++template <typename T>
++static void AtomicExchange(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Scalar::Type type, Synchronization sync,
++ const T& mem, Register value, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ UseScratchRegisterScope temps(masm);
++ bool signExtend = Scalar::isSignedIntType(type);
++ unsigned nbytes = Scalar::byteSize(type);
++
++ switch (nbytes) {
++ case 1:
++ case 2:
++ break;
++ case 4:
++ MOZ_ASSERT(valueTemp == InvalidReg);
++ MOZ_ASSERT(offsetTemp == InvalidReg);
++ MOZ_ASSERT(maskTemp == InvalidReg);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ Label again;
++
++ Register memTemp = temps.Acquire();
++ masm.computeEffectiveAddress(mem, memTemp);
++
++ if (nbytes == 4) {
++ masm.memoryBarrierBefore(sync);
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_lwarx(output, r0, memTemp);
++ masm.as_stwcx(value, r0, memTemp);
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ masm.memoryBarrierAfter(sync);
++ // lwarx zero-extends; sign-extend for 32-bit canonical form.
++ masm.as_extsw(output, output);
++
++ return;
++ }
++
++ // Sub-word exchange via native lbarx/lharx + stbcx./sthcx. (POWER7+).
++ // valueTemp / offsetTemp / maskTemp are still allocated by the lowering but
++ // unused here.
++ (void)valueTemp;
++ (void)offsetTemp;
++ (void)maskTemp;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ if (nbytes == 1) {
++ masm.as_lbarx(output, r0, memTemp);
++ masm.as_stbcx(value, r0, memTemp);
++ } else {
++ masm.as_lharx(output, r0, memTemp);
++ masm.as_sthcx(value, r0, memTemp);
++ }
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ if (signExtend) {
++ if (nbytes == 1) {
++ masm.as_extsb(output, output);
++ } else {
++ masm.as_extsh(output, output);
++ }
++ }
++ // Unsigned: lbarx/lharx already zero-extend; output is canonical.
++
++ masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicExchange64(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Synchronization sync, const T& mem,
++ Register64 value, Register64 output) {
++ MOZ_ASSERT(value != output);
++ UseScratchRegisterScope temps(masm);
++
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(mem, scratch);
++
++ Label tryAgain;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&tryAgain);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_ldarx(output.reg, r0, scratch);
++
++ masm.as_stdcx(value.reg, r0, scratch);
++ masm.ma_b(Assembler::NotEqual, &tryAgain);
++
++ masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicFetchOp(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Scalar::Type type, Synchronization sync, AtomicOp op,
++ const T& mem, Register value, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ UseScratchRegisterScope temps(masm);
++ bool signExtend = Scalar::isSignedIntType(type);
++ unsigned nbytes = Scalar::byteSize(type);
++
++ switch (nbytes) {
++ case 1:
++ case 2:
++ break;
++ case 4:
++ MOZ_ASSERT(valueTemp == InvalidReg);
++ MOZ_ASSERT(offsetTemp == InvalidReg);
++ MOZ_ASSERT(maskTemp == InvalidReg);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ Label again;
++
++ Register memTemp = temps.Acquire();
++ masm.computeEffectiveAddress(mem, memTemp);
++
++ Register scratch = temps.Acquire();
++
++ if (nbytes == 4) {
++ masm.memoryBarrierBefore(sync);
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_lwarx(output, r0, memTemp);
++
++ switch (op) {
++ case AtomicOp::Add:
++ masm.as_add(scratch, output, value);
++ break;
++ case AtomicOp::Sub:
++ masm.as_subf(scratch, value, output);
++ break;
++ case AtomicOp::And:
++ masm.as_and_(scratch, output, value);
++ break;
++ case AtomicOp::Or:
++ masm.as_or_(scratch, output, value);
++ break;
++ case AtomicOp::Xor:
++ masm.as_xor_(scratch, output, value);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ masm.as_stwcx(scratch, r0, memTemp);
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ masm.memoryBarrierAfter(sync);
++ // lwarx zero-extends; sign-extend for 32-bit canonical form.
++ masm.as_extsw(output, output);
++
++ return;
++ }
++
++ // Sub-word fetch-and-op via native lbarx/lharx + stbcx./sthcx. (POWER7+).
++ // `output` holds the pre-op loaded value (returned to caller); `valueTemp`
++ // is the post-op value we condition-store. stXcx. only stores low 8/16 bits
++ // of RS, so no pre-mask of valueTemp is needed.
++ // offsetTemp / maskTemp are still allocated by the lowering but unused; the
++ // local `scratch` is only used in the 4-byte branch above.
++ (void)offsetTemp;
++ (void)maskTemp;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ if (nbytes == 1) {
++ masm.as_lbarx(output, r0, memTemp);
++ } else {
++ masm.as_lharx(output, r0, memTemp);
++ }
++
++ switch (op) {
++ case AtomicOp::Add:
++ masm.as_add(valueTemp, output, value);
++ break;
++ case AtomicOp::Sub:
++ masm.as_subf(valueTemp, value, output);
++ break;
++ case AtomicOp::And:
++ masm.as_and_(valueTemp, output, value);
++ break;
++ case AtomicOp::Or:
++ masm.as_or_(valueTemp, output, value);
++ break;
++ case AtomicOp::Xor:
++ masm.as_xor_(valueTemp, output, value);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ if (nbytes == 1) {
++ masm.as_stbcx(valueTemp, r0, memTemp);
++ } else {
++ masm.as_sthcx(valueTemp, r0, memTemp);
++ }
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ if (signExtend) {
++ if (nbytes == 1) {
++ masm.as_extsb(output, output);
++ } else {
++ masm.as_extsh(output, output);
++ }
++ }
++ // Unsigned: lbarx/lharx already zero-extend; output is canonical.
++
++ masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicFetchOp64(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Synchronization sync, AtomicOp op, Register64 value,
++ const T& mem, Register64 temp, Register64 output) {
++ MOZ_ASSERT(value != output);
++ MOZ_ASSERT(value != temp);
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(mem, scratch);
++
++ Label tryAgain;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&tryAgain);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_ldarx(output.reg, r0, scratch);
++
++ switch (op) {
++ case AtomicOp::Add:
++ masm.as_add(temp.reg, output.reg, value.reg);
++ break;
++ case AtomicOp::Sub:
++ masm.as_subf(temp.reg, value.reg, output.reg);
++ break;
++ case AtomicOp::And:
++ masm.as_and_(temp.reg, output.reg, value.reg);
++ break;
++ case AtomicOp::Or:
++ masm.as_or_(temp.reg, output.reg, value.reg);
++ break;
++ case AtomicOp::Xor:
++ masm.as_xor_(temp.reg, output.reg, value.reg);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ masm.as_stdcx(temp.reg, r0, scratch);
++ masm.ma_b(Assembler::NotEqual, &tryAgain);
++
++ masm.memoryBarrierAfter(sync);
++}
++
++template <typename T>
++static void AtomicEffectOp(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc* access,
++ Scalar::Type type, Synchronization sync, AtomicOp op,
++ const T& mem, Register value, Register valueTemp,
++ Register offsetTemp, Register maskTemp) {
++ UseScratchRegisterScope temps(masm);
++ unsigned nbytes = Scalar::byteSize(type);
++
++ switch (nbytes) {
++ case 1:
++ case 2:
++ break;
++ case 4:
++ MOZ_ASSERT(valueTemp == InvalidReg);
++ MOZ_ASSERT(offsetTemp == InvalidReg);
++ MOZ_ASSERT(maskTemp == InvalidReg);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ Label again;
++
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(mem, scratch);
++
++ Register scratch2 = temps.Acquire();
++
++ if (nbytes == 4) {
++ masm.memoryBarrierBefore(sync);
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ masm.as_lwarx(scratch2, r0, scratch);
++
++ switch (op) {
++ case AtomicOp::Add:
++ masm.as_add(scratch2, scratch2, value);
++ break;
++ case AtomicOp::Sub:
++ masm.as_subf(scratch2, value, scratch2);
++ break;
++ case AtomicOp::And:
++ masm.as_and_(scratch2, scratch2, value);
++ break;
++ case AtomicOp::Or:
++ masm.as_or_(scratch2, scratch2, value);
++ break;
++ case AtomicOp::Xor:
++ masm.as_xor_(scratch2, scratch2, value);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ masm.as_stwcx(scratch2, r0, scratch);
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ masm.memoryBarrierAfter(sync);
++
++ return;
++ }
++
++ // Sub-word effect-only op via native lbarx/lharx + stbcx./sthcx. (POWER7+).
++ // No output to return; scratch2 holds the load+op+store value.
++ // valueTemp / offsetTemp / maskTemp are still allocated by the lowering but
++ // unused here.
++ (void)valueTemp;
++ (void)offsetTemp;
++ (void)maskTemp;
++
++ masm.memoryBarrierBefore(sync);
++
++ masm.bind(&again);
++
++ if (access) {
++ masm.flushBuffer(); // see comment in wasmLoadImpl
++ masm.append(*access, wasm::TrapMachineInsn::Atomic,
++ FaultingCodeOffset(masm.currentOffset()));
++ }
++
++ if (nbytes == 1) {
++ masm.as_lbarx(scratch2, r0, scratch);
++ } else {
++ masm.as_lharx(scratch2, r0, scratch);
++ }
++
++ switch (op) {
++ case AtomicOp::Add:
++ masm.as_add(scratch2, scratch2, value);
++ break;
++ case AtomicOp::Sub:
++ masm.as_subf(scratch2, value, scratch2);
++ break;
++ case AtomicOp::And:
++ masm.as_and_(scratch2, scratch2, value);
++ break;
++ case AtomicOp::Or:
++ masm.as_or_(scratch2, scratch2, value);
++ break;
++ case AtomicOp::Xor:
++ masm.as_xor_(scratch2, scratch2, value);
++ break;
++ default:
++ MOZ_CRASH();
++ }
++
++ if (nbytes == 1) {
++ masm.as_stbcx(scratch2, r0, scratch);
++ } else {
++ masm.as_sthcx(scratch2, r0, scratch);
++ }
++ masm.ma_b(Assembler::NotEqual, &again);
++
++ masm.memoryBarrierAfter(sync);
++}
++
++// Public MacroAssembler methods.
++
++void MacroAssembler::compareExchange(Scalar::Type type, Synchronization sync,
++ const Address& mem, Register oldval,
++ Register newval, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ CompareExchange(*this, nullptr, type, sync, mem, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::compareExchange(Scalar::Type type, Synchronization sync,
++ const BaseIndex& mem, Register oldval,
++ Register newval, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ CompareExchange(*this, nullptr, type, sync, mem, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::compareExchange64(Synchronization sync, const Address& mem,
++ Register64 expect, Register64 replace,
++ Register64 output) {
++ CompareExchange64(*this, nullptr, sync, mem, expect, replace, output);
++}
++
++void MacroAssembler::compareExchange64(Synchronization sync,
++ const BaseIndex& mem, Register64 expect,
++ Register64 replace, Register64 output) {
++ CompareExchange64(*this, nullptr, sync, mem, expect, replace, output);
++}
++
++void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
++ const Address& mem, Register oldval,
++ Register newval, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ CompareExchange(*this, &access, access.type(), access.sync(), mem, oldval,
++ newval, valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
++ const BaseIndex& mem, Register oldval,
++ Register newval, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ CompareExchange(*this, &access, access.type(), access.sync(), mem, oldval,
++ newval, valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
++ const Address& mem,
++ Register64 expect,
++ Register64 replace,
++ Register64 output) {
++ CompareExchange64(*this, &access, access.sync(), mem, expect, replace,
++ output);
++}
++
++void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
++ const BaseIndex& mem,
++ Register64 expect,
++ Register64 replace,
++ Register64 output) {
++ CompareExchange64(*this, &access, access.sync(), mem, expect, replace,
++ output);
++}
++
++void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization sync,
++ const Address& mem, Register value,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register output) {
++ AtomicExchange(*this, nullptr, type, sync, mem, value, valueTemp, offsetTemp,
++ maskTemp, output);
++}
++
++void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization sync,
++ const BaseIndex& mem, Register value,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register output) {
++ AtomicExchange(*this, nullptr, type, sync, mem, value, valueTemp, offsetTemp,
++ maskTemp, output);
++}
++
++void MacroAssembler::atomicExchange64(Synchronization sync, const Address& mem,
++ Register64 value, Register64 output) {
++ AtomicExchange64(*this, nullptr, sync, mem, value, output);
++}
++
++void MacroAssembler::atomicExchange64(Synchronization sync,
++ const BaseIndex& mem, Register64 value,
++ Register64 output) {
++ AtomicExchange64(*this, nullptr, sync, mem, value, output);
++}
++
++void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
++ const Address& mem, Register value,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register output) {
++ AtomicExchange(*this, &access, access.type(), access.sync(), mem, value,
++ valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
++ const BaseIndex& mem, Register value,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register output) {
++ AtomicExchange(*this, &access, access.type(), access.sync(), mem, value,
++ valueTemp, offsetTemp, maskTemp, output);
++}
++
++template <typename T>
++static void WasmAtomicExchange64(MacroAssembler& masm,
++ const wasm::MemoryAccessDesc& access,
++ const T& mem, Register64 value,
++ Register64 output) {
++ AtomicExchange64(masm, &access, access.sync(), mem, value, output);
++}
++
++void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
++ const Address& mem, Register64 src,
++ Register64 output) {
++ WasmAtomicExchange64(*this, access, mem, src, output);
++}
++
++void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
++ const BaseIndex& mem, Register64 src,
++ Register64 output) {
++ WasmAtomicExchange64(*this, access, mem, src, output);
++}
++
++void MacroAssembler::atomicFetchOp(Scalar::Type type, Synchronization sync,
++ AtomicOp op, Register value,
++ const Address& mem, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ AtomicFetchOp(*this, nullptr, type, sync, op, mem, value, valueTemp,
++ offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::atomicFetchOp(Scalar::Type type, Synchronization sync,
++ AtomicOp op, Register value,
++ const BaseIndex& mem, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ AtomicFetchOp(*this, nullptr, type, sync, op, mem, value, valueTemp,
++ offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::atomicFetchOp64(Synchronization sync, AtomicOp op,
++ Register64 value, const Address& mem,
++ Register64 temp, Register64 output) {
++ AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, output);
++}
++
++void MacroAssembler::atomicFetchOp64(Synchronization sync, AtomicOp op,
++ Register64 value, const BaseIndex& mem,
++ Register64 temp, Register64 output) {
++ AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, output);
++}
++
++void MacroAssembler::atomicEffectOp64(Synchronization sync, AtomicOp op,
++ Register64 value, const Address& mem,
++ Register64 temp) {
++ AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, temp);
++}
++
++void MacroAssembler::atomicEffectOp64(Synchronization sync, AtomicOp op,
++ Register64 value, const BaseIndex& mem,
++ Register64 temp) {
++ AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, temp);
++}
++
++void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
++ AtomicOp op, Register value,
++ const Address& mem, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ AtomicFetchOp(*this, &access, access.type(), access.sync(), op, mem, value,
++ valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
++ AtomicOp op, Register value,
++ const BaseIndex& mem, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register output) {
++ AtomicFetchOp(*this, &access, access.type(), access.sync(), op, mem, value,
++ valueTemp, offsetTemp, maskTemp, output);
++}
++
++void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
++ AtomicOp op, Register64 value,
++ const Address& mem, Register64 temp,
++ Register64 output) {
++ AtomicFetchOp64(*this, &access, access.sync(), op, value, mem, temp, output);
++}
++
++void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
++ AtomicOp op, Register64 value,
++ const BaseIndex& mem, Register64 temp,
++ Register64 output) {
++ AtomicFetchOp64(*this, &access, access.sync(), op, value, mem, temp, output);
++}
++
++void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
++ AtomicOp op, Register value,
++ const Address& mem, Register valueTemp,
++ Register offsetTemp,
++ Register maskTemp) {
++ AtomicEffectOp(*this, &access, access.type(), access.sync(), op, mem, value,
++ valueTemp, offsetTemp, maskTemp);
++}
++
++void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
++ AtomicOp op, Register value,
++ const BaseIndex& mem,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp) {
++ AtomicEffectOp(*this, &access, access.type(), access.sync(), op, mem, value,
++ valueTemp, offsetTemp, maskTemp);
++}
++
++// ========================================================================
++// JS atomic operations.
++
++template <typename T>
++static void CompareExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
++ Synchronization sync, const T& mem,
++ Register oldval, Register newval,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register temp,
++ AnyRegister output) {
++ if (arrayType == Scalar::Uint32) {
++ masm.compareExchange(arrayType, sync, mem, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, temp);
++ masm.convertUInt32ToDouble(temp, output.fpu());
++ } else {
++ masm.compareExchange(arrayType, sync, mem, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, output.gpr());
++ }
++}
++
++template <typename T>
++static void AtomicExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
++ Synchronization sync, const T& mem, Register value,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register temp,
++ AnyRegister output) {
++ if (arrayType == Scalar::Uint32) {
++ masm.atomicExchange(arrayType, sync, mem, value, valueTemp, offsetTemp,
++ maskTemp, temp);
++ masm.convertUInt32ToDouble(temp, output.fpu());
++ } else {
++ masm.atomicExchange(arrayType, sync, mem, value, valueTemp, offsetTemp,
++ maskTemp, output.gpr());
++ }
++}
++
++template <typename T>
++static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
++ Synchronization sync, AtomicOp op, Register value,
++ const T& mem, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register temp, AnyRegister output) {
++ if (arrayType == Scalar::Uint32) {
++ masm.atomicFetchOp(arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++ maskTemp, temp);
++ masm.convertUInt32ToDouble(temp, output.fpu());
++ } else {
++ masm.atomicFetchOp(arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++ maskTemp, output.gpr());
++ }
++}
++
++void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
++ Synchronization sync, const Address& mem,
++ Register oldval, Register newval,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register temp,
++ AnyRegister output) {
++ CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, temp, output);
++}
++
++void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
++ Synchronization sync,
++ const BaseIndex& mem, Register oldval,
++ Register newval, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register temp, AnyRegister output) {
++ CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, valueTemp,
++ offsetTemp, maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
++ Synchronization sync, const Address& mem,
++ Register value, Register valueTemp,
++ Register offsetTemp, Register maskTemp,
++ Register temp, AnyRegister output) {
++ AtomicExchangeJS(*this, arrayType, sync, mem, value, valueTemp, offsetTemp,
++ maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
++ Synchronization sync,
++ const BaseIndex& mem, Register value,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register temp,
++ AnyRegister output) {
++ AtomicExchangeJS(*this, arrayType, sync, mem, value, valueTemp, offsetTemp,
++ maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
++ Synchronization sync, AtomicOp op,
++ Register value, const Address& mem,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register temp,
++ AnyRegister output) {
++ AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++ maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
++ Synchronization sync, AtomicOp op,
++ Register value, const BaseIndex& mem,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp, Register temp,
++ AnyRegister output) {
++ AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, valueTemp, offsetTemp,
++ maskTemp, temp, output);
++}
++
++void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
++ Synchronization sync, AtomicOp op,
++ Register value, const BaseIndex& mem,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp) {
++ AtomicEffectOp(*this, nullptr, arrayType, sync, op, mem, value, valueTemp,
++ offsetTemp, maskTemp);
++}
++
++void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
++ Synchronization sync, AtomicOp op,
++ Register value, const Address& mem,
++ Register valueTemp, Register offsetTemp,
++ Register maskTemp) {
++ AtomicEffectOp(*this, nullptr, arrayType, sync, op, mem, value, valueTemp,
++ offsetTemp, maskTemp);
++}
++
++// ========================================================================
++// Wasm address offset carry tests.
++
++void MacroAssemblerPPC64Compat::ma_add32TestCarry(Condition cond, Register rd,
++ Register rs, Imm32 imm,
++ Label* overflow) {
++ MOZ_ASSERT(cond == Assembler::CarrySet || cond == Assembler::CarryClear);
++ if (rd != rs) {
++ asMasm().move32(rs, rd);
++ asMasm().add32(imm, rd);
++ as_cmplw(rd, rs);
++ } else {
++ // visitWasmAddOffset uses useRegisterAtStart, so the LIR allocator may
++ // collapse rd onto rs. move32 + add32 would clobber rs before the
++ // compare; save rs to a scratch first.
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ asMasm().move32(rs, scratch);
++ asMasm().add32(imm, rd);
++ as_cmplw(rd, scratch);
++ }
++ ma_b(cond == Assembler::CarrySet ? LessThan : GreaterThanOrEqual, overflow);
++}
++
++void MacroAssemblerPPC64Compat::ma_addPtrTestCarry(Condition cond, Register rd,
++ Register rs, ImmWord imm,
++ Label* overflow) {
++ MOZ_ASSERT(cond == Assembler::CarrySet || cond == Assembler::CarryClear);
++ if (rd != rs) {
++ asMasm().movePtr(rs, rd);
++ asMasm().addPtr(ImmWord(imm.value), rd);
++ as_cmpld(rd, rs);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ asMasm().movePtr(rs, scratch);
++ asMasm().addPtr(ImmWord(imm.value), rd);
++ as_cmpld(rd, scratch);
++ }
++ ma_b(cond == Assembler::CarrySet ? LessThan : GreaterThanOrEqual, overflow);
++}
++
++// ========================================================================
++// Wasm load/store helpers.
++
++void MacroAssemblerPPC64Compat::wasmProbeLastByte(
++ const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr) {
++ if (HasPOWER9()) {
++ return;
++ }
++ const unsigned size = Scalar::byteSize(access.type());
++ if (size <= 1) {
++ return;
++ }
++ UseScratchRegisterScope temps(asMasm());
++ Register probeAddr = temps.Acquire();
++ // size is at most 16 (Simd128), well within the int16_t range of as_addi.
++ as_addi(probeAddr, ptr, int16_t(size - 1));
++ // Record the probe as a wasm trap site so its SIGSEGV dispatches
++ // through the wasm signal handler the same way the real access would.
++ m_buffer.flushPool();
++ append(access, wasm::TrapMachineInsn::Load8,
++ FaultingCodeOffset(currentOffset()));
++ // Probing 1-byte load; result discarded.
++ as_lbzx(probeAddr, memoryBase, probeAddr);
++}
++
++void MacroAssemblerPPC64Compat::wasmLoadImpl(
++ const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
++ Register ptrScratch, AnyRegister output) {
++ access.assertOffsetInGuardPages();
++ uint32_t offset = access.offset32();
++ MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++ if (offset) {
++ asMasm().addPtr(ImmWord(offset), ptrScratch);
++ ptr = ptrScratch;
++ }
++
++ wasmProbeLastByte(access, memoryBase, ptr);
++
++ asMasm().memoryBarrierBefore(access.sync());
++ // Flush any pending constant pool entries before recording the trap site,
++ // otherwise a pool body inserted between the recorded offset and the
++ // emitted load shifts the load and leaves the pool guard branch at the
++ // recorded offset (SummarizeTrapInstruction then rejects the trap site).
++ m_buffer.flushPool();
++ append(access, wasm::TrapMachineInsnForLoad(Scalar::byteSize(access.type())),
++ FaultingCodeOffset(currentOffset()));
++
++ switch (access.type()) {
++ case Scalar::Int8:
++ as_lbzx(output.gpr(), memoryBase, ptr);
++ as_extsb(output.gpr(), output.gpr());
++ break;
++ case Scalar::Uint8:
++ as_lbzx(output.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Int16:
++ as_lhax(output.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Uint16:
++ as_lhzx(output.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Int32:
++ case Scalar::Uint32:
++ as_lwzx(output.gpr(), memoryBase, ptr);
++ as_extsw(output.gpr(), output.gpr());
++ break;
++ case Scalar::Float64:
++ if (access.isZeroExtendSimd128Load() || access.isSplatSimd128Load() ||
++ access.isWidenSimd128Load()) {
++ // lfdx is X-form scalar FP — encodes only 5-bit FRT, so a
++ // Simd128 dest (encoding 32+) corrupts the opcode. Bridge
++ // through ScratchDoubleReg (FPR f0, encoding 0).
++ ScratchDoubleScope dscratch(asMasm());
++ as_lfdx(dscratch, memoryBase, ptr);
++ if (access.isZeroExtendSimd128Load()) {
++ // Loaded value goes to BE dw1 (= LE dw0 = lane 0); BE dw0 = 0.
++ as_xxlxor(ScratchSimd128Reg, ScratchSimd128Reg, ScratchSimd128Reg);
++ as_xxpermdi(output.fpu(), ScratchSimd128Reg, dscratch, 0);
++ } else if (access.isSplatSimd128Load()) {
++ as_xxpermdi(output.fpu(), dscratch, dscratch, 0);
++ } else {
++ // widen: place loaded 64 bits in LE dw0 (= BE dw1) for widenLow.
++ as_xxpermdi(output.fpu(), dscratch, dscratch, 2);
++ switch (access.widenSimdOp()) {
++ case wasm::SimdOp::V128Load8x8S:
++ asMasm().widenLowInt8x16(output.fpu(), output.fpu());
++ break;
++ case wasm::SimdOp::V128Load8x8U:
++ asMasm().unsignedWidenLowInt8x16(output.fpu(), output.fpu());
++ break;
++ case wasm::SimdOp::V128Load16x4S:
++ asMasm().widenLowInt16x8(output.fpu(), output.fpu());
++ break;
++ case wasm::SimdOp::V128Load16x4U:
++ asMasm().unsignedWidenLowInt16x8(output.fpu(), output.fpu());
++ break;
++ case wasm::SimdOp::V128Load32x2S:
++ asMasm().widenLowInt32x4(output.fpu(), output.fpu());
++ break;
++ case wasm::SimdOp::V128Load32x2U:
++ asMasm().unsignedWidenLowInt32x4(output.fpu(), output.fpu());
++ break;
++ default:
++ MOZ_CRASH("Unexpected widen op");
++ }
++ }
++ } else {
++ as_lfdx(output.fpu(), memoryBase, ptr);
++ }
++ break;
++ case Scalar::Float32:
++ if (access.isZeroExtendSimd128Load()) {
++ // v128.load32_zero: load 32 raw bits into lane 0, zero the rest.
++ UseScratchRegisterScope temps(asMasm());
++ Register tmp = temps.Acquire();
++ as_lwzx(tmp, memoryBase, ptr);
++ as_xxlxor(output.fpu(), output.fpu(), output.fpu());
++ if (HasPOWER9()) {
++ as_mtvsrws(ScratchSimd128Reg, tmp);
++ as_xxinsertw(output.fpu(), ScratchSimd128Reg, 12);
++ } else {
++ // POWER8: mtvsrd puts value in BE dw0 low 32 bits.
++ // xxpermdi(dest, zero, scratch, 0) = {zero[dw0], scratch[dw0]}
++ // in BE, placing the value in LE word 0 with the rest zero.
++ as_mtvsrd(ScratchSimd128Reg, tmp);
++ as_xxpermdi(output.fpu(), output.fpu(), ScratchSimd128Reg, 0);
++ }
++ } else {
++ as_lfsx(output.fpu(), memoryBase, ptr);
++ }
++ break;
++ case Scalar::Simd128:
++ if (HasPOWER9()) {
++ as_lxvx(output.fpu(), memoryBase, ptr);
++ } else {
++ as_lxvd2x(output.fpu(), memoryBase, ptr);
++ as_xxpermdi(output.fpu(), output.fpu(), output.fpu(), 2);
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected array type");
++ }
++
++ asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssemblerPPC64Compat::wasmStoreImpl(
++ const wasm::MemoryAccessDesc& access, AnyRegister value,
++ Register memoryBase, Register ptr, Register ptrScratch) {
++ access.assertOffsetInGuardPages();
++ uint32_t offset = access.offset32();
++ MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++ if (offset) {
++ asMasm().addPtr(ImmWord(offset), ptrScratch);
++ ptr = ptrScratch;
++ }
++
++ wasmProbeLastByte(access, memoryBase, ptr);
++
++ asMasm().memoryBarrierBefore(access.sync());
++ // Record trap site at the faulting memory instruction. For P8 Simd128
++ // store, the faulting instruction (stxvd2x) is after a byte-swap
++ // (xxpermdi), so we defer the trap site recording.
++ // Flush pool first; see comment in wasmLoadImpl.
++ if (access.type() != Scalar::Simd128 || HasPOWER9()) {
++ m_buffer.flushPool();
++ append(access,
++ wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
++ FaultingCodeOffset(currentOffset()));
++ }
++
++ switch (access.type()) {
++ case Scalar::Int8:
++ case Scalar::Uint8:
++ as_stbx(value.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Int16:
++ case Scalar::Uint16:
++ as_sthx(value.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Int32:
++ case Scalar::Uint32:
++ as_stwx(value.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Int64:
++ as_stdx(value.gpr(), memoryBase, ptr);
++ break;
++ case Scalar::Float64:
++ as_stfdx(value.fpu(), memoryBase, ptr);
++ break;
++ case Scalar::Float32:
++ as_stfsx(value.fpu(), memoryBase, ptr);
++ break;
++ case Scalar::Simd128:
++ if (HasPOWER9()) {
++ as_stxvx(value.fpu(), memoryBase, ptr);
++ } else {
++ as_xxpermdi(ScratchSimd128Reg, value.fpu(), value.fpu(), 2);
++ m_buffer.flushPool(); // see comment in wasmLoadImpl
++ append(access,
++ wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
++ FaultingCodeOffset(currentOffset()));
++ as_stxvd2x(ScratchSimd128Reg, memoryBase, ptr);
++ }
++ break;
++ default:
++ MOZ_CRASH("unexpected array type");
++ }
++
++ asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssemblerPPC64Compat::wasmLoadI64Impl(
++ const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
++ Register ptrScratch, Register64 output) {
++ uint32_t offset = access.offset32();
++ MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++ if (offset) {
++ asMasm().addPtr(ImmWord(offset), ptrScratch);
++ ptr = ptrScratch;
++ }
++
++ wasmProbeLastByte(access, memoryBase, ptr);
++
++ asMasm().memoryBarrierBefore(access.sync());
++ m_buffer.flushPool(); // see comment in wasmLoadImpl
++ append(access, wasm::TrapMachineInsnForLoad(Scalar::byteSize(access.type())),
++ FaultingCodeOffset(currentOffset()));
++
++ switch (access.type()) {
++ case Scalar::Int8:
++ as_lbzx(output.reg, memoryBase, ptr);
++ as_extsb(output.reg, output.reg);
++ break;
++ case Scalar::Uint8:
++ as_lbzx(output.reg, memoryBase, ptr);
++ break;
++ case Scalar::Int16:
++ as_lhax(output.reg, memoryBase, ptr);
++ break;
++ case Scalar::Uint16:
++ as_lhzx(output.reg, memoryBase, ptr);
++ break;
++ case Scalar::Int32:
++ as_lwzx(output.reg, memoryBase, ptr);
++ as_extsw(output.reg, output.reg);
++ break;
++ case Scalar::Uint32:
++ as_lwzx(output.reg, memoryBase, ptr);
++ // Zero-extended by lwzx already
++ break;
++ case Scalar::Int64:
++ as_ldx(output.reg, memoryBase, ptr);
++ break;
++ default:
++ MOZ_CRASH("unexpected array type");
++ }
++
++ asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssemblerPPC64Compat::wasmStoreI64Impl(
++ const wasm::MemoryAccessDesc& access, Register64 value, Register memoryBase,
++ Register ptr, Register ptrScratch) {
++ uint32_t offset = access.offset32();
++ MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
++
++ if (offset) {
++ asMasm().addPtr(ImmWord(offset), ptrScratch);
++ ptr = ptrScratch;
++ }
++
++ wasmProbeLastByte(access, memoryBase, ptr);
++
++ asMasm().memoryBarrierBefore(access.sync());
++ m_buffer.flushPool(); // see comment in wasmLoadImpl
++ append(access, wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
++ FaultingCodeOffset(currentOffset()));
++
++ switch (access.type()) {
++ case Scalar::Int8:
++ case Scalar::Uint8:
++ as_stbx(value.reg, memoryBase, ptr);
++ break;
++ case Scalar::Int16:
++ case Scalar::Uint16:
++ as_sthx(value.reg, memoryBase, ptr);
++ break;
++ case Scalar::Int32:
++ case Scalar::Uint32:
++ as_stwx(value.reg, memoryBase, ptr);
++ break;
++ case Scalar::Int64:
++ as_stdx(value.reg, memoryBase, ptr);
++ break;
++ default:
++ MOZ_CRASH("unexpected array type");
++ }
++
++ asMasm().memoryBarrierAfter(access.sync());
++}
++
++void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
++ Register memoryBase, Register ptr,
++ Register ptrScratch, AnyRegister output) {
++ wasmLoadImpl(access, memoryBase, ptr, ptrScratch, output);
++}
++
++void MacroAssembler::wasmLoadI64(const wasm::MemoryAccessDesc& access,
++ Register memoryBase, Register ptr,
++ Register ptrScratch, Register64 output) {
++ wasmLoadI64Impl(access, memoryBase, ptr, ptrScratch, output);
++}
++
++void MacroAssembler::wasmStore(const wasm::MemoryAccessDesc& access,
++ AnyRegister value, Register memoryBase,
++ Register ptr, Register ptrScratch) {
++ wasmStoreImpl(access, value, memoryBase, ptr, ptrScratch);
++}
++
++void MacroAssembler::wasmStoreI64(const wasm::MemoryAccessDesc& access,
++ Register64 value, Register memoryBase,
++ Register ptr, Register ptrScratch) {
++ wasmStoreI64Impl(access, value, memoryBase, ptr, ptrScratch);
++}
++
++//}}} check_macroassembler_style
++
++} // namespace jit
++} // namespace js
++
++#ifdef ENABLE_WASM_SIMD
++// static
++bool MacroAssembler::MustMaskShiftCountSimd128(wasm::SimdOp op, int32_t* mask) {
++ return false;
++}
++#endif
+diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64.h b/js/src/jit/ppc64/MacroAssembler-ppc64.h
+new file mode 100644
+index 000000000000..bc2143b67465
+--- /dev/null
++++ b/js/src/jit/ppc64/MacroAssembler-ppc64.h
+@@ -0,0 +1,2031 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_MacroAssembler_ppc64_h
++#define jit_ppc64_MacroAssembler_ppc64_h
++
++#include "jit/MoveResolver.h"
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "wasm/WasmBuiltins.h"
++
++namespace js {
++namespace jit {
++
++inline bool is_intN(int64_t x, unsigned n) {
++ MOZ_ASSERT((0 < n) && (n < 64));
++ int64_t limit = static_cast<int64_t>(1) << (n - 1);
++ return (-limit <= x) && (x < limit);
++}
++
++inline bool is_uintN(uint64_t x, unsigned n) {
++ MOZ_ASSERT((0 < n) && (n < 64));
++ return !(x >> n);
++}
++
++// enterNoPool() guard sizes. Inhibiting the constant pool keeps these
++// stanzas at a fixed instruction count, which patchers and long-branch
++// resolvers rely on. Each constant names a distinct stanza shape; see
++// the emitting call site for the exact layout.
++//
++// kNoPoolLoad64StanzaInsns (8): emitLoad64Stanza body — 8 NOPs that
++// WriteLoad64Instructions later overwrites in place. Two shapes share
++// the same 8-slot footprint with the .quad fixed at slots [6..7]:
++// - POWER9+ (HasPOWER9()): addpcis + ld + b + 3 NOPs (2 dynamic insns,
++// no LR clobber). Preferred path.
++// - POWER8 fallback: mflr/bcl/mflr/mtlr/ld/b LR-bouncing sequence
++// (6 dynamic insns, RAS-thrashing — kept only because P8 has no
++// addpcis).
++//
++// kNoPoolPatchableBranchInsns (10): patchable far call / jump /
++// unconditional branch. Three alternative shapes, all fitting the
++// same budget:
++// - load64 stanza (8) + mtctr + bctr[l] = 10 (bound call/jump)
++// - 9 NOPs + bl = 10 (short bound call)
++// - xs_trap_tagged(TAG) + chain + 8 NOPs = 10 (fwd-ref stanza)
++//
++// kNoPoolCondLongBranchInsnsP8Max (14): conditional long branch, POWER8
++// Overflow worst case. POWER8 has no mcrxrx so overflow/carry test is
++// mfxer+rlwinm+mtcrf (3 insns) on top of the base shape. Budget =
++// 3 (XER inspection) + 1 (bc) + 8 (load64 stanza) + 2 (mtctr+bctr) = 14.
++static constexpr size_t kNoPoolLoad64StanzaInsns = 8;
++static constexpr size_t kNoPoolPatchableBranchInsns = 10;
++static constexpr size_t kNoPoolCondLongBranchInsnsP8Max = 14;
++
++enum LoadStoreSize {
++ SizeByte = 8,
++ SizeHalfWord = 16,
++ SizeWord = 32,
++ SizeDouble = 64
++};
++
++enum LoadStoreExtension { ZeroExtend = 0, SignExtend = 1 };
++
++static Register CallReg = r12;
++
++struct ImmShiftedTag : public ImmWord {
++ explicit ImmShiftedTag(JSValueShiftedTag shtag) : ImmWord((uintptr_t)shtag) {}
++ explicit ImmShiftedTag(JSValueType type)
++ : ImmWord(((uintptr_t)JSVAL_TYPE_TO_SHIFTED_TAG(type))) {}
++};
++
++struct ImmTag : public Imm32 {
++ explicit ImmTag(JSValueTag tag) : Imm32(tag) {}
++};
++
++class ScratchTagScope {
++ UseScratchRegisterScope temps_;
++ Register scratch_;
++ bool owned_;
++ mozilla::DebugOnly<bool> released_;
++
++ public:
++ ScratchTagScope(Assembler& masm, const ValueOperand&)
++ : temps_(masm), owned_(true), released_(false) {
++ scratch_ = temps_.Acquire();
++ }
++
++ operator Register() {
++ MOZ_ASSERT(!released_);
++ return scratch_;
++ }
++
++ void release() {
++ MOZ_ASSERT(!released_);
++ released_ = true;
++ if (owned_) {
++ temps_.Release(scratch_);
++ owned_ = false;
++ }
++ }
++
++ void reacquire() {
++ MOZ_ASSERT(released_);
++ released_ = false;
++ if (!owned_) {
++ scratch_ = temps_.Acquire();
++ owned_ = true;
++ }
++ }
++};
++
++class ScratchTagScopeRelease {
++ ScratchTagScope* ts_;
++
++ public:
++ explicit ScratchTagScopeRelease(ScratchTagScope* ts) : ts_(ts) {
++ ts_->release();
++ }
++ ~ScratchTagScopeRelease() { ts_->reacquire(); }
++};
++
++class MacroAssemblerPPC64 : public Assembler {
++ protected:
++ MacroAssembler& asMasm();
++ const MacroAssembler& asMasm() const;
++};
++
++class MacroAssemblerPPC64Compat : public MacroAssemblerPPC64 {
++ public:
++ using MacroAssemblerPPC64::MacroAssemblerPPC64;
++
++ MacroAssemblerPPC64Compat() {}
++
++ bool buildOOLFakeExitFrame(void* fakeReturnAddr);
++
++ // ===============================================================
++ // Conversion functions
++
++ void convertBoolToInt32(Register src, Register dest) {
++ as_rlwinm(dest, src, 0, 31, 31);
++ }
++ void convertInt32ToDouble(Register src, FloatRegister dest) {
++ // mtvsrwa: VSR[dest].dw0 = sign_ext_64(src[32:63]); P8+ (ISA 2.07).
++ // Replaces extsw + mtvsrd (2 insns + scratch GPR) with 1 insn.
++ as_mtvsrwa(dest, src);
++ as_fcfid(dest, dest);
++ }
++ void convertInt32ToDouble(const Address& src, FloatRegister dest) {
++ // lfiwax (P7+): FPR.dw[0] = sign_ext_64(MEM[addr, 4]). X-form indexed
++ // — no immediate offset, so when offset != 0 we add it into a scratch
++ // first. Replaces lwz + extsw + mtvsrd with lfiwax (one insn) plus
++ // optional address add.
++ if (src.offset == 0) {
++ as_lfiwax(dest, r0, src.base);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ if (is_intN(src.offset, 16)) {
++ as_addi(scratch, src.base, src.offset);
++ as_lfiwax(dest, r0, scratch);
++ } else {
++ // X-form indexed: lfiwax computes base + scratch directly, no add.
++ movePtr(ImmWord(src.offset), scratch);
++ as_lfiwax(dest, src.base, scratch);
++ }
++ }
++ as_fcfid(dest, dest);
++ }
++ void convertInt32ToDouble(const BaseIndex& src, FloatRegister dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ convertInt32ToDouble(Address(scratch, src.offset), dest);
++ }
++ void convertUInt32ToDouble(Register src, FloatRegister dest);
++ void convertUInt32ToFloat32(Register src, FloatRegister dest);
++ void convertDoubleToFloat32(FloatRegister src, FloatRegister dest) {
++ as_frsp(dest, src);
++ }
++ // POWER9 FP16 conversions (1 insn each). Caller must have verified
++ // HasPOWER9() — SupportsFloat{64,32}To16 gates that. PPC64 FPRs hold
++ // doubles internally; an "FP32-in-FPR" is just the FP32 value stored
++ // as exact FP64, so xscvdphp/xscvhpdp work for both FP32↔FP16 and
++ // FP64↔FP16 (FP16 fits exactly in FP32 which fits exactly in FP64).
++ void convertDoubleToFloat16(FloatRegister src, FloatRegister dest) {
++ MOZ_ASSERT(HasPOWER9());
++ as_xscvdphp(dest, src);
++ }
++ void convertFloat16ToDouble(FloatRegister src, FloatRegister dest) {
++ MOZ_ASSERT(HasPOWER9());
++ as_xscvhpdp(dest, src);
++ }
++ void convertFloat32ToFloat16(FloatRegister src, FloatRegister dest) {
++ MOZ_ASSERT(HasPOWER9());
++ as_xscvdphp(dest, src);
++ }
++ void convertFloat16ToFloat32(FloatRegister src, FloatRegister dest) {
++ MOZ_ASSERT(HasPOWER9());
++ as_xscvhpdp(dest, src);
++ }
++ void convertInt32ToFloat16(Register src, FloatRegister dest) {
++ MOZ_ASSERT(HasPOWER9());
++ convertInt32ToFloat32(src, dest);
++ convertFloat32ToFloat16(dest, dest);
++ }
++ void convertDoubleToInt32(FloatRegister src, Register dest, Label* fail,
++ bool negativeZeroCheck = true);
++ void convertDoubleToPtr(FloatRegister src, Register dest, Label* fail,
++ bool negativeZeroCheck = true);
++ void convertFloat32ToInt32(FloatRegister src, Register dest, Label* fail,
++ bool negativeZeroCheck = true);
++ void convertFloat32ToDouble(FloatRegister src, FloatRegister dest) {
++ // PPC64 FPRs hold every FP32 value in its FP64-equivalent representation,
++ // so f64.promote_f32 is conceptually a no-op except that wasm requires
++ // sNaN inputs to be quieted. frsp (Round to Single-Precision) is the
++ // identity for SP-representable inputs but applies IEEE NaN-quieting as
++ // a side effect, replacing the prior fmr + fcmpu + branch + canonical-
++ // NaN-load (5+ insns + scratch GPR) with a single instruction. Result
++ // matches what x86 vcvtss2sd / ARM fcvt produce.
++ as_frsp(dest, src);
++ }
++ void convertInt32ToFloat32(Register src, FloatRegister dest) {
++ // mtvsrwa + fcfids; same recipe as convertInt32ToDouble(Register).
++ as_mtvsrwa(dest, src);
++ as_fcfids(dest, dest);
++ }
++ void convertInt32ToFloat32(const Address& src, FloatRegister dest) {
++ // lfiwax + fcfids; same recipe as convertInt32ToDouble(Address).
++ if (src.offset == 0) {
++ as_lfiwax(dest, r0, src.base);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ if (is_intN(src.offset, 16)) {
++ as_addi(scratch, src.base, src.offset);
++ as_lfiwax(dest, r0, scratch);
++ } else {
++ movePtr(ImmWord(src.offset), scratch);
++ as_lfiwax(dest, src.base, scratch);
++ }
++ }
++ as_fcfids(dest, dest);
++ }
++
++ // POWER9 FP16 load: lxsihzx writes the 2 memory bytes directly into
++ // dw[0] low 16 bits with the rest zeroed — matching the layout that
++ // xscvhpdp expects, in a single instruction.
++ FaultingCodeOffset loadFloat16(const Address& addr, FloatRegister dest,
++ Register temp) {
++ MOZ_ASSERT(HasPOWER9());
++ if (addr.offset == 0) {
++ return FaultingCodeOffset(as_lxsihzx(dest, r0, addr.base).getOffset());
++ }
++ if (is_intN(addr.offset, 16)) {
++ as_addi(temp, addr.base, addr.offset);
++ return FaultingCodeOffset(as_lxsihzx(dest, r0, temp).getOffset());
++ }
++ movePtr(ImmWord(addr.offset), temp);
++ return FaultingCodeOffset(as_lxsihzx(dest, addr.base, temp).getOffset());
++ }
++ FaultingCodeOffset loadFloat16(const BaseIndex& src, FloatRegister dest,
++ Register temp) {
++ MOZ_ASSERT(HasPOWER9());
++ computeEffectiveAddress(src, temp);
++ return FaultingCodeOffset(as_lxsihzx(dest, r0, temp).getOffset());
++ }
++
++ // ===============================================================
++ // Effective address computation
++
++ void computeScaledAddress(const BaseIndex& address, Register dest) {
++ if (address.scale == TimesOne) {
++ as_add(dest, address.base, address.index);
++ } else if (dest != address.base && dest != address.index) {
++ x_sldi(dest, address.index, address.scale);
++ as_add(dest, address.base, dest);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ x_sldi(scratch, address.index, address.scale);
++ as_add(dest, address.base, scratch);
++ }
++ }
++
++ void computeEffectiveAddress(const Address& address, Register dest) {
++ if (address.offset == 0) {
++ if (dest != address.base) {
++ xs_mr(dest, address.base);
++ }
++ } else if (is_intN(address.offset, 16)) {
++ as_addi(dest, address.base, address.offset);
++ } else if (HasPOWER10() && is_intN(address.offset, 34)) {
++ // Single-insn 34-bit-signed reg+imm add. Avoids the scratch GPR.
++ as_paddi(dest, address.base, address.offset, /*R=*/false);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ as_add(dest, address.base, scratch);
++ }
++ }
++ void computeEffectiveAddress(const BaseIndex& address, Register dest) {
++ computeScaledAddress(address, dest);
++ if (address.offset) {
++ if (is_intN(address.offset, 16)) {
++ as_addi(dest, dest, address.offset);
++ } else if (HasPOWER10() && is_intN(address.offset, 34)) {
++ as_paddi(dest, dest, address.offset, /*R=*/false);
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ as_add(dest, dest, scratch);
++ }
++ }
++ }
++
++ // ===============================================================
++ // Move instructions
++
++ void mov(Register src, Register dest) { xs_mr(dest, src); }
++ void mov(ImmWord imm, Register dest) { movePtr(imm, dest); }
++ void mov(ImmPtr imm, Register dest) {
++ mov(ImmWord(uintptr_t(imm.value)), dest);
++ }
++ // Emit an 8-instruction NOP stanza for a patchable 64-bit load.
++ // Pool flushes are inhibited during emission to prevent pool data
++ // from being inserted mid-stanza.
++ BufferOffset emitLoad64Stanza(Register dest, uint64_t value) {
++ m_buffer.enterNoPool(kNoPoolLoad64StanzaInsns);
++ BufferOffset bo = writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ // If any of the 8 writeInst calls hit OOM, only some of the stanza
++ // was reserved in the buffer. WriteLoad64Instructions writes 32 bytes
++ // unconditionally, so calling it here would overflow the Vector's
++ // backing store and corrupt the next heap chunk's metadata, surfacing
++ // later as a malloc-detected free-time crash.
++ if (m_buffer.oom()) {
++ return bo;
++ }
++ WriteLoad64Instructions((Instruction*)editSrc(bo), dest, value);
++ return bo;
++ }
++
++ void mov(CodeLabel* label, Register dest) {
++ BufferOffset bo = emitLoad64Stanza(dest, LabelBase::INVALID_OFFSET);
++ label->patchAt()->bind(bo.getOffset());
++ label->setLinkMode(CodeLabel::MoveImmediate);
++ }
++ void mov(Register src, Address dest) { storePtr(src, dest); }
++ void mov(Address src, Register dest) { loadPtr(src, dest); }
++
++ void move32(Imm32 imm, Register dest) {
++ if (is_intN(imm.value, 16)) {
++ xs_li(dest, (int16_t)imm.value);
++ } else if (is_uintN((uint32_t)imm.value, 16)) {
++ xs_li(dest, 0);
++ as_ori(dest, dest, (uint16_t)imm.value);
++ } else {
++ xs_lis(dest, (int16_t)((uint32_t)imm.value >> 16));
++ if (imm.value & 0xffff) {
++ as_ori(dest, dest, (uint16_t)imm.value);
++ }
++ }
++ }
++ void move32(Register src, Register dest) { as_extsw(dest, src); }
++
++ void movePtr(Register src, Register dest) {
++ if (src != dest) {
++ xs_mr(dest, src);
++ }
++ }
++ void movePtr(ImmWord imm, Register dest) {
++ if (imm.value == 0) {
++ xs_li(dest, 0);
++ } else if (is_intN((intptr_t)imm.value, 16)) {
++ xs_li(dest, (int16_t)imm.value);
++ } else if (is_uintN(imm.value, 16)) {
++ xs_li(dest, 0);
++ as_ori(dest, dest, (uint16_t)imm.value);
++ } else if (is_intN((intptr_t)imm.value, 32)) {
++ // 32-bit signed: lis + ori (2 instructions).
++ xs_lis(dest, (int16_t)((uint32_t)imm.value >> 16));
++ if (imm.value & 0xFFFF) {
++ as_ori(dest, dest, (uint16_t)imm.value);
++ }
++ } else if (HasPOWER10() && is_intN((intptr_t)imm.value, 34)) {
++ // POWER10 single-instruction 34-bit signed immediate. Replaces the
++ // 5-insn fallback for values in (33-34)-bit signed range.
++ // 8 bytes vs 20 bytes; one slot temp register is no longer needed.
++ as_paddi(dest, r0, (int64_t)imm.value, /*R=*/false);
++ } else {
++ // Full 64-bit: GCC-style lis+ori+lis+ori+rldimi (5 instructions).
++ // No LR clobber, no embedded data — pure instruction sequence.
++ uint32_t lo32 = (uint32_t)(imm.value);
++ uint32_t hi32 = (uint32_t)(imm.value >> 32);
++ Register temp = (dest != SecondScratchReg) ? SecondScratchReg
++ : SavedScratchRegister;
++ m_buffer.ensureSpace(5 * sizeof(uint32_t));
++ xs_lis(dest, (int16_t)(lo32 >> 16));
++ as_ori(dest, dest, lo32 & 0xFFFF);
++ xs_lis(temp, (int16_t)(hi32 >> 16));
++ as_ori(temp, temp, hi32 & 0xFFFF);
++ as_rldimi(dest, temp, 32, 0);
++ }
++ }
++ void movePtr(ImmPtr imm, Register dest) {
++ movePtr(ImmWord(uintptr_t(imm.value)), dest);
++ }
++
++ // Load a 64-bit FPR constant from the inline constant pool.
++ // POWER9: 2 instructions (addpcis + lfd) -- no alignment constraint.
++ // POWER10: 1 prefixed instruction (plfd, 2 slots), or 3 slots in the
++ // (loadAddr & 63) == 60 alignment-leading-nop case. Reserve 3 to
++ // cover both cases conservatively.
++ // POWER8: not used -- loadConstantDouble inlines the constant.
++ BufferOffset loadFromPoolFloat64(FloatRegister dest, double value) {
++ size_t slots = HasPOWER10() ? 3 : 2;
++ uint32_t hint = (uint32_t(dest.encoding()) << 16) |
++ (uint32_t(PoolLoadFPR64) << 21) | 0xF0000000;
++ uint32_t inst[3] = {hint, NopInst, NopInst};
++ return m_buffer.allocEntry(slots, 2, (uint8_t*)inst, (uint8_t*)&value);
++ }
++ // Load a 32-bit FPR constant from the inline constant pool.
++ // Same shape as loadFromPoolFloat64 (above). lfs/plfs auto-expand the
++ // 32-bit single-precision value to double in the FPR, so no follow-up
++ // xscvspdpn is needed.
++ BufferOffset loadFromPoolFloat32(FloatRegister dest, float value) {
++ size_t slots = HasPOWER10() ? 3 : 2;
++ uint32_t hint = (uint32_t(dest.encoding()) << 16) |
++ (uint32_t(PoolLoadFPR32) << 21) | 0xF0000000;
++ uint32_t inst[3] = {hint, NopInst, NopInst};
++ return m_buffer.allocEntry(slots, 1, (uint8_t*)inst, (uint8_t*)&value);
++ }
++ // Load a 128-bit SIMD constant from the inline constant pool.
++ // Per-arch slot reservation -- the patcher writes only the slots
++ // each micro-arch actually needs:
++ // P8: 5 (bcl + mflr + addi + lxvd2x + xxpermdi)
++ // P9: 3 (addpcis + addi + lxvx) -- no LR touch, no RAS hazard
++ // P10: 3 (alignment-safe: prefix + suffix + 1 reserve for the
++ // (loadAddr & 63) == 60 leading-nop case)
++ // Pool entry is 4 × 4-byte words = 16 bytes. P9 uses
++ // SavedScratchRegister (r16) as the PC base; P10 emits a single
++ // PC-relative plxv with no scratch and no LR touch. Only P8 still
++ // clobbers LR (correctness-only fallback; live by design).
++ BufferOffset loadFromPoolSimd128(FloatRegister dest,
++ const SimdConstant& v) {
++ size_t slots;
++ if (HasPOWER10()) {
++ slots = 3;
++ } else if (HasPOWER9()) {
++ slots = 3;
++ } else {
++ slots = 5;
++ }
++ // Simd128 encoding is 32-63; mask to 5 bits for hint.
++ // PatchConstantPoolLoad sets TX bit unconditionally for Simd128.
++ uint32_t hint = ((uint32_t(dest.encoding()) & 0x1F) << 16) |
++ (uint32_t(PoolLoadSimd128) << 21) | 0xF0000000;
++ uint32_t inst[5] = {hint, NopInst, NopInst, NopInst, NopInst};
++ return m_buffer.allocEntry(slots, 4, (uint8_t*)inst, (uint8_t*)v.bytes());
++ }
++ void movePtr(wasm::SymbolicAddress imm, Register dest) {
++ BufferOffset bo = emitLoad64Stanza(dest, (uint64_t)-1);
++ append(wasm::SymbolicAccess(CodeOffset(bo.getOffset()), imm));
++ }
++ void movePtr(ImmGCPtr imm, Register dest) {
++ BufferOffset bo = emitLoad64Stanza(dest,
++ (uint64_t)uintptr_t(imm.value));
++ Assembler::writeDataRelocation(bo, imm);
++ }
++
++ void moveFloat32(FloatRegister src, FloatRegister dest) {
++ if (src != dest) {
++ as_fmr(dest, src);
++ }
++ }
++ void moveDouble(FloatRegister src, FloatRegister dest) {
++ if (src != dest) {
++ as_fmr(dest, src);
++ }
++ }
++
++ // ===============================================================
++ // Branch functions
++
++ void branch(JitCode* c) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ BufferOffset bo = emitLoad64Stanza(scratch, (uint64_t)uintptr_t(c->raw()));
++ addPendingJump(bo, ImmPtr(c->raw()), RelocationKind::JITCODE);
++ xs_mtctr(scratch);
++ as_bctr();
++ }
++ void branch(const Register reg) {
++ xs_mtctr(reg);
++ as_bctr();
++ }
++
++ void jump(Label* label) {
++ if (label->bound()) {
++ // Open the no-pool window BEFORE computing the displacement. The
++ // enterNoPool() call itself can trigger a pool flush, which advances
++ // currentOffset(). Computing the displacement against the pre-flush
++ // offset and then emitting the b at the post-flush offset would land
++ // the branch (poolSize) bytes past the intended target.
++ m_buffer.enterNoPool(2);
++ int32_t offset = label->offset() - currentOffset();
++ if (JOffImm26::IsInRange(offset)) {
++ as_b(offset);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ return;
++ }
++ m_buffer.leaveNoPool();
++ // Long jump to bound label.
++ BufferOffset bo =
++ emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
++ xs_mtctr(SecondScratchReg);
++ as_bctr();
++ addLongJump(bo, BufferOffset(label->offset()));
++ return;
++ }
++ // Unbound label: emit trap-tagged stanza (10 slots).
++ m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
++ BufferOffset bo = xs_trap_tagged(BTag);
++ writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ if (!oom()) {
++ label->use(bo.getOffset());
++ }
++ }
++ void jump(Register reg) {
++ xs_mtctr(reg);
++ as_bctr();
++ }
++ void jump(const Address& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ loadPtr(address, scratch);
++ xs_mtctr(scratch);
++ as_bctr();
++ }
++ void jump(JitCode* code) { branch(code); }
++ void jump(ImmPtr ptr) {
++ BufferOffset bo =
++ emitLoad64Stanza(SecondScratchReg, (uint64_t)uintptr_t(ptr.value));
++ addPendingJump(bo, ptr, RelocationKind::HARDCODED);
++ xs_mtctr(SecondScratchReg);
++ as_bctr();
++ }
++ void jump(TrampolinePtr code) { jump(ImmPtr(code.value)); }
++
++ // Conditional branch to label. Assumes a compare instruction has already
++ // been emitted that sets CR0.
++ template <typename CondT>
++ void ma_b(CondT cond, Label* label) {
++ if constexpr (std::is_same_v<CondT, Condition>) {
++ if (cond == Always) {
++ jump(label);
++ return;
++ }
++ }
++ if (label->bound()) {
++ // Open the no-pool window BEFORE computing the displacement. Same
++ // hazard as jump(): enterNoPool may itself flush a pending pool,
++ // advancing currentOffset(); the bc must emit with a displacement
++ // computed against the post-flush offset. Budget covers max 6
++ // instructions: POWER8 Overflow XER ops (3) + cror (1) + bc (1) +
++ // nop (1) for the worst-case DoubleCondition+Overflow short path.
++ m_buffer.enterNoPool(6);
++ // For DoubleCondition, as_bc emits cror/crandc before the bc
++ // instruction, advancing currentOffset() by 4. Account for this
++ // in the offset calculation.
++ int32_t crAdjust = 0;
++ if constexpr (std::is_same_v<CondT, DoubleCondition>) {
++ crAdjust = -(int32_t)sizeof(uint32_t);
++ }
++ int32_t offset = label->offset() - currentOffset() + crAdjust;
++ if (BOffImm16::IsInRange(offset)) {
++ as_bc((int16_t)offset, cond);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ return;
++ }
++ m_buffer.leaveNoPool();
++ // Long conditional branch for bound label.
++ // XER ops(0-3) + cror(0-1) + bc(1) + stanza(8) + mtctr(1) + bctr(1).
++ // P8 Overflow: mfxer+rlwinm+mtcrf+bc+stanza+mtctr+bctr = 14 max.
++ m_buffer.enterNoPool(kNoPoolCondLongBranchInsnsP8Max);
++ as_bc((int16_t)44, InvertCondition(cond));
++ BufferOffset boLoad =
++ emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
++ xs_mtctr(SecondScratchReg);
++ as_bctr();
++ m_buffer.leaveNoPool();
++ addLongJump(boLoad, BufferOffset(label->offset()));
++ return;
++ }
++ // Forward reference: emit BCTag stanza.
++ // XER ops(0-3) + cror(0-1) + bc(1) + trap_tagged(1) + chain(1) + 8 NOPs.
++ // P8 Overflow: mfxer+rlwinm+mtcrf+bc+trap+chain+8NOPs = 14 max.
++ m_buffer.enterNoPool(kNoPoolCondLongBranchInsnsP8Max);
++ as_bc((int16_t)44, InvertCondition(cond));
++ BufferOffset bo = xs_trap_tagged(BCTag);
++ writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ writeInst(NopInst);
++ m_buffer.leaveNoPool();
++ if (!oom()) {
++ label->use(bo.getOffset());
++ }
++ }
++
++ // Set dest = 1 if CR0 satisfies cond, else dest = 0.
++ // POWER10: setbc/setbcr (1 insn). P8/P9: isel-based path with the
++ // r0-as-zero trick on the BranchOnClear half.
++ void ma_cmp_set(Register dest, Condition cond) {
++ uint32_t base = uint32_t(cond) & 0xff;
++ uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
++ if (HasPOWER10()) {
++ if ((base & BranchOptionMask) == BranchOnSet) {
++ as_setbc(dest, setbase, cr0);
++ } else {
++ as_setbcr(dest, setbase, cr0);
++ }
++ return;
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ xs_li(scratch, 1);
++ if ((base & BranchOptionMask) == BranchOnSet) {
++ xs_li(dest, 0);
++ as_isel(dest, scratch, dest, setbase, cr0);
++ } else {
++ as_isel0(dest, r0, scratch, setbase, cr0);
++ }
++ }
++
++ void ma_cmp_set_dbl(Register dest, DoubleCondition cond) {
++ uint32_t base = uint32_t(cond) & 0xff;
++ bool hasUnorderedFlag = uint32_t(cond) & DoubleConditionUnordered;
++ uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ if (HasPOWER10()) {
++ if ((base & BranchOptionMask) == BranchOnSet) {
++ as_setbc(dest, setbase, cr0);
++ } else {
++ as_setbcr(dest, setbase, cr0);
++ }
++ // Fixup paths below still need scratch=1 for the SO-isel.
++ if (hasUnorderedFlag || ((base & BranchOptionMask) != BranchOnSet &&
++ cond != DoubleOrdered)) {
++ xs_li(scratch, 1);
++ }
++ } else {
++ xs_li(scratch, 1);
++ if ((base & BranchOptionMask) == BranchOnSet) {
++ xs_li(dest, 0);
++ as_isel(dest, scratch, dest, setbase, cr0);
++ } else {
++ as_isel0(dest, r0, scratch, setbase, cr0);
++ }
++ }
++ if (hasUnorderedFlag) {
++ // Condition includes unordered (NaN): force dest=1 when SO is set.
++ // isel dest, scratch(=1), dest, SO
++ as_isel(dest, scratch, dest, uint16_t(SOBit), cr0);
++ } else if ((base & BranchOptionMask) != BranchOnSet &&
++ cond != DoubleOrdered) {
++ // Ordered comparison that negates a CR bit (BranchOnClear): NaN
++ // produces all-zero LT/GT/EQ bits which makes the negation return
++ // true. Fix by forcing dest=0 when SO is set.
++ as_isel0(dest, r0, dest, uint16_t(SOBit), cr0);
++ }
++ }
++
++ // Conditional move: if CR0 satisfies cond, dest = src.
++ void ma_cmp_move(Register dest, Register src, Condition cond) {
++ uint32_t base = uint32_t(cond) & 0xff;
++ uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
++ if ((base & BranchOptionMask) == BranchOnSet) {
++ as_isel(dest, src, dest, setbase, cr0);
++ } else {
++ as_isel(dest, dest, src, setbase, cr0);
++ }
++ }
++
++ // If cond == 0, move src to dst; otherwise dst is unchanged. The only
++ // callers are wasm select, whose condition is a 32-bit value: test its
++ // 32-bit sign with cmpwi so high-bit garbage (e.g. under register pressure)
++ // does not make a zero condition read as non-zero.
++ void moveIfZero(Register dst, Register src, Register cond) {
++ as_cmpwi(cond, 0);
++ as_isel(dst, src, dst, Equal, cr0);
++ }
++
++ void ma_add32TestCarry(Condition cond, Register rd, Register rs, Imm32 imm,
++ Label* overflow);
++ void ma_addPtrTestCarry(Condition cond, Register rd, Register rs, ImmWord imm,
++ Label* overflow);
++
++ // Issue the correct compare instruction for the given condition and
++ // operand sizes. Returns the condition to use with ma_b or ma_cmp_set
++ // (usually the same, but unsigned conditions use cmpl* variants).
++ Condition ma_cmp(Register lhs, Register rhs, Condition cond,
++ bool is32bit = false) {
++ Condition base =
++ static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++ bool isUnsigned = (cond & ConditionUnsigned) != 0;
++ // ConditionZero-flagged conditions (Signed, NotSigned, Zero, NonZero)
++ // test a single register against zero, not two registers against each
++ // other. Compare against immediate 0.
++ if ((cond & ConditionZero) != 0) {
++ if (is32bit) {
++ as_cmpwi(lhs, 0);
++ } else {
++ as_cmpdi(lhs, 0);
++ }
++ return base;
++ }
++ if (is32bit) {
++ if (isUnsigned) {
++ as_cmplw(lhs, rhs);
++ } else {
++ as_cmpw(lhs, rhs);
++ }
++ } else {
++ if (isUnsigned) {
++ as_cmpld(lhs, rhs);
++ } else {
++ as_cmpd(lhs, rhs);
++ }
++ }
++ return base;
++ }
++
++ Condition ma_cmp(Register lhs, Imm32 rhs, Condition cond,
++ bool is32bit = false) {
++ Condition base =
++ static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++ bool isUnsigned = (cond & ConditionUnsigned) != 0;
++ if (isUnsigned) {
++ if (is_uintN(rhs.value, 16)) {
++ if (is32bit) {
++ as_cmplwi(lhs, rhs.value);
++ } else {
++ as_cmpldi(lhs, rhs.value);
++ }
++ return base;
++ }
++ } else {
++ if (is_intN(rhs.value, 16)) {
++ if (is32bit) {
++ as_cmpwi(lhs, rhs.value);
++ } else {
++ as_cmpdi(lhs, rhs.value);
++ }
++ return base;
++ }
++ }
++ // Immediate doesn't fit — materialize into scratch and compare.
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(rhs, scratch);
++ return ma_cmp(lhs, scratch, cond, is32bit);
++ }
++
++ Condition ma_cmp(Register lhs, ImmWord rhs, Condition cond) {
++ Condition base =
++ static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++ bool isUnsigned = (cond & ConditionUnsigned) != 0;
++ if (isUnsigned) {
++ if (is_uintN(rhs.value, 16)) {
++ as_cmpldi(lhs, rhs.value);
++ return base;
++ }
++ } else {
++ if (is_intN(rhs.value, 16)) {
++ as_cmpdi(lhs, rhs.value);
++ return base;
++ }
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(rhs, scratch);
++ return ma_cmp(lhs, scratch, cond);
++ }
++
++ Condition ma_cmp(Register lhs, ImmPtr rhs, Condition cond) {
++ return ma_cmp(lhs, ImmWord(uintptr_t(rhs.value)), cond);
++ }
++
++ Condition ma_cmp(Register lhs, ImmGCPtr rhs, Condition cond) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(rhs, scratch);
++ return ma_cmp(lhs, scratch, cond);
++ }
++
++ Condition ma_cmp(Register lhs, ImmTag rhs, Condition cond) {
++ // Tag values on PUNBOX64 are 17-bit (0x1FFF0+), too large for 16-bit
++ // signed or unsigned immediates.
++ Condition base =
++ static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
++ bool isUnsigned = (cond & ConditionUnsigned) != 0;
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(Imm32(rhs.value), scratch);
++ if (isUnsigned) {
++ as_cmpld(lhs, scratch);
++ } else {
++ as_cmpd(lhs, scratch);
++ }
++ return base;
++ }
++
++ // Compare a tag register against an ImmTag constant and branch, WITHOUT
++ // acquiring a scratch register. Uses xoris+cmplwi which MODIFIES tagReg.
++ // Only safe when tagReg is a scratch register owned by the caller.
++ void branchTestTag(Condition cond, Register tagReg, ImmTag tag, Label* label) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ uint32_t t = tag.value;
++ as_xoris(tagReg, tagReg, t >> 16);
++ as_cmplwi(tagReg, t & 0xFFFF);
++ Condition c = (cond == Equal) ? Equal : NotEqual;
++ ma_b(c, label);
++ }
++
++ void ma_mod_mask(Register src, Register dest, Register hold, Register remain,
++ int32_t shift, Label* negZero = nullptr);
++
++ void nop() { writeInst(NopInst); }
++ void breakpoint(uint32_t value = 0) { xs_trap(); }
++
++ inline void retn(Imm32 n);
++
++ // ===============================================================
++ // Stack operations
++
++ void push(Imm32 imm) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ push(scratch);
++ }
++ void push(ImmWord imm) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(imm, scratch);
++ push(scratch);
++ }
++ void push(ImmGCPtr imm) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(imm, scratch);
++ push(scratch);
++ }
++ void push(const Address& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ loadPtr(address, scratch);
++ push(scratch);
++ }
++ void push(Register reg) { as_stdu(reg, StackPointer, -8); }
++ void push(FloatRegister reg) {
++ // stfdu/stfsu fuses the SP decrement and the FP store: EA=SP-8,
++ // MEM[EA]=reg, SP=EA. 1 insn instead of addi+stfd/stfs.
++ if (reg.isSingle()) {
++ as_stfsu(reg, StackPointer, -8);
++ } else {
++ as_stfdu(reg, StackPointer, -8);
++ }
++ }
++ void pop(Register reg) {
++ as_ld(reg, StackPointer, 0);
++ as_addi(StackPointer, StackPointer, 8);
++ }
++ void pop(FloatRegister reg) {
++ if (reg.isSingle()) {
++ as_lfs(reg, StackPointer, 0);
++ } else {
++ as_lfd(reg, StackPointer, 0);
++ }
++ as_addi(StackPointer, StackPointer, 8);
++ }
++
++ CodeOffset pushWithPatch(ImmWord imm) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ CodeOffset offset = movWithPatch(imm, scratch);
++ push(scratch);
++ return offset;
++ }
++ CodeOffset movWithPatch(ImmWord imm, Register dest) {
++ BufferOffset bo = emitLoad64Stanza(dest, (uint64_t)imm.value);
++ return CodeOffset(bo.getOffset());
++ }
++ CodeOffset movWithPatch(ImmPtr imm, Register dest) {
++ return movWithPatch(ImmWord(uintptr_t(imm.value)), dest);
++ }
++
++ // ===============================================================
++ // Tag/unbox operations
++
++ void splitTag(Register src, Register dest) {
++ x_srdi(dest, src, JSVAL_TAG_SHIFT);
++ }
++ void splitTag(const ValueOperand& operand, Register dest) {
++ splitTag(operand.valueReg(), dest);
++ }
++ void splitTagForTest(const ValueOperand& value, ScratchTagScope& tag) {
++ splitTag(value, tag);
++ }
++
++ void unboxNonDouble(const ValueOperand& operand, Register dest,
++ JSValueType type) {
++ unboxNonDouble(operand.valueReg(), dest, type);
++ }
++ template <typename T>
++ void unboxNonDouble(T src, Register dest, JSValueType type) {
++ MOZ_ASSERT(type != JSVAL_TYPE_DOUBLE);
++ if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
++ load32(src, dest);
++ return;
++ }
++ loadPtr(src, dest);
++ unboxNonDouble(dest, dest, type);
++ }
++ void unboxNonDouble(Register src, Register dest, JSValueType type) {
++ MOZ_ASSERT(type != JSVAL_TYPE_DOUBLE);
++ if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
++ as_extsw(dest, src);
++ return;
++ }
++ // Extract the payload (lower 47 bits) by clearing the tag.
++ // This avoids acquiring a scratch register, preventing pool exhaustion
++ // when called from nested scratch scopes (e.g., ScratchTagScope →
++ // branchTestStringTruthy → unboxString → here).
++ // rldicl dest, src, 0, 17 — clear upper 17 bits (tag), keep lower 47.
++ as_rldicl(dest, src, 0, 17);
++ }
++ void unboxGCThingForGCBarrier(const Address& src, Register dest) {
++ loadPtr(src, dest);
++ // Clear tag bits (top 17 bits on 64-bit).
++ as_rldicl(dest, dest, 0, 64 - JSVAL_TAG_SHIFT);
++ }
++ void unboxGCThingForGCBarrier(const ValueOperand& src, Register dest) {
++ as_rldicl(dest, src.valueReg(), 0, 64 - JSVAL_TAG_SHIFT);
++ }
++ void unboxWasmAnyRefGCThingForGCBarrier(const Address& src, Register dest) {
++ static_assert(wasm::AnyRef::TagShift == 2);
++ loadPtr(src, dest);
++ as_rldicr(dest, dest, 0, 61);
++ }
++ void getGCThingValueChunk(const Address& src, Register dest) {
++ loadPtr(src, dest);
++ as_rldicl(dest, dest, 0, 17);
++ as_rldicr(dest, dest, 0, 43);
++ }
++ void getGCThingValueChunk(const ValueOperand& src, Register dest) {
++ as_rldicl(dest, src.valueReg(), 0, 17);
++ as_rldicr(dest, dest, 0, 43);
++ }
++
++ void boxDouble(FloatRegister src, const ValueOperand& dest, FloatRegister) {
++ as_mfvsrd(dest.valueReg(), src);
++ }
++ void boxNonDouble(JSValueType type, Register src, const ValueOperand& dest) {
++ boxValue(type, src, dest.valueReg());
++ }
++ void boxNonDouble(Register type, Register src, const ValueOperand& dest) {
++ boxValue(type, src, dest.valueReg());
++ }
++ void unboxInt32(const ValueOperand& operand, Register dest) {
++ as_extsw(dest, operand.valueReg());
++ }
++ void unboxInt32(const Address& src, Register dest) { load32(src, dest); }
++ void unboxInt32(const BaseIndex& src, Register dest) { load32(src, dest); }
++ void unboxBoolean(const ValueOperand& operand, Register dest) {
++ as_extsw(dest, operand.valueReg());
++ }
++ void unboxBoolean(const Address& src, Register dest) { load32(src, dest); }
++ void unboxBoolean(const BaseIndex& src, Register dest) { load32(src, dest); }
++ void unboxDouble(const ValueOperand& operand, FloatRegister dest) {
++ as_mtvsrd(dest, operand.valueReg());
++ }
++ void unboxDouble(const Address& src, FloatRegister dest) {
++ loadDouble(src, dest);
++ }
++ void unboxDouble(const BaseIndex& src, FloatRegister dest) {
++ loadDouble(src, dest);
++ }
++ void unboxString(const ValueOperand& operand, Register dest) {
++ unboxNonDouble(operand, dest, JSVAL_TYPE_STRING);
++ }
++ void unboxString(const Address& src, Register dest) {
++ unboxNonDouble(src, dest, JSVAL_TYPE_STRING);
++ }
++ void unboxSymbol(const ValueOperand& operand, Register dest) {
++ unboxNonDouble(operand, dest, JSVAL_TYPE_SYMBOL);
++ }
++ void unboxSymbol(const Address& src, Register dest) {
++ unboxNonDouble(src, dest, JSVAL_TYPE_SYMBOL);
++ }
++ void unboxBigInt(const ValueOperand& operand, Register dest) {
++ unboxNonDouble(operand, dest, JSVAL_TYPE_BIGINT);
++ }
++ void unboxBigInt(const Address& src, Register dest) {
++ unboxNonDouble(src, dest, JSVAL_TYPE_BIGINT);
++ }
++ void unboxObject(const ValueOperand& src, Register dest) {
++ unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
++ }
++ void unboxObject(const Address& src, Register dest) {
++ unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
++ }
++ void unboxObject(const BaseIndex& src, Register dest) {
++ unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
++ }
++ void unboxValue(const ValueOperand& src, AnyRegister dest, JSValueType type) {
++ if (dest.isFloat()) {
++ unboxDouble(src, dest.fpu());
++ } else {
++ unboxNonDouble(src, dest.gpr(), type);
++ }
++ }
++ void unboxObjectOrNull(const Address& src, Register dest) {
++ loadPtr(src, dest);
++ // Object pointers have the object tag in high bits; null has a different
++ // tag. Clear the top bits to get either a valid pointer or zero.
++ as_rldicl(dest, dest, 0, 64 - JSVAL_TAG_SHIFT);
++ }
++
++ void tagValue(JSValueType type, Register payload, ValueOperand dest) {
++ MOZ_ASSERT(type != JSVAL_TYPE_UNDEFINED && type != JSVAL_TYPE_NULL);
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != payload && scratch != dest.valueReg());
++ tagValueWithScratch(type, payload, dest, scratch);
++ }
++ void tagValueWithScratch(JSValueType type, Register payload,
++ ValueOperand dest, Register scratch) {
++ movePtr(ImmShiftedTag(type), scratch);
++ if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN ||
++ type == JSVAL_TYPE_MAGIC) {
++ if (payload != dest.valueReg()) {
++ movePtr(payload, dest.valueReg());
++ }
++ as_rldicl(dest.valueReg(), dest.valueReg(), 0, 32);
++ as_or_(dest.valueReg(), dest.valueReg(), scratch);
++ } else {
++ if (payload != dest.valueReg()) {
++ movePtr(payload, dest.valueReg());
++ }
++ as_or_(dest.valueReg(), dest.valueReg(), scratch);
++ }
++ }
++ void boxValue(JSValueType type, Register src, Register dest) {
++ MOZ_ASSERT(src != dest);
++ MOZ_ASSERT(type != JSVAL_TYPE_UNDEFINED && type != JSVAL_TYPE_NULL);
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ boxValueWithScratch(type, src, dest, scratch);
++ }
++ void boxValueWithScratch(JSValueType type, Register src, Register dest,
++ Register scratch) {
++ if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN ||
++ type == JSVAL_TYPE_MAGIC) {
++ as_rldicl(dest, src, 0, 32);
++ movePtr(ImmShiftedTag(type), scratch);
++ as_or_(dest, dest, scratch);
++ } else {
++ movePtr(ImmShiftedTag(type), scratch);
++ xs_mr(dest, src);
++ as_or_(dest, dest, scratch);
++ }
++ }
++ void boxValue(Register type, Register src, Register dest) {
++ MOZ_ASSERT(src != dest);
++
++#ifdef DEBUG
++ Label done, isNullOrUndefined, isBoolean, isInt32OrMagic;
++
++ // Use ma_cmp + ma_b instead of asMasm().branch32() because
++ // MacroAssembler is not yet fully defined at this point.
++ Condition cond;
++ cond = ma_cmp(type, Imm32(JSVAL_TYPE_NULL), Equal, true);
++ ma_b(cond, &isNullOrUndefined);
++ cond = ma_cmp(type, Imm32(JSVAL_TYPE_UNDEFINED), Equal, true);
++ ma_b(cond, &isNullOrUndefined);
++ cond = ma_cmp(type, Imm32(JSVAL_TYPE_BOOLEAN), Equal, true);
++ ma_b(cond, &isBoolean);
++ cond = ma_cmp(type, Imm32(JSVAL_TYPE_INT32), Equal, true);
++ ma_b(cond, &isInt32OrMagic);
++ cond = ma_cmp(type, Imm32(JSVAL_TYPE_MAGIC), Equal, true);
++ ma_b(cond, &isInt32OrMagic);
++ // GCThing types aren't supported, because as_rldicl truncates
++ // payloads above UINT32_MAX.
++ breakpoint();
++ {
++ bind(&isNullOrUndefined);
++
++ // Ensure no payload for null and undefined.
++ cond = ma_cmp(src, ImmWord(0), Equal);
++ ma_b(cond, &done);
++ breakpoint();
++ }
++ {
++ bind(&isBoolean);
++
++ // Ensure boolean values are either 0 or 1.
++ cond = ma_cmp(src, Imm32(1), BelowOrEqual, true);
++ ma_b(cond, &done);
++ breakpoint();
++ }
++ {
++ bind(&isInt32OrMagic);
++
++ // Ensure |src| is sign-extended.
++ UseScratchRegisterScope debugTemps(*this);
++ Register debugScratch = debugTemps.Acquire();
++ as_extsw(debugScratch, src);
++ cond = ma_cmp(src, debugScratch, Equal);
++ ma_b(cond, &done);
++ breakpoint();
++ }
++ bind(&done);
++#endif
++
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest && scratch != src && scratch != type);
++ // Build tag: (type | JSVAL_TAG_MAX_DOUBLE) << JSVAL_TAG_SHIFT
++ move32(Imm32(JSVAL_TAG_MAX_DOUBLE), scratch);
++ as_or_(scratch, scratch, type);
++ x_sldi(scratch, scratch, JSVAL_TAG_SHIFT);
++ // Insert 32-bit payload.
++ as_rldicl(dest, src, 0, 32);
++ as_or_(dest, dest, scratch);
++ }
++
++ // ===============================================================
++ // Value store/load/push/pop
++
++ void storeValue(ValueOperand val, const Address& dest) {
++ storePtr(val.valueReg(), dest);
++ }
++ void storeValue(ValueOperand val, const BaseIndex& dest) {
++ storePtr(val.valueReg(), dest);
++ }
++ void storeValue(JSValueType type, Register reg, Address dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.base != scratch);
++ boxValue(type, reg, scratch);
++ storePtr(scratch, dest);
++ }
++ void storeValue(JSValueType type, Register reg, BaseIndex dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.base != scratch);
++ boxValue(type, reg, scratch);
++ storePtr(scratch, dest);
++ }
++ void storeValue(const Value& val, Address dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.base != scratch);
++ if (val.isGCThing()) {
++ CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
++ writeDataRelocation(off, val);
++ } else {
++ movePtr(ImmWord(val.asRawBits()), scratch);
++ }
++ storePtr(scratch, dest);
++ }
++ void storeValue(const Value& val, BaseIndex dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(dest.base != scratch);
++ if (val.isGCThing()) {
++ CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
++ writeDataRelocation(off, val);
++ } else {
++ movePtr(ImmWord(val.asRawBits()), scratch);
++ }
++ storePtr(scratch, dest);
++ }
++ void storeValue(const Address& src, const Address& dest, Register temp) {
++ loadPtr(src, temp);
++ storePtr(temp, dest);
++ }
++
++ void storePrivateValue(Register src, const Address& dest) {
++ storePtr(src, dest);
++ }
++ void storePrivateValue(ImmGCPtr imm, const Address& dest) {
++ storePtr(imm, dest);
++ }
++
++ void loadValue(Address src, ValueOperand val) {
++ loadPtr(src, val.valueReg());
++ }
++ void loadValue(const BaseIndex& src, ValueOperand val) {
++ loadPtr(src, val.valueReg());
++ }
++ void loadUnalignedValue(const Address& src, ValueOperand dest) {
++ loadPtr(src, dest.valueReg());
++ }
++
++ void pushValue(ValueOperand val) { push(val.valueReg()); }
++ void popValue(ValueOperand val) { pop(val.valueReg()); }
++ void pushValue(const Value& val) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ if (val.isGCThing()) {
++ CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
++ writeDataRelocation(off, val);
++ } else {
++ movePtr(ImmWord(val.asRawBits()), scratch);
++ }
++ push(scratch);
++ }
++ void pushValue(JSValueType type, Register reg) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ boxValue(type, reg, scratch);
++ push(scratch);
++ }
++ void pushValue(const Address& addr) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ loadPtr(addr, scratch);
++ push(scratch);
++ }
++ void pushValue(const BaseIndex& addr, Register scratch) {
++ loadPtr(addr, scratch);
++ push(scratch);
++ }
++
++ // ===============================================================
++ // Load instructions
++
++ FaultingCodeOffset load8SignExtend(const Address& address, Register dest) {
++ FaultingCodeOffset fco;
++ if (is_intN(address.offset, 16)) {
++ fco = FaultingCodeOffset(
++ as_lbz(dest, address.base, address.offset).getOffset());
++ } else {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ fco =
++ FaultingCodeOffset(as_lbzx(dest, address.base, scratch).getOffset());
++ }
++ as_extsb(dest, dest);
++ return fco;
++ }
++ FaultingCodeOffset load8SignExtend(const BaseIndex& src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ FaultingCodeOffset fco;
++ if (is_intN(src.offset, 16)) {
++ fco = FaultingCodeOffset(as_lbz(dest, scratch, src.offset).getOffset());
++ } else {
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(src.offset), dest);
++ fco = FaultingCodeOffset(as_lbzx(dest, scratch, dest).getOffset());
++ }
++ as_extsb(dest, dest);
++ return fco;
++ }
++ FaultingCodeOffset load8ZeroExtend(const Address& address, Register dest) {
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_lbz(dest, address.base, address.offset).getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_lbzx(dest, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset load8ZeroExtend(const BaseIndex& src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (is_intN(src.offset, 16)) {
++ return FaultingCodeOffset(as_lbz(dest, scratch, src.offset).getOffset());
++ }
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(src.offset), dest);
++ return FaultingCodeOffset(as_lbzx(dest, scratch, dest).getOffset());
++ }
++ FaultingCodeOffset load16SignExtend(const Address& address, Register dest) {
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_lha(dest, address.base, address.offset).getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_lhax(dest, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset load16SignExtend(const BaseIndex& src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (is_intN(src.offset, 16)) {
++ return FaultingCodeOffset(as_lha(dest, scratch, src.offset).getOffset());
++ }
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(src.offset), dest);
++ return FaultingCodeOffset(as_lhax(dest, scratch, dest).getOffset());
++ }
++ template <typename S>
++ void load16UnalignedSignExtend(const S& src, Register dest) {
++ load16SignExtend(src, dest);
++ }
++ FaultingCodeOffset load16ZeroExtend(const Address& address, Register dest) {
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_lhz(dest, address.base, address.offset).getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_lhzx(dest, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset load16ZeroExtend(const BaseIndex& src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (is_intN(src.offset, 16)) {
++ return FaultingCodeOffset(as_lhz(dest, scratch, src.offset).getOffset());
++ }
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(src.offset), dest);
++ return FaultingCodeOffset(as_lhzx(dest, scratch, dest).getOffset());
++ }
++ template <typename S>
++ void load16UnalignedZeroExtend(const S& src, Register dest) {
++ load16ZeroExtend(src, dest);
++ }
++
++ FaultingCodeOffset load32(const Address& address, Register dest) {
++ // lwa is DS-form (14-bit displacement × 4 = 16-bit-signed effective
++ // range, 4-byte alignment required). lwax is X-form indexed, no
++ // alignment constraint. Both sign-extend in one instruction; only
++ // the misaligned 16-bit-fitting case still needs lwz + extsw.
++ if (is_intN(address.offset, 16) && (address.offset & 3) == 0) {
++ return FaultingCodeOffset(
++ as_lwa(dest, address.base, address.offset).getOffset());
++ }
++ if (is_intN(address.offset, 16)) {
++ FaultingCodeOffset fco(
++ as_lwz(dest, address.base, address.offset).getOffset());
++ as_extsw(dest, dest);
++ return fco;
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_lwax(dest, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset load32(const BaseIndex& address, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(address, scratch);
++ if (is_intN(address.offset, 16) && (address.offset & 3) == 0) {
++ return FaultingCodeOffset(
++ as_lwa(dest, scratch, address.offset).getOffset());
++ }
++ if (is_intN(address.offset, 16)) {
++ FaultingCodeOffset fco(as_lwz(dest, scratch, address.offset).getOffset());
++ as_extsw(dest, dest);
++ return fco;
++ }
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), dest);
++ return FaultingCodeOffset(as_lwax(dest, scratch, dest).getOffset());
++ }
++ void load32(AbsoluteAddress address, Register dest) {
++ movePtr(ImmWord((uintptr_t)address.addr), dest);
++ as_lwa(dest, dest, 0);
++ }
++ void load32(wasm::SymbolicAddress address, Register dest) {
++ movePtr(address, dest);
++ as_lwa(dest, dest, 0);
++ }
++ template <typename S>
++ void load32Unaligned(const S& src, Register dest) {
++ load32(src, dest);
++ }
++
++ FaultingCodeOffset load64(const Address& address, Register64 dest) {
++ return loadPtr(address, dest.reg);
++ }
++ FaultingCodeOffset load64(const BaseIndex& address, Register64 dest) {
++ return loadPtr(address, dest.reg);
++ }
++ template <typename S>
++ void load64Unaligned(const S& src, Register64 dest) {
++ load64(src, dest);
++ }
++
++ FaultingCodeOffset loadPtr(const Address& address, Register dest) {
++ // as_ld (DS-form) requires 4-byte aligned offset.
++ if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
++ return FaultingCodeOffset(
++ as_ld(dest, address.base, address.offset).getOffset());
++ }
++ if (HasPOWER10() && is_intN((intptr_t)address.offset, 34)) {
++ return FaultingCodeOffset(
++ as_pld(dest, address.base, (int64_t)address.offset, /*R=*/false)
++ .getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_ldx(dest, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset loadPtr(const BaseIndex& src, Register dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (is_intN(src.offset, 16) && !(src.offset & 0x3)) {
++ return FaultingCodeOffset(as_ld(dest, scratch, src.offset).getOffset());
++ }
++ MOZ_ASSERT(scratch != dest);
++ movePtr(ImmWord(src.offset), dest);
++ return FaultingCodeOffset(as_ldx(dest, scratch, dest).getOffset());
++ }
++ void loadPtr(AbsoluteAddress address, Register dest) {
++ movePtr(ImmWord((uintptr_t)address.addr), dest);
++ as_ld(dest, dest, 0);
++ }
++ void loadPtr(wasm::SymbolicAddress address, Register dest) {
++ movePtr(address, dest);
++ as_ld(dest, dest, 0);
++ }
++
++ void loadPrivate(const Address& address, Register dest) {
++ loadPtr(address, dest);
++ }
++
++ FaultingCodeOffset loadDouble(const Address& addr, FloatRegister dest) {
++ if (is_intN(addr.offset, 16)) {
++ return FaultingCodeOffset(
++ as_lfd(dest, addr.base, addr.offset).getOffset());
++ }
++ if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++ return FaultingCodeOffset(
++ as_plfd(dest, addr.base, (int64_t)addr.offset, /*R=*/false)
++ .getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(addr.offset), scratch);
++ return FaultingCodeOffset(as_lfdx(dest, addr.base, scratch).getOffset());
++ }
++ FaultingCodeOffset loadDouble(const BaseIndex& src, FloatRegister dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (is_intN(src.offset, 16)) {
++ return FaultingCodeOffset(as_lfd(dest, scratch, src.offset).getOffset());
++ }
++ Register scratch2 = temps.Acquire();
++ movePtr(ImmWord(src.offset), scratch2);
++ return FaultingCodeOffset(as_lfdx(dest, scratch, scratch2).getOffset());
++ }
++ FaultingCodeOffset loadFloat32(const Address& addr, FloatRegister dest) {
++ if (is_intN(addr.offset, 16)) {
++ return FaultingCodeOffset(
++ as_lfs(dest, addr.base, addr.offset).getOffset());
++ }
++ if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
++ return FaultingCodeOffset(
++ as_plfs(dest, addr.base, (int64_t)addr.offset, /*R=*/false)
++ .getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(addr.offset), scratch);
++ return FaultingCodeOffset(as_lfsx(dest, addr.base, scratch).getOffset());
++ }
++ FaultingCodeOffset loadFloat32(const BaseIndex& src, FloatRegister dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(src, scratch);
++ if (is_intN(src.offset, 16)) {
++ return FaultingCodeOffset(as_lfs(dest, scratch, src.offset).getOffset());
++ }
++ Register scratch2 = temps.Acquire();
++ movePtr(ImmWord(src.offset), scratch2);
++ return FaultingCodeOffset(as_lfsx(dest, scratch, scratch2).getOffset());
++ }
++ // Load a FP constant into `dest`.
++ //
++ // +0.0 / +0.0f: `xxlxor dest, dest, dest` (1 insn). No register clobbers.
++ //
++ // POWER9 non-zero: constant pool load via `addpcis r16, hi; lfd/lfs fD,
++ // lo(r16); nop`. 2 real insns + nop, no LR clobber, no Return Address
++ // Stack corruption. lfs auto-expands single-precision to double, so no
++ // separate xscvspdpn step. Clobbers r16 (SavedScratchRegister). Pool
++ // entries are shared across duplicate constants.
++ //
++ // POWER8 non-zero: inline `movePtr + mtvsrd(+xscvspdpn)` path. We do NOT
++ // use the bcl-based pool path on POWER8: bcl clobbers LR and corrupts
++ // the Return Address Stack, which causes catastrophic mispredicts in
++ // hot FP-constant loops (~200x slowdown observed on cmp-bitselect.js).
++ //
++ // Precondition: must not be called inside an `enterNoPool` region when
++ // HasPOWER9() is true (the pool path calls `allocEntry` which asserts
++ // `inhibitPools_ == 0`). Audit-verified that no such call site exists
++ // today; the POWER8 inline path is unaffected.
++ void loadConstantDouble(double dp, FloatRegister dest) {
++ if (mozilla::IsPositiveZero(dp)) {
++ as_xxlxor(dest, dest, dest);
++ return;
++ }
++ if (HasPOWER9()) {
++ loadFromPoolFloat64(dest, dp);
++ return;
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ union {
++ double d;
++ uint64_t u;
++ } u;
++ u.d = dp;
++ movePtr(ImmWord(u.u), scratch);
++ as_mtvsrd(dest, scratch);
++ }
++ void loadConstantFloat32(float f, FloatRegister dest) {
++ if (mozilla::IsPositiveZero(f)) {
++ as_xxlxor(dest, dest, dest);
++ return;
++ }
++ if (HasPOWER9()) {
++ loadFromPoolFloat32(dest, f);
++ return;
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ union {
++ float f;
++ uint32_t u;
++ } u;
++ u.f = f;
++ movePtr(ImmWord(u.u), scratch);
++ x_sldi(scratch, scratch, 32);
++ as_mtvsrd(dest, scratch);
++ as_xscvspdpn(dest, dest);
++ }
++
++ void notBoolean(const ValueOperand& val) {
++ as_xori(val.valueReg(), val.valueReg(), 1);
++ }
++
++ [[nodiscard]] Register extractTag(const Address& address, Register scratch) {
++ loadPtr(address, scratch);
++ x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++ return scratch;
++ }
++ [[nodiscard]] Register extractTag(const BaseIndex& address,
++ Register scratch) {
++ if (scratch == r0) {
++ // r0 cannot be used as a base register in D-form/X-form loads,
++ // so we need a separate temp for the intermediate address.
++ UseScratchRegisterScope temps(*this);
++ Register base = temps.Acquire();
++ computeScaledAddress(address, base);
++ loadPtr(Address(base, address.offset), scratch);
++ } else {
++ // scratch is a pool register (r11/r12) or another GPR that can
++ // serve as a base register, so reuse it for the address computation.
++ computeScaledAddress(address, scratch);
++ loadPtr(Address(scratch, address.offset), scratch);
++ }
++ x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
++ return scratch;
++ }
++ [[nodiscard]] Register extractTag(const ValueOperand& value,
++ Register scratch) {
++ splitTag(value, scratch);
++ return scratch;
++ }
++
++ [[nodiscard]] Register extractObject(const Address& address,
++ Register scratch) {
++ loadPtr(address, scratch);
++ as_rldicl(scratch, scratch, 0, 64 - JSVAL_TAG_SHIFT);
++ return scratch;
++ }
++ [[nodiscard]] Register extractObject(const ValueOperand& value,
++ Register scratch) {
++ unboxObject(value, scratch);
++ return scratch;
++ }
++ [[nodiscard]] Register extractInt32(const ValueOperand& value,
++ Register scratch) {
++ unboxInt32(value, scratch);
++ return scratch;
++ }
++ [[nodiscard]] Register extractString(const ValueOperand& value,
++ Register scratch) {
++ unboxString(value, scratch);
++ return scratch;
++ }
++ [[nodiscard]] Register extractSymbol(const ValueOperand& value,
++ Register scratch) {
++ unboxSymbol(value, scratch);
++ return scratch;
++ }
++ [[nodiscard]] Register extractBoolean(const ValueOperand& value,
++ Register scratch) {
++ unboxBoolean(value, scratch);
++ return scratch;
++ }
++
++ void testObjectSet(Condition cond, const ValueOperand& value, Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ {
++ UseScratchRegisterScope temps(*this);
++ Register tag = temps.Acquire();
++ splitTag(value, tag);
++ uint32_t t = JSVAL_TAG_OBJECT;
++ as_xoris(tag, tag, t >> 16);
++ as_cmplwi(tag, t & 0xFFFF);
++ }
++ ma_cmp_set(dest, cond);
++ }
++ void testUndefinedSet(Condition cond, const ValueOperand& value,
++ Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ {
++ UseScratchRegisterScope temps(*this);
++ Register tag = temps.Acquire();
++ splitTag(value, tag);
++ // Use xoris+cmplwi to compare without a second scratch.
++ uint32_t t = JSVAL_TAG_UNDEFINED;
++ as_xoris(tag, tag, t >> 16);
++ as_cmplwi(tag, t & 0xFFFF);
++ }
++ ma_cmp_set(dest, cond);
++ }
++ void testNullSet(Condition cond, const ValueOperand& value, Register dest) {
++ MOZ_ASSERT(cond == Equal || cond == NotEqual);
++ {
++ UseScratchRegisterScope temps(*this);
++ Register tag = temps.Acquire();
++ splitTag(value, tag);
++ uint32_t t = JSVAL_TAG_NULL;
++ as_xoris(tag, tag, t >> 16);
++ as_cmplwi(tag, t & 0xFFFF);
++ }
++ ma_cmp_set(dest, cond);
++ }
++
++ BufferOffset ret() {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ as_ld(scratch, StackPointer, 0);
++ as_addi(StackPointer, StackPointer, 8);
++ xs_mtlr(scratch);
++ return as_blr();
++ }
++
++ void j(Label* dest) { jump(dest); }
++
++ void getWasmAnyRefGCThingChunk(Register anyref, Register dest) {
++ static_assert(js::gc::ChunkShift == 20);
++ as_rldicr(dest, anyref, 0, 43);
++ }
++
++ template <typename T>
++ void loadUnboxedValue(const T& address, MIRType type, AnyRegister dest) {
++ if (dest.isFloat()) {
++ loadInt32OrDouble(address, dest.fpu());
++ } else {
++ unboxNonDouble(address, dest.gpr(), ValueTypeFromMIRType(type));
++ }
++ }
++
++ void loadInt32OrDouble(const Address& src, FloatRegister dest);
++ void loadInt32OrDouble(const BaseIndex& addr, FloatRegister dest);
++
++ // ===============================================================
++ // Store instructions
++
++ FaultingCodeOffset store8(Register src, const Address& address) {
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_stb(src, address.base, address.offset).getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_stbx(src, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset store8(Register src, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(address, scratch);
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_stb(src, scratch, address.offset).getOffset());
++ }
++ Register scratch2 = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch2);
++ return FaultingCodeOffset(as_stbx(src, scratch, scratch2).getOffset());
++ }
++ void store8(Imm32 imm, const Address& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ store8(scratch, address);
++ }
++ void store8(Imm32 imm, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ store8(scratch, address);
++ }
++
++ FaultingCodeOffset store16(Register src, const Address& address) {
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_sth(src, address.base, address.offset).getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_sthx(src, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset store16(Register src, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(address, scratch);
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_sth(src, scratch, address.offset).getOffset());
++ }
++ Register scratch2 = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch2);
++ return FaultingCodeOffset(as_sthx(src, scratch, scratch2).getOffset());
++ }
++ void store16(Imm32 imm, const Address& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ store16(scratch, address);
++ }
++ void store16(Imm32 imm, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(imm, scratch);
++ store16(scratch, address);
++ }
++ template <typename T>
++ void store16Unaligned(Register src, const T& dest) {
++ store16(src, dest);
++ }
++
++ FaultingCodeOffset store32(Register src, const Address& address) {
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_stw(src, address.base, address.offset).getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_stwx(src, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset store32(Register src, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(address, scratch);
++ if (is_intN(address.offset, 16)) {
++ return FaultingCodeOffset(
++ as_stw(src, scratch, address.offset).getOffset());
++ }
++ Register scratch2 = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch2);
++ return FaultingCodeOffset(as_stwx(src, scratch, scratch2).getOffset());
++ }
++ void store32(Register src, AbsoluteAddress address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)address.addr), scratch);
++ as_stw(src, scratch, 0);
++ }
++ void store32(Imm32 src, const Address& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(src, scratch);
++ store32(scratch, address);
++ }
++ void store32(Imm32 src, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ move32(src, scratch);
++ store32(scratch, address);
++ }
++ template <typename T>
++ void store32Unaligned(Register src, const T& dest) {
++ store32(src, dest);
++ }
++
++ void store64(Imm64 imm, Address address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(imm.value), scratch);
++ storePtr(scratch, address);
++ }
++ void store64(Imm64 imm, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(imm.value), scratch);
++ storePtr(scratch, address);
++ }
++ FaultingCodeOffset store64(Register64 src, Address address) {
++ return storePtr(src.reg, address);
++ }
++ FaultingCodeOffset store64(Register64 src, const BaseIndex& address) {
++ return storePtr(src.reg, address);
++ }
++ template <typename T>
++ void store64Unaligned(Register64 src, const T& dest) {
++ store64(src, dest);
++ }
++
++ template <typename T>
++ void storePtr(ImmWord imm, T address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(imm, scratch);
++ storePtr(scratch, address);
++ }
++ template <typename T>
++ void storePtr(ImmPtr imm, T address) {
++ storePtr(ImmWord(uintptr_t(imm.value)), address);
++ }
++ template <typename T>
++ void storePtr(ImmGCPtr imm, T address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(imm, scratch);
++ storePtr(scratch, address);
++ }
++ void storePtr(Register src, AbsoluteAddress dest) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord((uintptr_t)dest.addr), scratch);
++ as_std(src, scratch, 0);
++ }
++ FaultingCodeOffset storePtr(Register src, const Address& address) {
++ // as_std (DS-form) requires 4-byte aligned offset.
++ if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
++ return FaultingCodeOffset(
++ as_std(src, address.base, address.offset).getOffset());
++ }
++ if (HasPOWER10() && is_intN((intptr_t)address.offset, 34)) {
++ return FaultingCodeOffset(
++ as_pstd(src, address.base, (int64_t)address.offset, /*R=*/false)
++ .getOffset());
++ }
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch);
++ return FaultingCodeOffset(as_stdx(src, address.base, scratch).getOffset());
++ }
++ FaultingCodeOffset storePtr(Register src, const BaseIndex& address) {
++ UseScratchRegisterScope temps(*this);
++ Register scratch = temps.Acquire();
++ computeScaledAddress(address, scratch);
++ if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
++ return FaultingCodeOffset(
++ as_std(src, scratch, address.offset).getOffset());
++ }
++ Register scratch2 = temps.Acquire();
++ movePtr(ImmWord(address.offset), scratch2);
++ return FaultingCodeOffset(as_stdx(src, scratch, scratch2).getOffset());
++ }
++
++ // ===============================================================
++ // Misc
++
++ void handleFailureWithHandlerTail(Label* profilerExitTail, Label* bailoutTail,
++ uint32_t* returnValueCheckOffset);
++
++ inline void incrementInt32Value(const Address& addr);
++
++ void zeroDouble(FloatRegister reg) { as_xxlxor(reg, reg, reg); }
++
++ void writeCodePointer(CodeLabel* label) {
++ label->patchAt()->bind(currentOffset());
++ label->setLinkMode(CodeLabel::RawPointer);
++ m_buffer.ensureSpace(sizeof(void*));
++ writeInst(-1);
++ writeInst(-1);
++ }
++ void writeDataRelocation(const Value& val) {
++ if (val.isGCThing()) {
++ gc::Cell* cell = val.toGCThing();
++ if (cell && gc::IsInsideNursery(cell)) {
++ embedsNurseryPointers_ = true;
++ }
++ dataRelocations_.writeUnsigned(currentOffset());
++ }
++ }
++ void writeDataRelocation(CodeOffset off, const Value& val) {
++ if (val.isGCThing()) {
++ gc::Cell* cell = val.toGCThing();
++ if (cell && gc::IsInsideNursery(cell)) {
++ embedsNurseryPointers_ = true;
++ }
++ dataRelocations_.writeUnsigned(off.offset());
++ }
++ }
++
++ CodeOffset toggledJump(Label* label) {
++ CodeOffset ret(nextOffset().getOffset());
++ jump(label);
++ return ret;
++ }
++ CodeOffset toggledCall(JitCode* target, bool enabled);
++ // 8 instructions for load64 + mtctr + bctrl = 10 instructions total.
++ static size_t ToggledCallSize(uint8_t* code) { return 10 * sizeof(uint32_t); }
++
++ void checkStackAlignment() {}
++
++ static void calculateAlignedStackPointer(void** stackPointer) {
++ *stackPointer = reinterpret_cast<void*>((uintptr_t(*stackPointer)) &
++ ~(ABIStackAlignment - 1));
++ }
++
++ void lea(Operand addr, Register dest) {
++ // x86-ism; on PPC, compute effective address manually.
++ MOZ_CRASH("PPC64: lea not supported; use computeEffectiveAddress");
++ }
++
++ void abiret() { as_blr(); }
++
++ void profilerEnterFrame(Register framePtr, Register scratch);
++ void profilerExitFrame();
++
++ void outOfLineWasmTruncateToInt32Check(
++ FloatRegister input, Register output, MIRType fromType, TruncFlags flags,
++ Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc);
++ void outOfLineWasmTruncateToInt64Check(
++ FloatRegister input, Register64 output, MIRType fromType,
++ TruncFlags flags, Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc);
++
++ void wasmLoadImpl(const wasm::MemoryAccessDesc& access, Register memoryBase,
++ Register ptr, Register ptrScratch, AnyRegister output);
++ void wasmStoreImpl(const wasm::MemoryAccessDesc& access, AnyRegister value,
++ Register memoryBase, Register ptr, Register ptrScratch);
++ void wasmLoadI64Impl(const wasm::MemoryAccessDesc& access,
++ Register memoryBase, Register ptr, Register ptrScratch,
++ Register64 output);
++ void wasmStoreI64Impl(const wasm::MemoryAccessDesc& access, Register64 value,
++ Register memoryBase, Register ptr, Register ptrScratch);
++
++ // Last-byte probing load to enforce wasm-spec atomicity for multi-byte
++ // wasm accesses on POWER ISA. POWER permits unaligned page-spanning
++ // accesses to commit one half before the other half takes a DSI; wasm
++ // requires atomicity. Touching the last byte of the upcoming access
++ // with a 1-byte lbzx triggers SIGSEGV (→ wasm trap via the signal
++ // handler) before the actual access executes — POWER's precise-
++ // interrupt model guarantees the subsequent access is never
++ // architecturally executed if the probe faults.
++ //
++ // Wasm linear memory is one contiguous mapped region followed by an
++ // mprotect'd guard, so last-byte-mapped ⇒ all-bytes-mapped, and a
++ // single-byte probe is sufficient regardless of access size.
++ //
++ // No-op when HasPOWER9() (real POWER9/POWER10 silicon handles page-
++ // spanning unaligned stores atomically at the µarch level), and when
++ // access size is 1. Never called on the atomic path: atomic ops are
++ // naturally aligned per wasm spec + ISA-enforced lwarx alignment, so
++ // they cannot span pages; misaligned atomics take a precise SIGBUS
++ // before any commit.
++ //
++ // 2 instructions when emitted (addi + lbzx).
++ void wasmProbeLastByte(const wasm::MemoryAccessDesc& access,
++ Register memoryBase, Register ptr);
++};
++
++typedef MacroAssemblerPPC64Compat MacroAssemblerSpecific;
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_MacroAssembler_ppc64_h */
+diff --git a/js/src/jit/ppc64/MoveEmitter-ppc64.cpp b/js/src/jit/ppc64/MoveEmitter-ppc64.cpp
+new file mode 100644
+index 000000000000..989d3f61f121
+--- /dev/null
++++ b/js/src/jit/ppc64/MoveEmitter-ppc64.cpp
+@@ -0,0 +1,357 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/MoveEmitter-ppc64.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++void MoveEmitterPPC64::breakCycle(const MoveOperand& from,
++ const MoveOperand& to, MoveOp::Type type,
++ uint32_t slotId) {
++ switch (type) {
++ case MoveOp::FLOAT32:
++ if (to.isMemory()) {
++ ScratchFloat32Scope fpscratch32(masm);
++ masm.loadFloat32(getAdjustedAddress(to), fpscratch32);
++ masm.storeFloat32(fpscratch32, cycleSlot(slotId));
++ } else {
++ masm.storeFloat32(to.floatReg(), cycleSlot(slotId));
++ }
++ break;
++ case MoveOp::DOUBLE:
++ if (to.isMemory()) {
++ ScratchDoubleScope fpscratch64(masm);
++ masm.loadDouble(getAdjustedAddress(to), fpscratch64);
++ masm.storeDouble(fpscratch64, cycleSlot(slotId));
++ } else {
++ masm.storeDouble(to.floatReg(), cycleSlot(slotId));
++ }
++ break;
++ case MoveOp::INT32:
++ if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.load32(getAdjustedAddress(to), scratch);
++ masm.store32(scratch, cycleSlot(0));
++ } else {
++ masm.store32(to.reg(), cycleSlot(0));
++ }
++ break;
++ case MoveOp::GENERAL:
++ if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.loadPtr(getAdjustedAddress(to), scratch);
++ masm.storePtr(scratch, cycleSlot(0));
++ } else {
++ masm.storePtr(to.reg(), cycleSlot(0));
++ }
++ break;
++ case MoveOp::SIMD128:
++ if (to.isMemory()) {
++ ScratchSimd128Scope scratch(masm);
++ masm.loadUnalignedSimd128(getAdjustedAddress(to), scratch);
++ masm.storeUnalignedSimd128(scratch, cycleSlot(slotId));
++ } else {
++ masm.storeUnalignedSimd128(to.floatReg(), cycleSlot(slotId));
++ }
++ break;
++ default:
++ MOZ_CRASH("Unexpected move type");
++ }
++}
++
++void MoveEmitterPPC64::completeCycle(const MoveOperand& from,
++ const MoveOperand& to, MoveOp::Type type,
++ uint32_t slotId) {
++ switch (type) {
++ case MoveOp::FLOAT32:
++ if (to.isMemory()) {
++ ScratchFloat32Scope fpscratch32(masm);
++ masm.loadFloat32(cycleSlot(slotId), fpscratch32);
++ masm.storeFloat32(fpscratch32, getAdjustedAddress(to));
++ } else {
++ masm.loadFloat32(cycleSlot(slotId), to.floatReg());
++ }
++ break;
++ case MoveOp::DOUBLE:
++ if (to.isMemory()) {
++ ScratchDoubleScope fpscratch64(masm);
++ masm.loadDouble(cycleSlot(slotId), fpscratch64);
++ masm.storeDouble(fpscratch64, getAdjustedAddress(to));
++ } else {
++ masm.loadDouble(cycleSlot(slotId), to.floatReg());
++ }
++ break;
++ case MoveOp::INT32:
++ MOZ_ASSERT(slotId == 0);
++ if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.load32(cycleSlot(0), scratch);
++ masm.store32(scratch, getAdjustedAddress(to));
++ } else {
++ masm.load32(cycleSlot(0), to.reg());
++ }
++ break;
++ case MoveOp::GENERAL:
++ MOZ_ASSERT(slotId == 0);
++ if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.loadPtr(cycleSlot(0), scratch);
++ masm.storePtr(scratch, getAdjustedAddress(to));
++ } else {
++ masm.loadPtr(cycleSlot(0), to.reg());
++ }
++ break;
++ case MoveOp::SIMD128:
++ if (to.isMemory()) {
++ ScratchSimd128Scope scratch(masm);
++ masm.loadUnalignedSimd128(cycleSlot(slotId), scratch);
++ masm.storeUnalignedSimd128(scratch, getAdjustedAddress(to));
++ } else {
++ masm.loadUnalignedSimd128(cycleSlot(slotId), to.floatReg());
++ }
++ break;
++ default:
++ MOZ_CRASH("Unexpected move type");
++ }
++}
++
++void MoveEmitterPPC64::emit(const MoveResolver& moves) {
++ if (moves.numCycles()) {
++ // SpillSlotSize must be wide enough for the widest cycled value
++ // (SIMD128 = 16 bytes). The stride below assumes the same. See
++ // Architecture-ppc64.h for the rationale.
++ static_assert(SpillSlotSize == 16);
++ masm.reserveStack(moves.numCycles() * SpillSlotSize);
++ pushedAtCycle_ = masm.framePushed();
++ }
++
++ for (size_t i = 0; i < moves.numMoves(); i++) {
++ emit(moves.getMove(i));
++ }
++}
++
++Address MoveEmitterPPC64::cycleSlot(uint32_t slot, uint32_t subslot) const {
++ int32_t offset = masm.framePushed() - pushedAtCycle_;
++ // Stride must match the per-cycle reservation in emit(); using a
++ // narrower stride causes adjacent SIMD128 slots to overlap.
++ return Address(StackPointer, offset + slot * SpillSlotSize + subslot);
++}
++
++int32_t MoveEmitterPPC64::getAdjustedOffset(const MoveOperand& operand) {
++ MOZ_ASSERT(operand.isMemoryOrEffectiveAddress());
++ if (operand.base() != StackPointer) {
++ return operand.disp();
++ }
++
++ return operand.disp() + masm.framePushed() - pushedAtStart_;
++}
++
++Address MoveEmitterPPC64::getAdjustedAddress(const MoveOperand& operand) {
++ return Address(operand.base(), getAdjustedOffset(operand));
++}
++
++void MoveEmitterPPC64::emitMove(const MoveOperand& from,
++ const MoveOperand& to) {
++ if (from.isGeneralReg()) {
++ if (to.isGeneralReg()) {
++ masm.movePtr(from.reg(), to.reg());
++ } else if (to.isMemory()) {
++ masm.storePtr(from.reg(), getAdjustedAddress(to));
++ } else {
++ MOZ_CRASH("Invalid emitMove arguments.");
++ }
++ } else if (from.isMemory()) {
++ if (to.isGeneralReg()) {
++ masm.loadPtr(getAdjustedAddress(from), to.reg());
++ } else if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.loadPtr(getAdjustedAddress(from), scratch);
++ masm.storePtr(scratch, getAdjustedAddress(to));
++ } else {
++ MOZ_CRASH("Invalid emitMove arguments.");
++ }
++ } else if (from.isEffectiveAddress()) {
++ if (to.isGeneralReg()) {
++ masm.computeEffectiveAddress(getAdjustedAddress(from), to.reg());
++ } else if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(getAdjustedAddress(from), scratch);
++ masm.storePtr(scratch, getAdjustedAddress(to));
++ } else {
++ MOZ_CRASH("Invalid emitMove arguments.");
++ }
++ } else {
++ MOZ_CRASH("Invalid emitMove arguments.");
++ }
++}
++
++void MoveEmitterPPC64::emitInt32Move(const MoveOperand& from,
++ const MoveOperand& to) {
++ if (from.isGeneralReg()) {
++ if (to.isGeneralReg()) {
++ masm.move32(from.reg(), to.reg());
++ } else if (to.isMemory()) {
++ masm.store32(from.reg(), getAdjustedAddress(to));
++ } else {
++ MOZ_CRASH("Invalid emitInt32Move arguments.");
++ }
++ } else if (from.isMemory()) {
++ if (to.isGeneralReg()) {
++ masm.load32(getAdjustedAddress(from), to.reg());
++ } else if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.load32(getAdjustedAddress(from), scratch);
++ masm.store32(scratch, getAdjustedAddress(to));
++ } else {
++ MOZ_CRASH("Invalid emitInt32Move arguments.");
++ }
++ } else if (from.isEffectiveAddress()) {
++ if (to.isGeneralReg()) {
++ masm.computeEffectiveAddress(getAdjustedAddress(from), to.reg());
++ } else if (to.isMemory()) {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.computeEffectiveAddress(getAdjustedAddress(from), scratch);
++ masm.store32(scratch, getAdjustedAddress(to));
++ } else {
++ MOZ_CRASH("Invalid emitInt32Move arguments.");
++ }
++ } else {
++ MOZ_CRASH("Invalid emitInt32Move arguments.");
++ }
++}
++
++void MoveEmitterPPC64::emitFloat32Move(const MoveOperand& from,
++ const MoveOperand& to) {
++ if (from.isFloatReg()) {
++ if (to.isFloatReg()) {
++ masm.moveFloat32(from.floatReg(), to.floatReg());
++ } else {
++ MOZ_ASSERT(to.isMemory());
++ masm.storeFloat32(from.floatReg(), getAdjustedAddress(to));
++ }
++ } else if (to.isFloatReg()) {
++ MOZ_ASSERT(from.isMemory());
++ masm.loadFloat32(getAdjustedAddress(from), to.floatReg());
++ } else {
++ MOZ_ASSERT(from.isMemory());
++ MOZ_ASSERT(to.isMemory());
++ ScratchFloat32Scope fpscratch32(masm);
++ masm.loadFloat32(getAdjustedAddress(from), fpscratch32);
++ masm.storeFloat32(fpscratch32, getAdjustedAddress(to));
++ }
++}
++
++void MoveEmitterPPC64::emitDoubleMove(const MoveOperand& from,
++ const MoveOperand& to) {
++ if (from.isFloatReg()) {
++ if (to.isFloatReg()) {
++ masm.moveDouble(from.floatReg(), to.floatReg());
++ } else if (to.isGeneralReg()) {
++ // FPR -> GPR: use mfvsrd directly.
++ masm.as_mfvsrd(to.reg(), from.floatReg());
++ } else {
++ MOZ_ASSERT(to.isMemory());
++ masm.storeDouble(from.floatReg(), getAdjustedAddress(to));
++ }
++ } else if (to.isFloatReg()) {
++ if (from.isMemory()) {
++ masm.loadDouble(getAdjustedAddress(from), to.floatReg());
++ } else {
++ // GPR -> FPR: use mtvsrd directly.
++ masm.as_mtvsrd(to.floatReg(), from.reg());
++ }
++ } else {
++ MOZ_ASSERT(from.isMemory());
++ MOZ_ASSERT(to.isMemory());
++ ScratchDoubleScope fpscratch64(masm);
++ masm.loadDouble(getAdjustedAddress(from), fpscratch64);
++ masm.storeDouble(fpscratch64, getAdjustedAddress(to));
++ }
++}
++
++void MoveEmitterPPC64::emitSimd128Move(const MoveOperand& from,
++ const MoveOperand& to) {
++ if (from.isFloatReg()) {
++ if (to.isFloatReg()) {
++ masm.moveSimd128(from.floatReg(), to.floatReg());
++ } else {
++ MOZ_ASSERT(to.isMemory());
++ masm.storeUnalignedSimd128(from.floatReg(), getAdjustedAddress(to));
++ }
++ } else if (to.isFloatReg()) {
++ MOZ_ASSERT(from.isMemory());
++ masm.loadUnalignedSimd128(getAdjustedAddress(from), to.floatReg());
++ } else {
++ MOZ_ASSERT(from.isMemory());
++ MOZ_ASSERT(to.isMemory());
++ ScratchSimd128Scope scratch(masm);
++ masm.loadUnalignedSimd128(getAdjustedAddress(from), scratch);
++ masm.storeUnalignedSimd128(scratch, getAdjustedAddress(to));
++ }
++}
++
++void MoveEmitterPPC64::emit(const MoveOp& move) {
++ const MoveOperand& from = move.from();
++ const MoveOperand& to = move.to();
++
++ if (move.isCycleEnd() && move.isCycleBegin()) {
++ breakCycle(from, to, move.endCycleType(), move.cycleBeginSlot());
++ completeCycle(from, to, move.type(), move.cycleEndSlot());
++ return;
++ }
++
++ if (move.isCycleEnd()) {
++ MOZ_ASSERT(inCycle_);
++ completeCycle(from, to, move.type(), move.cycleEndSlot());
++ MOZ_ASSERT(inCycle_ > 0);
++ inCycle_--;
++ return;
++ }
++
++ if (move.isCycleBegin()) {
++ breakCycle(from, to, move.endCycleType(), move.cycleBeginSlot());
++ inCycle_++;
++ }
++
++ switch (move.type()) {
++ case MoveOp::FLOAT32:
++ emitFloat32Move(from, to);
++ break;
++ case MoveOp::DOUBLE:
++ emitDoubleMove(from, to);
++ break;
++ case MoveOp::SIMD128:
++ emitSimd128Move(from, to);
++ break;
++ case MoveOp::INT32:
++ emitInt32Move(from, to);
++ break;
++ case MoveOp::GENERAL:
++ emitMove(from, to);
++ break;
++ default:
++ MOZ_CRASH("Unexpected move type");
++ }
++}
++
++void MoveEmitterPPC64::assertDone() { MOZ_ASSERT(inCycle_ == 0); }
++
++void MoveEmitterPPC64::finish() {
++ assertDone();
++
++ masm.freeStack(masm.framePushed() - pushedAtStart_);
++}
+diff --git a/js/src/jit/ppc64/MoveEmitter-ppc64.h b/js/src/jit/ppc64/MoveEmitter-ppc64.h
+new file mode 100644
+index 000000000000..a9faa34de6bb
+--- /dev/null
++++ b/js/src/jit/ppc64/MoveEmitter-ppc64.h
+@@ -0,0 +1,64 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_MoveEmitter_ppc64_h
++#define jit_ppc64_MoveEmitter_ppc64_h
++
++#include "jit/MacroAssembler.h"
++#include "jit/MoveResolver.h"
++
++namespace js {
++namespace jit {
++
++class MoveEmitterPPC64 {
++ void emitDoubleMove(const MoveOperand& from, const MoveOperand& to);
++ void emitSimd128Move(const MoveOperand& from, const MoveOperand& to);
++ void breakCycle(const MoveOperand& from, const MoveOperand& to,
++ MoveOp::Type type, uint32_t slot);
++ void completeCycle(const MoveOperand& from, const MoveOperand& to,
++ MoveOp::Type type, uint32_t slot);
++
++ protected:
++ uint32_t inCycle_;
++ MacroAssembler& masm;
++
++ uint32_t pushedAtStart_;
++
++ int32_t pushedAtCycle_;
++
++ void assertDone();
++ Address cycleSlot(uint32_t slot, uint32_t subslot = 0) const;
++ int32_t getAdjustedOffset(const MoveOperand& operand);
++ Address getAdjustedAddress(const MoveOperand& operand);
++
++ void emitMove(const MoveOperand& from, const MoveOperand& to);
++ void emitInt32Move(const MoveOperand& from, const MoveOperand& to);
++ void emitFloat32Move(const MoveOperand& from, const MoveOperand& to);
++ void emit(const MoveOp& move);
++
++ public:
++ explicit MoveEmitterPPC64(MacroAssembler& masm)
++ : inCycle_(0),
++ masm(masm),
++ pushedAtStart_(masm.framePushed()),
++ pushedAtCycle_(-1) {}
++
++ ~MoveEmitterPPC64() { assertDone(); }
++
++ void emit(const MoveResolver& moves);
++ void finish();
++ // setScratchRegister is part of the cross-arch MoveEmitter interface
++ // but we never spill, so there's no scratch to set. No-op kept for
++ // shared-code compatibility.
++ void setScratchRegister(Register reg) {}
++};
++
++typedef MoveEmitterPPC64 MoveEmitter;
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_MoveEmitter_ppc64_h */
+diff --git a/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h b/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
+new file mode 100644
+index 000000000000..aa874dfd6732
+--- /dev/null
++++ b/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
+@@ -0,0 +1,83 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_SharedICHelpers_ppc64_inl_h
++#define jit_ppc64_SharedICHelpers_ppc64_inl_h
++
++#include "jit/BaselineFrame.h"
++#include "jit/SharedICHelpers.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++namespace js {
++namespace jit {
++
++inline void EmitBaselineTailCallVM(TrampolinePtr target, MacroAssembler& masm,
++ uint32_t argSize) {
++#ifdef DEBUG
++ Register scratch = R2.scratchReg();
++
++ // Compute frame size.
++ masm.movePtr(FramePointer, scratch);
++ masm.subPtr(StackPointer, scratch);
++
++ // Store frame size without VMFunction arguments for debug assertions.
++ masm.subPtr(Imm32(argSize), scratch);
++ Address frameSizeAddr(FramePointer,
++ BaselineFrame::reverseOffsetOfDebugFrameSize());
++ masm.store32(scratch, frameSizeAddr);
++ masm.addPtr(Imm32(argSize), scratch);
++#endif
++
++ // Push frame descriptor and perform the tail call.
++ masm.push(FrameDescriptor(FrameType::BaselineJS));
++
++ // The return address is in LR (set by the original bl/bctrl call).
++ // The VMWrapper code will push it via pushReturnAddress().
++
++ masm.jump(target);
++}
++
++inline void EmitBaselineCallVM(TrampolinePtr target, MacroAssembler& masm) {
++ masm.push(FrameDescriptor(FrameType::BaselineStub));
++ masm.call(target);
++}
++
++inline void EmitBaselineEnterStubFrame(MacroAssembler& masm, Register scratch) {
++ MOZ_ASSERT(scratch != ICTailCallReg);
++
++#ifdef DEBUG
++ // Compute frame size.
++ masm.movePtr(FramePointer, scratch);
++ masm.subPtr(StackPointer, scratch);
++
++ Address frameSizeAddr(FramePointer,
++ BaselineFrame::reverseOffsetOfDebugFrameSize());
++ masm.store32(scratch, frameSizeAddr);
++#endif
++
++ // Note: when making changes here, don't forget to update
++ // BaselineStubFrame if needed.
++
++ // Push frame descriptor and return address.
++ // LR holds the return address; read it into ICTailCallReg to push.
++ masm.Push(FrameDescriptor(FrameType::BaselineJS));
++ masm.xs_mflr(ICTailCallReg);
++ masm.Push(ICTailCallReg);
++
++ // Save old frame pointer, stack pointer and stub reg.
++ masm.Push(FramePointer);
++ masm.movePtr(StackPointer, FramePointer);
++ masm.Push(ICStubReg);
++
++ // Stack should remain aligned.
++ masm.assertStackAlignment(sizeof(Value), 0);
++}
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_SharedICHelpers_ppc64_inl_h */
+diff --git a/js/src/jit/ppc64/SharedICHelpers-ppc64.h b/js/src/jit/ppc64/SharedICHelpers-ppc64.h
+new file mode 100644
+index 000000000000..31ba830d2609
+--- /dev/null
++++ b/js/src/jit/ppc64/SharedICHelpers-ppc64.h
+@@ -0,0 +1,97 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_SharedICHelpers_ppc64_h
++#define jit_ppc64_SharedICHelpers_ppc64_h
++
++#include "jit/BaselineIC.h"
++#include "jit/JitFrames.h"
++#include "jit/MacroAssembler.h"
++#include "jit/SharedICRegisters.h"
++
++namespace js {
++namespace jit {
++
++// Distance from sp to the top Value inside an IC stub (no return address on
++// the stack on PPC64).
++static const size_t ICStackValueOffset = 0;
++
++struct BaselineStubFrame {
++ uintptr_t savedFrame;
++ uintptr_t savedStub;
++ uintptr_t returnAddress;
++ uintptr_t descriptor;
++};
++
++inline void EmitRestoreTailCallReg(MacroAssembler& masm) {
++ // On PPC64, LR always holds the return address after a bl/bctrl call.
++ // No-op: LR is the hardware link register, not a GPR on the stack.
++}
++
++inline void EmitRepushTailCallReg(MacroAssembler& masm) {
++ // No-op: LR already holds the return address.
++}
++
++inline void EmitCallIC(MacroAssembler& masm, CodeOffset* callOffset) {
++ // The stub pointer must already be in ICStubReg.
++ // Load stubcode pointer from the ICStub.
++ // R2 won't be active when we call ICs, so we can use it as scratch.
++ masm.loadPtr(Address(ICStubReg, ICStub::offsetOfStubCode()), R2.scratchReg());
++
++ // Call the stubcode. On PPC64 call(Register) emits mtctr + bctrl,
++ // which sets LR to the address after bctrl.
++ masm.call(R2.scratchReg());
++ *callOffset = CodeOffset(masm.currentOffset());
++}
++
++inline void EmitReturnFromIC(MacroAssembler& masm) {
++ // Return via hardware LR (set by the original bl/bctrl call).
++ masm.as_blr();
++}
++
++inline void EmitBaselineLeaveStubFrame(MacroAssembler& masm) {
++ masm.loadPtr(
++ Address(FramePointer, BaselineStubFrameLayout::ICStubOffsetFromFP),
++ ICStubReg);
++
++ masm.movePtr(FramePointer, StackPointer);
++ masm.Pop(FramePointer);
++
++ // Load the return address and restore it to LR.
++ masm.Pop(ICTailCallReg);
++ masm.xs_mtlr(ICTailCallReg);
++
++ // Discard the frame descriptor.
++ {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.Pop(scratch);
++ }
++}
++
++template <typename AddrType>
++inline void EmitPreBarrier(MacroAssembler& masm, const AddrType& addr,
++ MIRType type) {
++ // On PPC64, LR is clobbered by guardedCallPreBarrier. Save it first.
++ masm.xs_mflr(r0);
++ masm.push(r0);
++ masm.guardedCallPreBarrier(addr, type);
++ masm.pop(r0);
++ masm.xs_mtlr(r0);
++}
++
++inline void EmitStubGuardFailure(MacroAssembler& masm) {
++ // Load next stub into ICStubReg.
++ masm.loadPtr(Address(ICStubReg, ICCacheIRStub::offsetOfNext()), ICStubReg);
++
++ // Return address is in LR. Jump to the next stubcode.
++ masm.jump(Address(ICStubReg, ICStub::offsetOfStubCode()));
++}
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_SharedICHelpers_ppc64_h */
+diff --git a/js/src/jit/ppc64/SharedICRegisters-ppc64.h b/js/src/jit/ppc64/SharedICRegisters-ppc64.h
+new file mode 100644
+index 000000000000..ddf67342f855
+--- /dev/null
++++ b/js/src/jit/ppc64/SharedICRegisters-ppc64.h
+@@ -0,0 +1,46 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_SharedICRegisters_ppc64_h
++#define jit_ppc64_SharedICRegisters_ppc64_h
++
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "jit/Registers.h"
++#include "jit/RegisterSets.h"
++
++namespace js {
++namespace jit {
++
++// ValueOperands R0, R1, and R2.
++// R0 == JSReturnReg, and R2 uses registers not preserved across calls. R1 value
++// should be preserved across calls.
++static constexpr ValueOperand R0(r5);
++static constexpr ValueOperand R1(r15);
++static constexpr ValueOperand R2(r4);
++
++// ICTailCallReg and ICStubReg.
++// On PPC64, LR is not a GPR, so ICTailCallReg must be a normal GPR.
++// PPC64 ELFv2 has no volatile non-arg GPRs (r3-r10 are all arg regs), so
++// using an arg register risks clobbering by ABI calls with enough arguments.
++// We use callee-saved registers instead, matching MIPS64/RISC-V strategy.
++// These are excluded from BaselineICAvailableGeneralRegs.
++static constexpr Register ICTailCallReg = r27;
++static constexpr Register ICStubReg = r26;
++
++// FloatReg0 must be equal to ReturnFloatReg.
++static constexpr FloatRegister FloatReg0 = {FloatRegisters::f1,
++ FloatRegisters::Double};
++static constexpr FloatRegister FloatReg1 = {FloatRegisters::f2,
++ FloatRegisters::Double};
++static constexpr FloatRegister FloatReg2 = {FloatRegisters::f3,
++ FloatRegisters::Double};
++static constexpr FloatRegister FloatReg3 = {FloatRegisters::f4,
++ FloatRegisters::Double};
++
++} // namespace jit
++} // namespace js
++
++#endif /* jit_ppc64_SharedICRegisters_ppc64_h */
+diff --git a/js/src/jit/ppc64/Simulator-ppc64.cpp b/js/src/jit/ppc64/Simulator-ppc64.cpp
+new file mode 100644
+index 000000000000..8b29eb3add04
+--- /dev/null
++++ b/js/src/jit/ppc64/Simulator-ppc64.cpp
+@@ -0,0 +1,7296 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/ppc64/Simulator-ppc64.h"
++
++#include <cinttypes>
++#include <cmath>
++#include <cstring>
++#include <float.h>
++#include <limits>
++
++#include "jit/AtomicOperations.h"
++#include "jit/ppc64/Assembler-ppc64.h"
++#include "js/Conversions.h"
++#include "threading/LockGuard.h"
++#include "vm/Float16.h"
++#include "vm/JSContext.h"
++#include "vm/Runtime.h"
++#include "wasm/WasmInstance.h"
++#include "wasm/WasmSignalHandlers.h"
++
++#define I8(v) static_cast<int8_t>(v)
++#define I16(v) static_cast<int16_t>(v)
++#define U16(v) static_cast<uint16_t>(v)
++#define I32(v) static_cast<int32_t>(v)
++#define U32(v) static_cast<uint32_t>(v)
++#define I64(v) static_cast<int64_t>(v)
++#define U64(v) static_cast<uint64_t>(v)
++#define I128(v) static_cast<__int128_t>(v)
++#define U128(v) static_cast<__uint128_t>(v)
++
++namespace js {
++namespace jit {
++
++static int64_t MultiplyHighSigned(int64_t u, int64_t v) {
++ uint64_t u0, v0, w0;
++ int64_t u1, v1, w1, w2, t;
++
++ u0 = u & 0xFFFFFFFFL;
++ u1 = u >> 32;
++ v0 = v & 0xFFFFFFFFL;
++ v1 = v >> 32;
++
++ w0 = u0 * v0;
++ t = u1 * v0 + (w0 >> 32);
++ w1 = t & 0xFFFFFFFFL;
++ w2 = t >> 32;
++ w1 = u0 * v1 + w1;
++
++ return u1 * v1 + w2 + (w1 >> 32);
++}
++
++static uint64_t MultiplyHighUnsigned(uint64_t u, uint64_t v) {
++ uint64_t u0, v0, w0;
++ uint64_t u1, v1, w1, w2, t;
++
++ u0 = u & 0xFFFFFFFFL;
++ u1 = u >> 32;
++ v0 = v & 0xFFFFFFFFL;
++ v1 = v >> 32;
++
++ w0 = u0 * v0;
++ t = u1 * v0 + (w0 >> 32);
++ w1 = t & 0xFFFFFFFFL;
++ w2 = t >> 32;
++ w1 = u0 * v1 + w1;
++
++ return u1 * v1 + w2 + (w1 >> 32);
++}
++
++inline constexpr uint32_t RotateLeft32(uint32_t value, uint32_t shift) {
++ return (value << shift) | (value >> ((32 - shift) & 31));
++}
++
++inline constexpr uint64_t RotateLeft64(uint64_t value, uint64_t shift) {
++ return (value << shift) | (value >> ((64 - shift) & 63));
++}
++
++// Generate a 64-bit mask with bits mb..me set (PPC numbering: 0 = MSB = bit
++// 63 in C). When mb <= me, a contiguous range is set; when mb > me, the
++// mask wraps around (bits 0..me and mb..63 are set).
++static inline uint64_t MASK64(unsigned mb, unsigned me) {
++ MOZ_ASSERT(mb < 64 && me < 64);
++ uint64_t mask_begin = ~0ULL >> mb;
++ uint64_t mask_end = ~0ULL << (63 - me);
++ if (mb <= me) {
++ return mask_begin & mask_end;
++ }
++ return mask_begin | mask_end;
++}
++
++static inline uint32_t MASK32(unsigned mb, unsigned me) {
++ MOZ_ASSERT(mb < 32 && me < 32);
++ uint32_t mask_begin = ~0U >> mb;
++ uint32_t mask_end = ~0U << (31 - me);
++ if (mb <= me) {
++ return mask_begin & mask_end;
++ }
++ return mask_begin | mask_end;
++}
++
++// Count leading zeros.
++static inline int CountLeadingZeros64(uint64_t value) {
++ if (value == 0) return 64;
++ return __builtin_clzll(value);
++}
++
++static inline int CountLeadingZeros32(uint32_t value) {
++ if (value == 0) return 32;
++ return __builtin_clz(value);
++}
++
++static inline int CountTrailingZeros64(uint64_t value) {
++ if (value == 0) return 64;
++ return __builtin_ctzll(value);
++}
++
++static inline int CountTrailingZeros32(uint32_t value) {
++ if (value == 0) return 32;
++ return __builtin_ctz(value);
++}
++
++static inline int PopCount64(uint64_t value) {
++ return __builtin_popcountll(value);
++}
++
++static inline int PopCount32(uint32_t value) {
++ return __builtin_popcount(value);
++}
++
++static inline uint64_t PopCountPerByte(uint64_t value) {
++ uint64_t result = 0;
++ for (int i = 0; i < 8; i++) {
++ uint8_t byte = (value >> (i * 8)) & 0xFF;
++ result |= (uint64_t)__builtin_popcount(byte) << (i * 8);
++ }
++ return result;
++}
++
++// PPC64 C argument slots: PPC64 ELFv2 ABI does not require C argument
++// slots on the stack for register-passed arguments, but we reserve the
++// link area (32 bytes).
++const int kCArgSlotCount = 0;
++const int kCArgsSlotsSize = kCArgSlotCount * sizeof(uintptr_t);
++
++// -----------------------------------------------------------------------------
++// PPC64 SimInstruction.
++
++class SimInstruction {
++ public:
++ enum {
++ kInstrSize = 4,
++ kPCReadOffset = 0
++ };
++
++ inline Instr instructionBits() const {
++ return *reinterpret_cast<const Instr*>(this);
++ }
++
++ inline void setInstructionBits(Instr value) {
++ *reinterpret_cast<Instr*>(this) = value;
++ }
++
++ inline int bit(int nr) const { return (instructionBits() >> nr) & 1; }
++
++ inline uint32_t bits(int hi, int lo) const {
++ return (instructionBits() >> lo) & ((2U << (hi - lo)) - 1);
++ }
++
++ inline uint32_t opcode() const { return bits(31, 26); }
++
++ inline uint32_t rtValue() const { return bits(25, 21); }
++ inline uint32_t rsValue() const { return bits(25, 21); }
++ inline uint32_t raValue() const { return bits(20, 16); }
++ inline uint32_t rbValue() const { return bits(15, 11); }
++ inline uint32_t rcValue() const { return bits(10, 6); }
++
++ inline uint32_t boValue() const { return bits(25, 21); }
++ inline uint32_t biValue() const { return bits(20, 16); }
++
++ // D-form 16-bit immediate (sign-extend to get signed value).
++ inline int16_t imm16Value() const { return I16(bits(15, 0)); }
++ inline uint16_t uimm16Value() const { return U16(bits(15, 0)); }
++
++ // DS-form 14-bit displacement (bits 2..15, 4-byte aligned).
++ inline int16_t ds14Value() const {
++ return I16(bits(15, 2) << 2);
++ }
++
++ // B-form 14-bit branch displacement (bits 2..15, 4-byte aligned).
++ inline int32_t bd16Value() const {
++ int16_t raw = I16(bits(15, 2) << 2);
++ return (int32_t)raw;
++ }
++
++ // I-form 24-bit branch offset (bits 2..25, sign-extended, 4-byte aligned).
++ inline int32_t li26Value() const {
++ int32_t raw = I32(bits(25, 2) << 2);
++ // Sign-extend from 26 bits.
++ return (raw << 6) >> 6;
++ }
++
++ // Extended opcode for X-form / XO-form (bits 1..10).
++ inline uint32_t xoValue() const { return bits(10, 1); }
++
++ // Extended opcode for XL-form (bits 1..10).
++ inline uint32_t xlValue() const { return bits(10, 1); }
++
++ // MD-form SH field: sh[0:4] in instruction bits 15:11, sh[5] in bit 1.
++ // Assembler encodes: ((sh & 0x1f) << 11) | ((sh & 0x20) >> 4).
++ inline uint32_t mdSHValue() const {
++ return bits(15, 11) | (bit(1) << 5);
++ }
++ // mb/me for MD-form (rldicl/rldicr/rldic/rldimi): 6-bit field split as
++ // mb[0:4] in instruction bits 10:6 and mb[5] in bit 5.
++ inline uint32_t mdMBValue() const {
++ return bits(10, 6) | (bit(5) << 5);
++ }
++ inline uint32_t mdMEValue() const { return mdMBValue(); }
++
++ // MD-form XO (bits 2..4).
++ inline uint32_t mdXOValue() const { return bits(4, 2); }
++
++ // MDS-form (rldcl, rldcr): mb[0:4] in bits 10:6, mb[5] in bit 5.
++ inline uint32_t mdsMBValue() const {
++ return bits(10, 6) | (bit(5) << 5);
++ }
++
++ // M-form fields (32-bit rotate/mask).
++ inline uint32_t mSHValue() const { return bits(15, 11); }
++ inline uint32_t mMBValue() const { return bits(10, 6); }
++ inline uint32_t mMEValue() const { return bits(5, 1); }
++
++ // Rc bit.
++ inline bool rcBit() const { return bit(0); }
++
++ // AA bit for branch instructions.
++ inline bool aaBit() const { return bit(1); }
++
++ // LK bit for branch instructions.
++ inline bool lkBit() const { return bit(0); }
++
++ // OE bit for XO-form arithmetic.
++ inline bool oeBit() const { return bit(10); }
++
++ // L bit for compare instructions (bit 21).
++ inline bool lBit() const { return bit(21); }
++
++ // BF field (bits 23..25) for compares.
++ inline uint32_t bfValue() const { return bits(25, 23); }
++
++ bool isTrap() const {
++ uint32_t instr = instructionBits();
++ // PPC_trap = 0x7FE00008 (tw 31,0,0).
++ // Don't treat the call-redirection instruction or wasm trap as a
++ // debugger trap.
++ if (instr == kCallRedirInstr) return false;
++ if (instr == 0x7FE00008) return false;
++ // Any other tw instruction with TO=31 is a trap.
++ if (opcode() == 31 && (xoValue() == 4)) return true;
++ return false;
++ }
++
++ private:
++ SimInstruction() = delete;
++ SimInstruction(const SimInstruction& other) = delete;
++ void operator=(const SimInstruction& other) = delete;
++};
++
++// -----------------------------------------------------------------------------
++// ICache.
++
++class CachePage {
++ public:
++ static const int LINE_VALID = 0;
++ static const int LINE_INVALID = 1;
++
++ static const int kPageShift = 12;
++ static const int kPageSize = 1 << kPageShift;
++ static const int kPageMask = kPageSize - 1;
++ static const int kLineShift = 2;
++ static const int kLineLength = 1 << kLineShift;
++ static const int kLineMask = kLineLength - 1;
++
++ CachePage() { memset(&validity_map_, LINE_INVALID, sizeof(validity_map_)); }
++
++ char* validityByte(int offset) {
++ return &validity_map_[offset >> kLineShift];
++ }
++
++ char* cachedData(int offset) { return &data_[offset]; }
++
++ private:
++ char data_[kPageSize];
++ static const int kValidityMapSize = kPageSize >> kLineShift;
++ char validity_map_[kValidityMapSize];
++};
++
++class AutoLockSimulatorCache : public LockGuard<Mutex> {
++ using Base = LockGuard<Mutex>;
++
++ public:
++ explicit AutoLockSimulatorCache()
++ : Base(SimulatorProcess::singleton_->cacheLock_) {}
++};
++
++mozilla::Atomic<size_t, mozilla::ReleaseAcquire>
++ SimulatorProcess::ICacheCheckingDisableCount(1);
++SimulatorProcess* SimulatorProcess::singleton_ = nullptr;
++
++int64_t Simulator::StopSimAt = -1;
++
++// -----------------------------------------------------------------------------
++// Simulator Create / Destroy.
++
++Simulator* Simulator::Create() {
++ auto sim = MakeUnique<Simulator>();
++ if (!sim) {
++ return nullptr;
++ }
++
++ if (!sim->init()) {
++ return nullptr;
++ }
++
++ int64_t stopAt;
++ char* stopAtStr = getenv("PPC64_SIM_STOP_AT");
++ if (stopAtStr && sscanf(stopAtStr, "%" PRIi64, &stopAt) == 1) {
++ fprintf(stderr, "\nStopping simulation at icount %" PRIi64 "\n", stopAt);
++ Simulator::StopSimAt = stopAt;
++ }
++
++ return sim.release();
++}
++
++void Simulator::Destroy(Simulator* sim) { js_delete(sim); }
++
++// -----------------------------------------------------------------------------
++// Debugger.
++
++class ppc64Debugger {
++ public:
++ explicit ppc64Debugger(Simulator* sim) : sim_(sim) {}
++
++ void stop(SimInstruction* instr);
++ void debug();
++ void printAllRegs();
++ void printAllRegsIncludingFPU();
++
++ private:
++ static const Instr kBreakpointInstr = 0x7FE00008; // PPC_trap
++ static const Instr kNopInstr = 0x60000000; // PPC_nop
++
++ Simulator* sim_;
++
++ int64_t getRegisterValue(int regnum);
++ int64_t getFPURegisterValueLong(int regnum);
++ float getFPURegisterValueFloat(int regnum);
++ double getFPURegisterValueDouble(int regnum);
++ bool getValue(const char* desc, int64_t* value);
++
++ bool setBreakpoint(SimInstruction* breakpc);
++ bool deleteBreakpoint(SimInstruction* breakpc);
++
++ void undoBreakpoints();
++ void redoBreakpoints();
++};
++
++[[maybe_unused]] static void UNIMPLEMENTED() {
++ printf("UNIMPLEMENTED instruction.\n");
++ MOZ_CRASH();
++}
++[[maybe_unused]] static void UNREACHABLE() {
++ printf("UNREACHABLE instruction.\n");
++ MOZ_CRASH();
++}
++[[maybe_unused]] static void UNSUPPORTED() {
++ printf("Unsupported instruction.\n");
++ MOZ_CRASH();
++}
++
++void ppc64Debugger::stop(SimInstruction* instr) {
++ uint32_t code = 0;
++ char* msg = *reinterpret_cast<char**>(sim_->get_pc() +
++ SimInstruction::kInstrSize);
++ if (!sim_->watchedStops_[code].desc_) {
++ sim_->watchedStops_[code].desc_ = msg;
++ }
++ if (code != kMaxStopCode) {
++ printf("Simulator hit stop %u: %s\n", code, msg);
++ } else {
++ printf("Simulator hit %s\n", msg);
++ }
++ sim_->set_pc(sim_->get_pc() + 2 * SimInstruction::kInstrSize);
++ debug();
++}
++
++int64_t ppc64Debugger::getRegisterValue(int regnum) {
++ if (regnum == kPCRegister) {
++ return sim_->get_pc();
++ }
++ return sim_->getRegister(regnum);
++}
++
++int64_t ppc64Debugger::getFPURegisterValueLong(int regnum) {
++ return sim_->getFpuRegister(regnum);
++}
++
++float ppc64Debugger::getFPURegisterValueFloat(int regnum) {
++ return sim_->getFpuRegisterFloat(regnum);
++}
++
++double ppc64Debugger::getFPURegisterValueDouble(int regnum) {
++ return sim_->getFpuRegisterDouble(regnum);
++}
++
++bool ppc64Debugger::getValue(const char* desc, int64_t* value) {
++ Register reg = Register::FromName(desc);
++ if (reg != InvalidReg) {
++ *value = getRegisterValue(reg.code());
++ return true;
++ }
++
++ if (strncmp(desc, "0x", 2) == 0) {
++ return sscanf(desc + 2, "%" PRIx64, reinterpret_cast<uint64_t*>(value)) ==
++ 1;
++ }
++ return sscanf(desc, "%" PRIu64, reinterpret_cast<uint64_t*>(value)) == 1;
++}
++
++bool ppc64Debugger::setBreakpoint(SimInstruction* breakpc) {
++ if (sim_->break_pc_ != nullptr) {
++ return false;
++ }
++
++ sim_->break_pc_ = breakpc;
++ sim_->break_instr_ = breakpc->instructionBits();
++ return true;
++}
++
++bool ppc64Debugger::deleteBreakpoint(SimInstruction* breakpc) {
++ if (sim_->break_pc_ != nullptr) {
++ sim_->break_pc_->setInstructionBits(sim_->break_instr_);
++ }
++
++ sim_->break_pc_ = nullptr;
++ sim_->break_instr_ = 0;
++ return true;
++}
++
++void ppc64Debugger::undoBreakpoints() {
++ if (sim_->break_pc_) {
++ sim_->break_pc_->setInstructionBits(sim_->break_instr_);
++ }
++}
++
++void ppc64Debugger::redoBreakpoints() {
++ if (sim_->break_pc_) {
++ sim_->break_pc_->setInstructionBits(kBreakpointInstr);
++ }
++}
++
++void ppc64Debugger::printAllRegs() {
++ int64_t value;
++ for (uint32_t i = 0; i < Registers::Total; i++) {
++ value = getRegisterValue(i);
++ printf("%3s: 0x%016" PRIx64 " %20" PRIi64 " ", Registers::GetName(i),
++ value, value);
++
++ if (i % 2) {
++ printf("\n");
++ }
++ }
++ printf("\n");
++
++ value = getRegisterValue(Simulator::pc);
++ printf(" pc: 0x%016" PRIx64 "\n", value);
++ printf(" lr: 0x%016" PRIx64 "\n", sim_->getLR());
++ printf(" ctr: 0x%016" PRIx64 "\n", sim_->getCTR());
++ printf(" cr: 0x%08x\n", sim_->getCR());
++ printf(" xer: 0x%016" PRIx64 "\n", sim_->getXER());
++}
++
++void ppc64Debugger::printAllRegsIncludingFPU() {
++ printAllRegs();
++
++ printf("\n\n");
++ for (uint32_t i = 0; i < FloatRegisters::TotalPhys; i++) {
++ printf("%3s: 0x%016" PRIx64 "\tflt: %-8.4g\tdbl: %-16.4g\n",
++ FloatRegisters::GetName(i), getFPURegisterValueLong(i),
++ getFPURegisterValueFloat(i), getFPURegisterValueDouble(i));
++ }
++}
++
++static char* ReadLine(const char* prompt) {
++ UniqueChars result;
++ char lineBuf[256];
++ int offset = 0;
++ bool keepGoing = true;
++ fprintf(stdout, "%s", prompt);
++ fflush(stdout);
++ while (keepGoing) {
++ if (fgets(lineBuf, sizeof(lineBuf), stdin) == nullptr) {
++ return nullptr;
++ }
++ int len = strlen(lineBuf);
++ if (len > 0 && lineBuf[len - 1] == '\n') {
++ keepGoing = false;
++ }
++ if (!result) {
++ result.reset(js_pod_malloc<char>(len + 1));
++ if (!result) {
++ return nullptr;
++ }
++ } else {
++ int new_len = offset + len + 1;
++ char* new_result = js_pod_malloc<char>(new_len);
++ if (!new_result) {
++ return nullptr;
++ }
++ memcpy(new_result, result.get(), offset * sizeof(char));
++ result.reset(new_result);
++ }
++ memcpy(result.get() + offset, lineBuf, len * sizeof(char));
++ offset += len;
++ }
++
++ MOZ_ASSERT(result);
++ result[offset] = '\0';
++ return result.release();
++}
++
++static void DisassembleInstruction(uint64_t pc) {
++ printf(" 0x%016" PRIx64 ": %08x\n", pc,
++ *reinterpret_cast<uint32_t*>(pc));
++}
++
++void ppc64Debugger::debug() {
++ intptr_t lastPC = -1;
++ bool done = false;
++
++#define COMMAND_SIZE 63
++#define ARG_SIZE 255
++
++#define STR(a) #a
++#define XSTR(a) STR(a)
++
++ char cmd[COMMAND_SIZE + 1];
++ char arg1[ARG_SIZE + 1];
++ char arg2[ARG_SIZE + 1];
++ char* argv[3] = {cmd, arg1, arg2};
++
++ cmd[COMMAND_SIZE] = 0;
++ arg1[ARG_SIZE] = 0;
++ arg2[ARG_SIZE] = 0;
++
++ undoBreakpoints();
++
++ while (!done && (sim_->get_pc() != Simulator::end_sim_pc)) {
++ if (lastPC != sim_->get_pc()) {
++ DisassembleInstruction(sim_->get_pc());
++ lastPC = sim_->get_pc();
++ }
++ char* line = ReadLine("sim> ");
++ if (line == nullptr) {
++ break;
++ } else {
++ char* last_input = sim_->lastDebuggerInput();
++ if (strcmp(line, "\n") == 0 && last_input != nullptr) {
++ line = last_input;
++ } else {
++ sim_->setLastDebuggerInput(line);
++ }
++ int argc = sscanf(line,
++ "%" XSTR(COMMAND_SIZE) "s "
++ "%" XSTR(ARG_SIZE) "s "
++ "%" XSTR(ARG_SIZE) "s",
++ cmd, arg1, arg2);
++ if ((strcmp(cmd, "si") == 0) || (strcmp(cmd, "stepi") == 0)) {
++ SimInstruction* instr =
++ reinterpret_cast<SimInstruction*>(sim_->get_pc());
++ if (!instr->isTrap()) {
++ sim_->instructionDecode(instr);
++ } else {
++ printf("/!\\ Jumping over generated breakpoint.\n");
++ sim_->set_pc(sim_->get_pc() + SimInstruction::kInstrSize);
++ }
++ sim_->icount_++;
++ } else if ((strcmp(cmd, "c") == 0) || (strcmp(cmd, "cont") == 0)) {
++ sim_->instructionDecode(
++ reinterpret_cast<SimInstruction*>(sim_->get_pc()));
++ sim_->icount_++;
++ done = true;
++ } else if ((strcmp(cmd, "p") == 0) || (strcmp(cmd, "print") == 0)) {
++ if (argc == 2) {
++ int64_t value;
++ if (strcmp(arg1, "all") == 0) {
++ printAllRegs();
++ } else if (strcmp(arg1, "allf") == 0) {
++ printAllRegsIncludingFPU();
++ } else {
++ Register reg = Register::FromName(arg1);
++ FloatRegisters::Code fReg = FloatRegisters::FromName(arg1);
++ if (reg != InvalidReg) {
++ value = getRegisterValue(reg.code());
++ printf("%s: 0x%016" PRIx64 " %20" PRIi64 " \n", arg1, value,
++ value);
++ } else if (fReg != FloatRegisters::Invalid) {
++ printf("%3s: 0x%016" PRIx64 "\tflt: %-8.4g\tdbl: %-16.4g\n",
++ FloatRegisters::GetName(fReg),
++ getFPURegisterValueLong(fReg),
++ getFPURegisterValueFloat(fReg),
++ getFPURegisterValueDouble(fReg));
++ } else {
++ printf("%s unrecognized\n", arg1);
++ }
++ }
++ } else {
++ printf("print <register> or print <fpu register> single\n");
++ }
++ } else if (strcmp(cmd, "stack") == 0 || strcmp(cmd, "mem") == 0) {
++ int64_t* cur = nullptr;
++ int64_t* end = nullptr;
++ int next_arg = 1;
++
++ if (strcmp(cmd, "stack") == 0) {
++ cur = reinterpret_cast<int64_t*>(sim_->getRegister(Simulator::sp));
++ } else {
++ int64_t value;
++ if (!getValue(arg1, &value)) {
++ printf("%s unrecognized\n", arg1);
++ continue;
++ }
++ cur = reinterpret_cast<int64_t*>(value);
++ next_arg++;
++ }
++
++ int64_t words;
++ if (argc == next_arg) {
++ words = 10;
++ } else {
++ if (!getValue(argv[next_arg], &words)) {
++ words = 10;
++ }
++ }
++ end = cur + words;
++
++ while (cur < end) {
++ printf(" %p: 0x%016" PRIx64 " %20" PRIi64, cur, *cur, *cur);
++ printf("\n");
++ cur++;
++ }
++
++ } else if ((strcmp(cmd, "disasm") == 0) || (strcmp(cmd, "dpc") == 0) ||
++ (strcmp(cmd, "di") == 0)) {
++ uint8_t* cur = nullptr;
++ uint8_t* end = nullptr;
++
++ if (argc == 1) {
++ cur = reinterpret_cast<uint8_t*>(sim_->get_pc());
++ end = cur + (10 * SimInstruction::kInstrSize);
++ } else if (argc == 2) {
++ Register reg = Register::FromName(arg1);
++ if (reg != InvalidReg || strncmp(arg1, "0x", 2) == 0) {
++ int64_t value;
++ if (getValue(arg1, &value)) {
++ cur = reinterpret_cast<uint8_t*>(value);
++ end = cur + (10 * SimInstruction::kInstrSize);
++ }
++ } else {
++ int64_t value;
++ if (getValue(arg1, &value)) {
++ cur = reinterpret_cast<uint8_t*>(sim_->get_pc());
++ end = cur + (value * SimInstruction::kInstrSize);
++ }
++ }
++ } else {
++ int64_t value1;
++ int64_t value2;
++ if (getValue(arg1, &value1) && getValue(arg2, &value2)) {
++ cur = reinterpret_cast<uint8_t*>(value1);
++ end = cur + (value2 * SimInstruction::kInstrSize);
++ }
++ }
++
++ while (cur < end) {
++ DisassembleInstruction(uint64_t(cur));
++ cur += SimInstruction::kInstrSize;
++ }
++ } else if (strcmp(cmd, "gdb") == 0) {
++ printf("relinquishing control to gdb\n");
++#if defined(__x86_64__)
++ asm("int $3");
++#elif defined(__aarch64__)
++ asm("brk #0xf000");
++#endif
++ printf("regaining control from gdb\n");
++ } else if (strcmp(cmd, "break") == 0) {
++ if (argc == 2) {
++ int64_t value;
++ if (getValue(arg1, &value)) {
++ if (!setBreakpoint(reinterpret_cast<SimInstruction*>(value))) {
++ printf("setting breakpoint failed\n");
++ }
++ } else {
++ printf("%s unrecognized\n", arg1);
++ }
++ } else {
++ printf("break <address>\n");
++ }
++ } else if (strcmp(cmd, "del") == 0) {
++ if (!deleteBreakpoint(nullptr)) {
++ printf("deleting breakpoint failed\n");
++ }
++ } else if (strcmp(cmd, "flags") == 0) {
++ printf("CR: 0x%08x XER: 0x%016" PRIx64 "\n", sim_->getCR(),
++ sim_->getXER());
++ } else if (strcmp(cmd, "stop") == 0) {
++ int64_t value;
++ intptr_t stop_pc = sim_->get_pc() - 2 * SimInstruction::kInstrSize;
++ SimInstruction* stop_instr =
++ reinterpret_cast<SimInstruction*>(stop_pc);
++ SimInstruction* msg_address = reinterpret_cast<SimInstruction*>(
++ stop_pc + SimInstruction::kInstrSize);
++ if ((argc == 2) && (strcmp(arg1, "unstop") == 0)) {
++ if (sim_->isStopInstruction(stop_instr)) {
++ stop_instr->setInstructionBits(kNopInstr);
++ msg_address->setInstructionBits(kNopInstr);
++ } else {
++ printf("Not at debugger stop.\n");
++ }
++ } else if (argc == 3) {
++ if (strcmp(arg1, "info") == 0) {
++ if (strcmp(arg2, "all") == 0) {
++ printf("Stop information:\n");
++ for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
++ i++) {
++ sim_->printStopInfo(i);
++ }
++ } else if (getValue(arg2, &value)) {
++ sim_->printStopInfo(value);
++ } else {
++ printf("Unrecognized argument.\n");
++ }
++ } else if (strcmp(arg1, "enable") == 0) {
++ if (strcmp(arg2, "all") == 0) {
++ for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
++ i++) {
++ sim_->enableStop(i);
++ }
++ } else if (getValue(arg2, &value)) {
++ sim_->enableStop(value);
++ } else {
++ printf("Unrecognized argument.\n");
++ }
++ } else if (strcmp(arg1, "disable") == 0) {
++ if (strcmp(arg2, "all") == 0) {
++ for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
++ i++) {
++ sim_->disableStop(i);
++ }
++ } else if (getValue(arg2, &value)) {
++ sim_->disableStop(value);
++ } else {
++ printf("Unrecognized argument.\n");
++ }
++ }
++ } else {
++ printf("Wrong usage. Use help command for more information.\n");
++ }
++ } else if ((strcmp(cmd, "h") == 0) || (strcmp(cmd, "help") == 0)) {
++ printf("cont\n");
++ printf(" continue execution (alias 'c')\n");
++ printf("stepi\n");
++ printf(" step one instruction (alias 'si')\n");
++ printf("print <register>\n");
++ printf(" print register content (alias 'p')\n");
++ printf(" use register name 'all' to print all registers\n");
++ printf("stack [<words>]\n");
++ printf(" dump stack content, default dump 10 words)\n");
++ printf("mem <address> [<words>]\n");
++ printf(" dump memory content, default dump 10 words)\n");
++ printf("flags\n");
++ printf(" print CR and XER\n");
++ printf("disasm [<instructions>]\n");
++ printf("disasm [<address/register>]\n");
++ printf("disasm [[<address/register>] <instructions>]\n");
++ printf(" disassemble code, default is 10 instructions\n");
++ printf(" from pc (alias 'di')\n");
++ printf("gdb\n");
++ printf(" enter gdb\n");
++ printf("break <address>\n");
++ printf(" set a break point on the address\n");
++ printf("del\n");
++ printf(" delete the breakpoint\n");
++ } else {
++ printf("Unknown command: %s\n", cmd);
++ }
++ }
++ }
++
++ redoBreakpoints();
++
++#undef COMMAND_SIZE
++#undef ARG_SIZE
++
++#undef STR
++#undef XSTR
++}
++
++// -----------------------------------------------------------------------------
++// ICache helpers.
++
++static bool AllOnOnePage(uintptr_t start, int size) {
++ intptr_t start_page = (start & ~CachePage::kPageMask);
++ intptr_t end_page = ((start + size) & ~CachePage::kPageMask);
++ return start_page == end_page;
++}
++
++void Simulator::setLastDebuggerInput(char* input) {
++ js_free(lastDebuggerInput_);
++ lastDebuggerInput_ = input;
++}
++
++static CachePage* GetCachePageLocked(SimulatorProcess::ICacheMap& i_cache,
++ void* page) {
++ SimulatorProcess::ICacheMap::AddPtr p = i_cache.lookupForAdd(page);
++ if (p) {
++ return p->value();
++ }
++ AutoEnterOOMUnsafeRegion oomUnsafe;
++ CachePage* new_page = js_new<CachePage>();
++ if (!new_page || !i_cache.add(p, page, new_page)) {
++ oomUnsafe.crash("Simulator CachePage");
++ }
++ return new_page;
++}
++
++static void FlushOnePageLocked(SimulatorProcess::ICacheMap& i_cache,
++ intptr_t start, int size) {
++ MOZ_ASSERT(size <= CachePage::kPageSize);
++ MOZ_ASSERT(AllOnOnePage(start, size - 1));
++ MOZ_ASSERT((start & CachePage::kLineMask) == 0);
++ MOZ_ASSERT((size & CachePage::kLineMask) == 0);
++ void* page = reinterpret_cast<void*>(start & (~CachePage::kPageMask));
++ int offset = (start & CachePage::kPageMask);
++ CachePage* cache_page = GetCachePageLocked(i_cache, page);
++ char* valid_bytemap = cache_page->validityByte(offset);
++ memset(valid_bytemap, CachePage::LINE_INVALID, size >> CachePage::kLineShift);
++}
++
++static void FlushICacheLocked(SimulatorProcess::ICacheMap& i_cache,
++ void* start_addr, size_t size) {
++ intptr_t start = reinterpret_cast<intptr_t>(start_addr);
++ int intra_line = (start & CachePage::kLineMask);
++ start -= intra_line;
++ size += intra_line;
++ size = ((size - 1) | CachePage::kLineMask) + 1;
++ int offset = (start & CachePage::kPageMask);
++ while (!AllOnOnePage(start, size - 1)) {
++ int bytes_to_flush = CachePage::kPageSize - offset;
++ FlushOnePageLocked(i_cache, start, bytes_to_flush);
++ start += bytes_to_flush;
++ size -= bytes_to_flush;
++ MOZ_ASSERT((start & CachePage::kPageMask) == 0);
++ offset = 0;
++ }
++ if (size != 0) {
++ FlushOnePageLocked(i_cache, start, size);
++ }
++}
++
++/* static */
++void SimulatorProcess::checkICacheLocked(SimInstruction* instr) {
++ intptr_t address = reinterpret_cast<intptr_t>(instr);
++ void* page = reinterpret_cast<void*>(address & (~CachePage::kPageMask));
++ void* line = reinterpret_cast<void*>(address & (~CachePage::kLineMask));
++ int offset = (address & CachePage::kPageMask);
++ CachePage* cache_page = GetCachePageLocked(icache(), page);
++ char* cache_valid_byte = cache_page->validityByte(offset);
++ bool cache_hit = (*cache_valid_byte == CachePage::LINE_VALID);
++ char* cached_line = cache_page->cachedData(offset & ~CachePage::kLineMask);
++
++ if (cache_hit) {
++ mozilla::DebugOnly<int> cmpret =
++ memcmp(reinterpret_cast<void*>(instr), cache_page->cachedData(offset),
++ SimInstruction::kInstrSize);
++ MOZ_ASSERT(cmpret == 0);
++ } else {
++ memcpy(cached_line, line, CachePage::kLineLength);
++ *cache_valid_byte = CachePage::LINE_VALID;
++ }
++}
++
++HashNumber SimulatorProcess::ICacheHasher::hash(const Lookup& l) {
++ return U32(reinterpret_cast<uintptr_t>(l)) >> 2;
++}
++
++bool SimulatorProcess::ICacheHasher::match(const Key& k, const Lookup& l) {
++ MOZ_ASSERT((reinterpret_cast<intptr_t>(k) & CachePage::kPageMask) == 0);
++ MOZ_ASSERT((reinterpret_cast<intptr_t>(l) & CachePage::kPageMask) == 0);
++ return k == l;
++}
++
++/* static */
++void SimulatorProcess::FlushICache(void* start_addr, size_t size) {
++ if (!ICacheCheckingDisableCount) {
++ AutoLockSimulatorCache als;
++ js::jit::FlushICacheLocked(icache(), start_addr, size);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// Redirection.
++
++class Redirection {
++ friend class SimulatorProcess;
++
++ Redirection(void* nativeFunction, ABIFunctionType type)
++ : nativeFunction_(nativeFunction),
++ swiInstruction_(kCallRedirInstr),
++ type_(type),
++ next_(nullptr) {
++ next_ = SimulatorProcess::redirection();
++ if (!SimulatorProcess::ICacheCheckingDisableCount) {
++ FlushICacheLocked(SimulatorProcess::icache(), addressOfSwiInstruction(),
++ SimInstruction::kInstrSize);
++ }
++ SimulatorProcess::setRedirection(this);
++ }
++
++ public:
++ void* addressOfSwiInstruction() { return &swiInstruction_; }
++ void* nativeFunction() const { return nativeFunction_; }
++ ABIFunctionType type() const { return type_; }
++
++ static Redirection* Get(void* nativeFunction, ABIFunctionType type) {
++ AutoLockSimulatorCache als;
++
++ Redirection* current = SimulatorProcess::redirection();
++ for (; current != nullptr; current = current->next_) {
++ if (current->nativeFunction_ == nativeFunction) {
++ MOZ_ASSERT(current->type() == type);
++ return current;
++ }
++ }
++
++ AutoEnterOOMUnsafeRegion oomUnsafe;
++ Redirection* redir = js_pod_malloc<Redirection>(1);
++ if (!redir) {
++ oomUnsafe.crash("Simulator redirection");
++ }
++ new (redir) Redirection(nativeFunction, type);
++ return redir;
++ }
++
++ static Redirection* FromSwiInstruction(SimInstruction* swiInstruction) {
++ uint8_t* addrOfSwi = reinterpret_cast<uint8_t*>(swiInstruction);
++ uint8_t* addrOfRedirection =
++ addrOfSwi - offsetof(Redirection, swiInstruction_);
++ return reinterpret_cast<Redirection*>(addrOfRedirection);
++ }
++
++ private:
++ void* nativeFunction_;
++ uint32_t swiInstruction_;
++ ABIFunctionType type_;
++ Redirection* next_;
++};
++
++// -----------------------------------------------------------------------------
++// Simulator constructor / destructor / init.
++
++Simulator::Simulator() {
++ stack_ = nullptr;
++ stackLimit_ = 0;
++ pc_modified_ = false;
++ icount_ = 0;
++ break_count_ = 0;
++ break_pc_ = nullptr;
++ break_instr_ = 0;
++ single_stepping_ = false;
++ single_step_callback_ = nullptr;
++ single_step_callback_arg_ = nullptr;
++
++ for (int i = 0; i < Register::kNumSimuRegisters; i++) {
++ registers_[i] = 0;
++ }
++ for (int i = 0; i < Simulator::FPURegister::kNumFPURegisters; i++) {
++ FPUregisters_[i] = 0;
++ }
++
++ LR_ = 0;
++ CTR_ = 0;
++ CR_ = 0;
++ XER_ = 0;
++ FPSCR_ = 0;
++ LLBit_ = false;
++ LLAddr_ = 0;
++ lastLLValue_ = 0;
++
++ // Initialize PC and LR to a known bad value that will cause an
++ // access violation if the simulator ever tries to execute it.
++ registers_[pc] = bad_ra;
++ LR_ = bad_ra;
++
++ lastDebuggerInput_ = nullptr;
++}
++
++bool Simulator::init() {
++ static const size_t stackSize = 2 * 1024 * 1024;
++ stack_ = js_pod_malloc<char>(stackSize);
++ if (!stack_) {
++ return false;
++ }
++
++ // Leave a safety margin of 1MB to prevent overrunning the stack.
++ stackLimit_ = reinterpret_cast<uintptr_t>(stack_) + 1024 * 1024;
++
++ // The sp is initialized to point to the bottom (high address) of the
++ // allocated stack area.
++ registers_[sp] = reinterpret_cast<int64_t>(stack_) + stackSize - 64;
++
++ // Zero-initialize VR namespace. Simulated PPC64 does not guarantee any
++ // value in VRs at entry, but zeroing avoids uninitialized-read false
++ // positives in tools and makes regression traces deterministic.
++ memset(VRregisters_, 0, sizeof(VRregisters_));
++
++ return true;
++}
++
++Simulator::~Simulator() { js_free(stack_); }
++
++SimulatorProcess::SimulatorProcess()
++ : cacheLock_(mutexid::SimulatorCacheLock), redirection_(nullptr) {
++ if (getenv("PPC64_SIM_ICACHE_CHECKS")) {
++ ICacheCheckingDisableCount = 0;
++ }
++}
++
++SimulatorProcess::~SimulatorProcess() {
++ Redirection* r = redirection_;
++ while (r) {
++ Redirection* next = r->next_;
++ js_delete(r);
++ r = next;
++ }
++}
++
++/* static */
++void* Simulator::RedirectNativeFunction(void* nativeFunction,
++ ABIFunctionType type) {
++ Redirection* redirection = Redirection::Get(nativeFunction, type);
++ return redirection->addressOfSwiInstruction();
++}
++
++Simulator* Simulator::Current() {
++ JSContext* cx = TlsContext.get();
++ MOZ_ASSERT(CurrentThreadCanAccessRuntime(cx->runtime()));
++ return cx->simulator();
++}
++
++// -----------------------------------------------------------------------------
++// Register accessors.
++
++void Simulator::setRegister(int reg, int64_t value) {
++ MOZ_ASSERT((reg >= 0) && (reg < Register::kNumSimuRegisters));
++ if (reg == pc) {
++ pc_modified_ = true;
++ }
++ registers_[reg] = value;
++}
++
++int64_t Simulator::getRegister(int reg) const {
++ MOZ_ASSERT((reg >= 0) && (reg < Register::kNumSimuRegisters));
++ return registers_[reg] + ((reg == pc) ? SimInstruction::kPCReadOffset : 0);
++}
++
++void Simulator::setFpuRegister(int fpureg, int64_t value) {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ FPUregisters_[fpureg] = value;
++}
++
++void Simulator::setFpuRegisterWord(int fpureg, int32_t value) {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ int32_t* pword;
++ pword = reinterpret_cast<int32_t*>(&FPUregisters_[fpureg]);
++ *pword = value;
++}
++
++// Promote f32 → f64 preserving NaN payload, like PPC64's `lfs` and
++// `xscvspdpn`. The plain C cast `(double)f32_nan` is permitted by the
++// standard to quiet a signaling NaN, which on x86/ARM hosts visibly
++// transforms 0x7FA00000 (sNaN) into a qNaN such as 0x7FE00000 — breaking
++// every wasm test that loads a constant sNaN bit pattern. Manually
++// reconstruct the f64 NaN with the same sign + payload (payload shifted
++// left by 29 to fill the wider mantissa).
++static double promoteFloatPreservingNaN(float f) {
++ uint32_t bits;
++ memcpy(&bits, &f, sizeof(bits));
++ if ((bits & 0x7F800000u) == 0x7F800000u && (bits & 0x007FFFFFu) != 0u) {
++ uint64_t sign = uint64_t(bits >> 31) & 1u;
++ uint64_t payload = uint64_t(bits & 0x007FFFFFu);
++ uint64_t dbits = (sign << 63) | (uint64_t(0x7FFu) << 52) | (payload << 29);
++ double d;
++ memcpy(&d, &dbits, sizeof(d));
++ return d;
++ }
++ return (double)f;
++}
++
++// Demote f64 → f32 preserving NaN payload (non-signaling: matches PPC64
++// `stfs` / `xscvdpspn`, and wasm `lfs`-equivalent stores). Truncates the
++// lower 29 bits of the f64 payload (those bits cannot be represented in
++// the narrower f32 mantissa); if the truncation would yield a payload of
++// zero (which would degrade the NaN to an Infinity), force the LSB so
++// the result is still a NaN. This intentionally does NOT set the quiet
++// bit — that's the job of the explicit-quieting op `xscvdpsp` and
++// f32.demote_f64's wasm-level lowering.
++static float demoteDoublePreservingNaN(double d) {
++ uint64_t bits;
++ memcpy(&bits, &d, sizeof(bits));
++ if ((bits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
++ (bits & 0x000FFFFFFFFFFFFFULL) != 0) {
++ uint32_t sign = uint32_t(bits >> 63) & 1u;
++ uint32_t payload = uint32_t((bits >> 29) & 0x007FFFFFu);
++ if (payload == 0) payload = 1;
++ uint32_t fbits = (sign << 31) | 0x7F800000u | payload;
++ float f;
++ memcpy(&f, &fbits, sizeof(f));
++ return f;
++ }
++ return (float)d;
++}
++
++void Simulator::setFpuRegisterFloat(int fpureg, float value) {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ // ELFv2 ABI: single-precision values in FPRs are stored as their
++ // double-precision representation. Promote and store the full 8 bytes,
++ // not just the low 4. (Otherwise the upper 4 bytes are stale, matching
++ // the layout that fctid/fcfid/lfd would read but NOT what the JIT and
++ // the C ABI expect for a 'float' parameter.) Use the NaN-preserving
++ // helper so a signaling-NaN return value isn't quieted into a qNaN.
++ double promoted = promoteFloatPreservingNaN(value);
++ memcpy(&FPUregisters_[fpureg], &promoted, sizeof(promoted));
++}
++
++void Simulator::setFpuRegisterDouble(int fpureg, double value) {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ *mozilla::BitwiseCast<double*>(&FPUregisters_[fpureg]) = value;
++}
++
++int64_t Simulator::getFpuRegister(int fpureg) const {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ return FPUregisters_[fpureg];
++}
++
++int32_t Simulator::getFpuRegisterWord(int fpureg) const {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ return *mozilla::BitwiseCast<int32_t*>(&FPUregisters_[fpureg]);
++}
++
++int32_t Simulator::getFpuRegisterSignedWord(int fpureg) const {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ return *mozilla::BitwiseCast<int32_t*>(&FPUregisters_[fpureg]);
++}
++
++float Simulator::getFpuRegisterFloat(int fpureg) const {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ // ELFv2 ABI: single-precision values are passed/returned in FPRs as their
++ // double-precision representation. Read the full 8 bytes as double, then
++ // narrow to float — matching the `frsp` the C callee would do, and matching
++ // what real PPC64 hardware sees when the FPR was loaded via `lfs`. Use the
++ // NaN-preserving helper so a signaling-NaN parameter isn't quieted.
++ double promoted;
++ memcpy(&promoted, &FPUregisters_[fpureg], sizeof(promoted));
++ return demoteDoublePreservingNaN(promoted);
++}
++
++double Simulator::getFpuRegisterDouble(int fpureg) const {
++ MOZ_ASSERT((fpureg >= 0) &&
++ (fpureg < Simulator::FPURegister::kNumFPURegisters));
++ return *mozilla::BitwiseCast<double*>(&FPUregisters_[fpureg]);
++}
++
++void Simulator::setVRBytes(int vreg, const uint8_t bytes[16]) {
++ MOZ_ASSERT((vreg >= 0) && (vreg < kNumVRRegisters));
++ memcpy(VRregisters_[vreg], bytes, 16);
++}
++
++void Simulator::getVRBytes(int vreg, uint8_t bytes[16]) const {
++ MOZ_ASSERT((vreg >= 0) && (vreg < kNumVRRegisters));
++ memcpy(bytes, VRregisters_[vreg], 16);
++}
++
++void Simulator::getVSR128(int vsr, uint8_t bytes[16]) const {
++ MOZ_ASSERT((vsr >= 0) && (vsr < kNumFPURegisters + kNumVRRegisters));
++ if (vsr < kNumFPURegisters) {
++ // VSR 0-31: FPR view. The FPR scalar lives in BE DW0 of the VSR,
++ // which on PPC64LE register storage maps to LE bytes 8-15.
++ // DW1 is undefined per ISA; we model it as zero.
++ // `lfd f0,(mem); xxlor <vr>,f0,f0; stxvx <vr>,...` writes the
++ // double's 8 bytes to the HIGH half of the 16-byte store (LE
++ // bytes 8-15).
++ int64_t val = FPUregisters_[vsr];
++ memset(bytes, 0, 8);
++ memcpy(bytes + 8, &val, 8);
++ } else {
++ memcpy(bytes, VRregisters_[vsr - kNumFPURegisters], 16);
++ }
++}
++
++void Simulator::setVSR128(int vsr, const uint8_t bytes[16]) {
++ MOZ_ASSERT((vsr >= 0) && (vsr < kNumFPURegisters + kNumVRRegisters));
++ if (vsr < kNumFPURegisters) {
++ // FPR scalar at BE DW0 = LE bytes 8-15. DW1 is architecturally
++ // discarded on VSR-to-FPR writes.
++ int64_t val;
++ memcpy(&val, bytes + 8, 8);
++ FPUregisters_[vsr] = val;
++ } else {
++ memcpy(VRregisters_[vsr - kNumFPURegisters], bytes, 16);
++ }
++}
++
++void Simulator::setCallResultDouble(double result) {
++ setFpuRegisterDouble(Simulator::f1, result);
++}
++
++void Simulator::setCallResultFloat(float result) {
++ setFpuRegisterFloat(Simulator::f1, result);
++}
++
++void Simulator::setCallResult(int64_t res) { setRegister(r3, res); }
++
++#ifdef XP_DARWIN
++void Simulator::setCallResult(intptr_t res) {
++ setRegister(r3, I64(res));
++}
++#endif
++
++void Simulator::setCallResult(__int128 res) {
++ setRegister(r3, I64(res));
++ setRegister(r4, I64(res >> 64));
++}
++
++void Simulator::set_pc(int64_t value) {
++ pc_modified_ = true;
++ registers_[pc] = value;
++}
++
++bool Simulator::has_bad_pc() const {
++ return ((registers_[pc] == bad_ra) || (registers_[pc] == end_sim_pc));
++}
++
++int64_t Simulator::get_pc() const { return registers_[pc]; }
++
++JS::ProfilingFrameIterator::RegisterState Simulator::registerState() {
++ wasm::RegisterState state;
++ state.pc = (void*)get_pc();
++ state.fp = (void*)getRegister(fp);
++ state.sp = (void*)getRegister(sp);
++ state.lr = (void*)getLR();
++ return state;
++}
++
++// -----------------------------------------------------------------------------
++// Memory access helpers.
++
++uint8_t Simulator::readBU(uint64_t addr) {
++ if (handleWasmSegFault(addr, 1)) {
++ return 0xff;
++ }
++ uint8_t* ptr = reinterpret_cast<uint8_t*>(addr);
++ return *ptr;
++}
++
++int8_t Simulator::readB(uint64_t addr) {
++ if (handleWasmSegFault(addr, 1)) {
++ return -1;
++ }
++ int8_t* ptr = reinterpret_cast<int8_t*>(addr);
++ return *ptr;
++}
++
++void Simulator::writeB(uint64_t addr, uint8_t value) {
++ if (handleWasmSegFault(addr, 1)) {
++ return;
++ }
++ uint8_t* ptr = reinterpret_cast<uint8_t*>(addr);
++ *ptr = value;
++}
++
++void Simulator::writeB(uint64_t addr, int8_t value) {
++ if (handleWasmSegFault(addr, 1)) {
++ return;
++ }
++ int8_t* ptr = reinterpret_cast<int8_t*>(addr);
++ *ptr = value;
++}
++
++uint16_t Simulator::readHU(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 2)) {
++ return 0xffff;
++ }
++ uint16_t* ptr = reinterpret_cast<uint16_t*>(addr);
++ return *ptr;
++}
++
++int16_t Simulator::readH(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 2)) {
++ return -1;
++ }
++ int16_t* ptr = reinterpret_cast<int16_t*>(addr);
++ return *ptr;
++}
++
++void Simulator::writeH(uint64_t addr, uint16_t value, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 2)) {
++ return;
++ }
++ uint16_t* ptr = reinterpret_cast<uint16_t*>(addr);
++ LLBit_ = false;
++ *ptr = value;
++}
++
++void Simulator::writeH(uint64_t addr, int16_t value, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 2)) {
++ return;
++ }
++ int16_t* ptr = reinterpret_cast<int16_t*>(addr);
++ LLBit_ = false;
++ *ptr = value;
++}
++
++uint32_t Simulator::readWU(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 4)) {
++ return -1;
++ }
++ uint32_t* ptr = reinterpret_cast<uint32_t*>(addr);
++ return *ptr;
++}
++
++int32_t Simulator::readW(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 4)) {
++ return -1;
++ }
++ int32_t* ptr = reinterpret_cast<int32_t*>(addr);
++ return *ptr;
++}
++
++void Simulator::writeW(uint64_t addr, uint32_t value, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 4)) {
++ return;
++ }
++ uint32_t* ptr = reinterpret_cast<uint32_t*>(addr);
++ LLBit_ = false;
++ *ptr = value;
++}
++
++void Simulator::writeW(uint64_t addr, int32_t value, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 4)) {
++ return;
++ }
++ int32_t* ptr = reinterpret_cast<int32_t*>(addr);
++ LLBit_ = false;
++ *ptr = value;
++}
++
++int64_t Simulator::readDW(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 8)) {
++ return -1;
++ }
++ int64_t* ptr = reinterpret_cast<int64_t*>(addr);
++ return *ptr;
++}
++
++void Simulator::writeDW(uint64_t addr, int64_t value, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 8)) {
++ return;
++ }
++ int64_t* ptr = reinterpret_cast<int64_t*>(addr);
++ LLBit_ = false;
++ *ptr = value;
++}
++
++double Simulator::readD(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 8)) {
++ return NAN;
++ }
++ double* ptr = reinterpret_cast<double*>(addr);
++ return *ptr;
++}
++
++void Simulator::writeD(uint64_t addr, double value, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 8)) {
++ return;
++ }
++ double* ptr = reinterpret_cast<double*>(addr);
++ LLBit_ = false;
++ *ptr = value;
++}
++
++// Byte-wide load-reserve / store-conditional (lbarx / stbcx.).
++// Byte accesses have no alignment requirement.
++uint8_t Simulator::loadLinkedB(uint64_t addr, SimInstruction* instr) {
++ if (handleWasmSegFault(addr, 1)) {
++ return 0;
++ }
++ volatile uint8_t* ptr = reinterpret_cast<volatile uint8_t*>(addr);
++ uint8_t value = *ptr;
++ lastLLValue_ = value;
++ LLAddr_ = addr;
++ LLBit_ = true;
++ return value;
++}
++
++int Simulator::storeConditionalB(uint64_t addr, uint8_t value,
++ SimInstruction* instr) {
++ if (addr != LLAddr_) {
++ printf("stbcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++ ", expected: 0x%016" PRIxPTR "\n",
++ addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++ MOZ_CRASH();
++ }
++ SharedMem<uint8_t*> ptr =
++ SharedMem<uint8_t*>::shared(reinterpret_cast<uint8_t*>(addr));
++ if (!LLBit_) {
++ return 0;
++ }
++ LLBit_ = false;
++ LLAddr_ = 0;
++ uint8_t expected = uint8_t(lastLLValue_);
++ uint8_t old =
++ AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++ return (old == expected) ? 1 : 0;
++}
++
++// Halfword-wide load-reserve / store-conditional (lharx / sthcx.).
++// 2-byte aligned per ISA.
++uint16_t Simulator::loadLinkedH(uint64_t addr, SimInstruction* instr) {
++ if ((addr & 1) == 0) {
++ if (handleWasmSegFault(addr, 2)) {
++ return 0;
++ }
++ volatile uint16_t* ptr = reinterpret_cast<volatile uint16_t*>(addr);
++ uint16_t value = *ptr;
++ lastLLValue_ = value;
++ LLAddr_ = addr;
++ LLBit_ = true;
++ return value;
++ }
++ printf("Unaligned lharx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++ reinterpret_cast<intptr_t>(instr));
++ MOZ_CRASH();
++ return 0;
++}
++
++int Simulator::storeConditionalH(uint64_t addr, uint16_t value,
++ SimInstruction* instr) {
++ if (addr != LLAddr_) {
++ printf("sthcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++ ", expected: 0x%016" PRIxPTR "\n",
++ addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++ MOZ_CRASH();
++ }
++ if ((addr & 1) == 0) {
++ SharedMem<uint16_t*> ptr =
++ SharedMem<uint16_t*>::shared(reinterpret_cast<uint16_t*>(addr));
++ if (!LLBit_) {
++ return 0;
++ }
++ LLBit_ = false;
++ LLAddr_ = 0;
++ uint16_t expected = uint16_t(lastLLValue_);
++ uint16_t old =
++ AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++ return (old == expected) ? 1 : 0;
++ }
++ printf("Unaligned sthcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++ reinterpret_cast<intptr_t>(instr));
++ MOZ_CRASH();
++ return 0;
++}
++
++int32_t Simulator::loadLinkedW(uint64_t addr, SimInstruction* instr) {
++ if ((addr & 3) == 0) {
++ if (handleWasmSegFault(addr, 4)) {
++ return -1;
++ }
++
++ volatile int32_t* ptr = reinterpret_cast<volatile int32_t*>(addr);
++ int32_t value = *ptr;
++ lastLLValue_ = value;
++ LLAddr_ = addr;
++ LLBit_ = true;
++ return value;
++ }
++ printf("Unaligned lwarx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++ reinterpret_cast<intptr_t>(instr));
++ MOZ_CRASH();
++ return 0;
++}
++
++int Simulator::storeConditionalW(uint64_t addr, int32_t value,
++ SimInstruction* instr) {
++ if (addr != LLAddr_) {
++ printf("stwcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++ ", expected: 0x%016" PRIxPTR "\n",
++ addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++ MOZ_CRASH();
++ }
++
++ if ((addr & 3) == 0) {
++ SharedMem<int32_t*> ptr =
++ SharedMem<int32_t*>::shared(reinterpret_cast<int32_t*>(addr));
++
++ if (!LLBit_) {
++ return 0;
++ }
++
++ LLBit_ = false;
++ LLAddr_ = 0;
++ int32_t expected = int32_t(lastLLValue_);
++ int32_t old =
++ AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++ return (old == expected) ? 1 : 0;
++ }
++ printf("Unaligned stwcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++ reinterpret_cast<intptr_t>(instr));
++ MOZ_CRASH();
++ return 0;
++}
++
++int64_t Simulator::loadLinkedD(uint64_t addr, SimInstruction* instr) {
++ if ((addr & kPointerAlignmentMask) == 0) {
++ if (handleWasmSegFault(addr, 8)) {
++ return -1;
++ }
++
++ volatile int64_t* ptr = reinterpret_cast<volatile int64_t*>(addr);
++ int64_t value = *ptr;
++ lastLLValue_ = value;
++ LLAddr_ = addr;
++ LLBit_ = true;
++ return value;
++ }
++ printf("Unaligned ldarx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++ reinterpret_cast<intptr_t>(instr));
++ MOZ_CRASH();
++ return 0;
++}
++
++int Simulator::storeConditionalD(uint64_t addr, int64_t value,
++ SimInstruction* instr) {
++ if (addr != LLAddr_) {
++ printf("stdcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
++ ", expected: 0x%016" PRIxPTR "\n",
++ addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
++ MOZ_CRASH();
++ }
++
++ if ((addr & kPointerAlignmentMask) == 0) {
++ SharedMem<int64_t*> ptr =
++ SharedMem<int64_t*>::shared(reinterpret_cast<int64_t*>(addr));
++
++ if (!LLBit_) {
++ return 0;
++ }
++
++ LLBit_ = false;
++ LLAddr_ = 0;
++ int64_t expected = lastLLValue_;
++ int64_t old =
++ AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
++ return (old == expected) ? 1 : 0;
++ }
++ printf("Unaligned stdcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
++ reinterpret_cast<intptr_t>(instr));
++ MOZ_CRASH();
++ return 0;
++}
++
++// -----------------------------------------------------------------------------
++// Stack limit / recursion helpers.
++
++uintptr_t Simulator::stackLimit() const { return stackLimit_; }
++
++uintptr_t* Simulator::addressOfStackLimit() { return &stackLimit_; }
++
++bool Simulator::overRecursed(uintptr_t newsp) const {
++ if (newsp == 0) {
++ newsp = getRegister(sp);
++ }
++ return newsp <= stackLimit();
++}
++
++bool Simulator::overRecursedWithExtra(uint32_t extra) const {
++ uintptr_t newsp = getRegister(sp) - extra;
++ return newsp <= stackLimit();
++}
++
++void Simulator::format(SimInstruction* instr, const char* format) {
++ printf("Simulator found unsupported instruction:\n 0x%016" PRIxPTR
++ ": %08x %s\n",
++ reinterpret_cast<intptr_t>(instr), instr->instructionBits(), format);
++ MOZ_CRASH();
++}
++
++// -----------------------------------------------------------------------------
++// softwareInterrupt - handle kCallRedirInstr (PPC_stop) and PPC_trap.
++
++ABI_FUNCTION_TYPE_SIM_PROTOTYPES
++
++void Simulator::softwareInterrupt(SimInstruction* instr) {
++ uint32_t instrBits = instr->instructionBits();
++
++ if (instrBits == kCallRedirInstr) {
++ Redirection* redirection = Redirection::FromSwiInstruction(instr);
++ uintptr_t nativeFn =
++ reinterpret_cast<uintptr_t>(redirection->nativeFunction());
++
++ // Get the SP for reading stack arguments.
++ int64_t* sp_ = reinterpret_cast<int64_t*>(getRegister(sp));
++ // Skip past the PPC64 ELFv2 link area (4 doublewords = 32 bytes).
++ sp_ = reinterpret_cast<int64_t*>(reinterpret_cast<uintptr_t>(sp_) + 32);
++
++ // PPC64 ELFv2: integer args in r3-r10, FP args in f1-f13.
++ int64_t a0_ = getRegister(r3);
++ int64_t a1_ = getRegister(r4);
++ int64_t a2_ = getRegister(r5);
++ int64_t a3_ = getRegister(r6);
++ int64_t a4_ = getRegister(r7);
++ int64_t a5_ = getRegister(r8);
++ int64_t a6_ = getRegister(r9);
++ int64_t a7_ = getRegister(r10);
++ // PPC64 ELFv2: FP args in f1-f13, mapped to f0_s..f12_s and f0_d..f12_d.
++ float f0_s = getFpuRegisterFloat(Simulator::f1);
++ float f1_s = getFpuRegisterFloat(Simulator::f2);
++ float f2_s = getFpuRegisterFloat(Simulator::f3);
++ float f3_s = getFpuRegisterFloat(Simulator::f4);
++ float f4_s = getFpuRegisterFloat(Simulator::f5);
++ float f5_s = getFpuRegisterFloat(Simulator::f6);
++ float f6_s = getFpuRegisterFloat(Simulator::f7);
++ float f7_s = getFpuRegisterFloat(Simulator::f8);
++ float f8_s = getFpuRegisterFloat(Simulator::f9);
++ float f9_s = getFpuRegisterFloat(Simulator::f10);
++ float f10_s = getFpuRegisterFloat(Simulator::f11);
++ float f11_s = getFpuRegisterFloat(Simulator::f12);
++ float f12_s = getFpuRegisterFloat(Simulator::f13);
++ double f0_d = getFpuRegisterDouble(Simulator::f1);
++ double f1_d = getFpuRegisterDouble(Simulator::f2);
++ double f2_d = getFpuRegisterDouble(Simulator::f3);
++ double f3_d = getFpuRegisterDouble(Simulator::f4);
++ double f4_d = getFpuRegisterDouble(Simulator::f5);
++ double f5_d = getFpuRegisterDouble(Simulator::f6);
++ double f6_d = getFpuRegisterDouble(Simulator::f7);
++ double f7_d = getFpuRegisterDouble(Simulator::f8);
++ double f8_d = getFpuRegisterDouble(Simulator::f9);
++ double f9_d = getFpuRegisterDouble(Simulator::f10);
++ double f10_d = getFpuRegisterDouble(Simulator::f11);
++ double f11_d = getFpuRegisterDouble(Simulator::f12);
++ double f12_d = getFpuRegisterDouble(Simulator::f13);
++
++ // Suppress unused-variable warnings for higher FP arg registers.
++ // They exist for ABI completeness but few function types use >5 FP args.
++ (void)f4_s; (void)f5_s; (void)f6_s; (void)f7_s; (void)f8_s; (void)f9_s;
++ (void)f10_s; (void)f11_s; (void)f12_s;
++ (void)f4_d; (void)f5_d; (void)f6_d; (void)f7_d; (void)f8_d; (void)f9_d;
++ (void)f10_d; (void)f11_d; (void)f12_d;
++
++ int64_t saved_lr = getLR();
++
++ bool stack_aligned = (getRegister(sp) & (ABIStackAlignment - 1)) == 0;
++ if (!stack_aligned) {
++ fprintf(stderr, "Runtime call with unaligned stack!\n");
++ MOZ_CRASH();
++ }
++
++ if (single_stepping_) {
++ single_step_callback_(single_step_callback_arg_, this, nullptr);
++ }
++
++ switch (redirection->type()) {
++ ABI_FUNCTION_TYPE_PPC64_SIM_DISPATCH
++
++ default:
++ MOZ_CRASH("Unknown function type.");
++ }
++
++ if (single_stepping_) {
++ single_step_callback_(single_step_callback_arg_, this, nullptr);
++ }
++
++ setLR(saved_lr);
++ set_pc(getLR());
++ } else if (instrBits == 0x7FE00008) {
++ // PPC_trap: used for wasm traps.
++ uint8_t* newPC;
++ if (wasm::HandleIllegalInstruction(registerState(), &newPC)) {
++ set_pc(int64_t(newPC));
++ return;
++ }
++ MOZ_CRASH("Unexpected trap instruction");
++ } else {
++ // Other trap-like instructions: enter debugger.
++ ppc64Debugger dbg(this);
++ dbg.debug();
++ }
++}
++
++// -----------------------------------------------------------------------------
++// Stop/breakpoint helpers.
++
++bool Simulator::isWatchpoint(uint32_t code) {
++ return (code <= kMaxWatchpointCode);
++}
++
++void Simulator::printWatchpoint(uint32_t code) {
++ ppc64Debugger dbg(this);
++ ++break_count_;
++ printf("\n---- break %d marker: %20" PRIi64 " (instr count: %20" PRIi64
++ ") ----\n",
++ code, break_count_, icount_);
++ dbg.printAllRegs();
++}
++
++void Simulator::handleStop(uint32_t code, SimInstruction* instr) {
++ if (isEnabledStop(code)) {
++ ppc64Debugger dbg(this);
++ dbg.stop(instr);
++ } else {
++ set_pc(get_pc() + SimInstruction::kInstrSize);
++ }
++}
++
++bool Simulator::isStopInstruction(SimInstruction* instr) {
++ return instr->instructionBits() == kCallRedirInstr;
++}
++
++bool Simulator::isEnabledStop(uint32_t code) {
++ MOZ_ASSERT(code <= kMaxStopCode);
++ MOZ_ASSERT(code > kMaxWatchpointCode);
++ return !(watchedStops_[code].count_ & kStopDisabledBit);
++}
++
++void Simulator::enableStop(uint32_t code) {
++ if (!isEnabledStop(code)) {
++ watchedStops_[code].count_ &= ~kStopDisabledBit;
++ }
++}
++
++void Simulator::disableStop(uint32_t code) {
++ if (isEnabledStop(code)) {
++ watchedStops_[code].count_ |= kStopDisabledBit;
++ }
++}
++
++void Simulator::increaseStopCounter(uint32_t code) {
++ MOZ_ASSERT(code <= kMaxStopCode);
++ if ((watchedStops_[code].count_ & ~(1 << 31)) == 0x7fffffff) {
++ printf(
++ "Stop counter for code %i has overflowed.\n"
++ "Enabling this code and reseting the counter to 0.\n",
++ code);
++ watchedStops_[code].count_ = 0;
++ enableStop(code);
++ } else {
++ watchedStops_[code].count_++;
++ }
++}
++
++void Simulator::printStopInfo(uint32_t code) {
++ if (code <= kMaxWatchpointCode) {
++ printf("That is a watchpoint, not a stop.\n");
++ return;
++ } else if (code > kMaxStopCode) {
++ printf("Code too large, only %u stops can be used\n", kMaxStopCode + 1);
++ return;
++ }
++ const char* state = isEnabledStop(code) ? "Enabled" : "Disabled";
++ int32_t count = watchedStops_[code].count_ & ~kStopDisabledBit;
++ if (count != 0) {
++ if (watchedStops_[code].desc_) {
++ printf("stop %i - 0x%x: \t%s, \tcounter = %i, \t%s\n", code, code,
++ state, count, watchedStops_[code].desc_);
++ } else {
++ printf("stop %i - 0x%x: \t%s, \tcounter = %i\n", code, code, state,
++ count);
++ }
++ }
++}
++
++// =============================================================================
++// Instruction decoders.
++// =============================================================================
++
++// Compute effective address for D-form instructions.
++// If RA==0, the base is 0 (not GPR[0]).
++static inline int64_t DFormEA(Simulator* sim, SimInstruction* instr,
++ int16_t offset) {
++ uint32_t ra = instr->raValue();
++ int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
++ return base + offset;
++}
++
++// Compute effective address for DS-form instructions.
++static inline int64_t DSFormEA(Simulator* sim, SimInstruction* instr,
++ int16_t offset) {
++ uint32_t ra = instr->raValue();
++ int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
++ return base + offset;
++}
++
++// Compute effective address for X-form indexed instructions.
++// If RA==0, base is 0 (not GPR[0]).
++static inline int64_t XFormEA(Simulator* sim, SimInstruction* instr) {
++ uint32_t ra = instr->raValue();
++ uint32_t rb = instr->rbValue();
++ int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
++ return base + sim->getRegister(rb);
++}
++
++// Compute effective address for X-form indexed updates (RA != 0 required).
++static inline int64_t XFormEAUpdate(Simulator* sim, SimInstruction* instr) {
++ uint32_t ra = instr->raValue();
++ uint32_t rb = instr->rbValue();
++ return sim->getRegister(ra) + sim->getRegister(rb);
++}
++
++// -----------------------------------------------------------------------------
++// decodeDFormALU: addi, addis, ori, oris, xori, xoris, andi., andis.,
++// cmpi, cmpli, subfic, addic, addic., mulli, twi
++
++void Simulator::decodeDFormALU(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++ uint32_t rt = instr->rtValue();
++ uint32_t ra = instr->raValue();
++ int16_t si = instr->imm16Value();
++ uint16_t ui = instr->uimm16Value();
++
++ switch (opcode) {
++ case 14: {
++ // addi: RT = (RA|0) + SI
++ int64_t base = (ra == 0) ? 0 : getRegister(ra);
++ setRegister(rt, base + (int64_t)si);
++ break;
++ }
++ case 15: {
++ // addis: RT = (RA|0) + (SI << 16)
++ int64_t base = (ra == 0) ? 0 : getRegister(ra);
++ setRegister(rt, base + ((int64_t)si << 16));
++ break;
++ }
++ case 24: {
++ // ori: RA = RS | UI
++ setRegister(ra, getRegister(rt) | (uint64_t)ui);
++ break;
++ }
++ case 25: {
++ // oris: RA = RS | (UI << 16)
++ setRegister(ra, getRegister(rt) | ((uint64_t)ui << 16));
++ break;
++ }
++ case 26: {
++ // xori: RA = RS ^ UI
++ setRegister(ra, getRegister(rt) ^ (uint64_t)ui);
++ break;
++ }
++ case 27: {
++ // xoris: RA = RS ^ (UI << 16)
++ setRegister(ra, getRegister(rt) ^ ((uint64_t)ui << 16));
++ break;
++ }
++ case 28: {
++ // andi.: RA = RS & UI, update CR0
++ int64_t result = getRegister(rt) & (uint64_t)ui;
++ setRegister(ra, result);
++ updateCR0(result);
++ break;
++ }
++ case 29: {
++ // andis.: RA = RS & (UI << 16), update CR0
++ int64_t result = getRegister(rt) & ((uint64_t)ui << 16);
++ setRegister(ra, result);
++ updateCR0(result);
++ break;
++ }
++ case 11: {
++ // cmpi: compare RA with SI, signed
++ uint32_t bf = instr->bfValue();
++ bool l = instr->lBit();
++ if (l) {
++ // 64-bit compare
++ setCRFieldCmp(bf, getRegister(ra), (int64_t)si);
++ } else {
++ // 32-bit compare
++ int32_t ra32 = I32(getRegister(ra));
++ setCRFieldCmp(bf, (int64_t)ra32, (int64_t)(int32_t)si);
++ }
++ break;
++ }
++ case 10: {
++ // cmpli: compare RA with UI, unsigned
++ uint32_t bf = instr->bfValue();
++ bool l = instr->lBit();
++ if (l) {
++ // 64-bit unsigned compare
++ setCRFieldCmpU(bf, U64(getRegister(ra)), (uint64_t)ui);
++ } else {
++ // 32-bit unsigned compare
++ uint32_t ra32 = U32(getRegister(ra));
++ setCRFieldCmpU(bf, (uint64_t)ra32, (uint64_t)ui);
++ }
++ break;
++ }
++ case 8: {
++ // subfic: RT = SI - RA, set CA
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t imm = U64((int64_t)si);
++ uint64_t result = imm + ~ra_val + 1;
++ setRegister(rt, I64(result));
++ // CA is set if there is a carry out of the addition (~RA + IMM + 1).
++ // Equivalently, CA = (IMM >= RA) for unsigned interpretation of the
++ // full 64-bit subtraction.
++ bool carry = (imm >= ra_val) || (imm == 0 && ra_val == 0);
++ // More precise: carry = (~ra_val + imm) would overflow, or adding 1
++ // overflows.
++ uint64_t tmp = ~ra_val + imm;
++ carry = (tmp < ~ra_val) || (tmp < imm) || (result < tmp);
++ // Simplify: CA if no borrow.
++ carry = (U64((int64_t)si) >= ra_val);
++ if (ra_val == 0) carry = true;
++ // Actually, subfic CA: carry out of ~RA + IMM + 1.
++ // CA = (IMM > RA - 1) when RA != 0, CA = 1 when RA == 0.
++ // Or just: the unsigned result of (SI - RA) is valid (no borrow).
++ // Let's compute it correctly:
++ {
++ __uint128_t wide = (__uint128_t)(~ra_val) + (__uint128_t)imm + 1;
++ carry = (wide >> 64) != 0;
++ }
++ setXERCA(carry);
++ break;
++ }
++ case 12: {
++ // addic: RT = RA + SI, set CA
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t imm = U64((int64_t)si);
++ uint64_t result = ra_val + imm;
++ setRegister(rt, I64(result));
++ setXERCA(result < ra_val);
++ break;
++ }
++ case 13: {
++ // addic.: RT = RA + SI, set CA, update CR0
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t imm = U64((int64_t)si);
++ uint64_t result = ra_val + imm;
++ setRegister(rt, I64(result));
++ setXERCA(result < ra_val);
++ updateCR0(I64(result));
++ break;
++ }
++ case 7: {
++ // mulli: RT = RA * SI (low 64 bits)
++ int64_t result = getRegister(ra) * (int64_t)si;
++ setRegister(rt, result);
++ break;
++ }
++ case 3: {
++ // twi: Trap Word Immediate. We don't implement trapping in the
++ // simulator; just continue.
++ break;
++ }
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeDFormALU: unhandled opcode %u", opcode);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// decodeDFormLoad: lwz(32), lbz(34), lhz(40), lha(42), lfs(48), lfd(50)
++// and update variants
++
++void Simulator::decodeDFormLoad(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++ uint32_t rt = instr->rtValue();
++ int16_t si = instr->imm16Value();
++ uint64_t ea = DFormEA(this, instr, si);
++
++ switch (opcode) {
++ case 32:
++ // lwz
++ setRegister(rt, U64(readWU(ea, instr)));
++ break;
++ case 33: {
++ // lwzu: RA != 0, load and update RA
++ setRegister(rt, U64(readWU(ea, instr)));
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 34:
++ // lbz
++ setRegister(rt, U64(readBU(ea)));
++ break;
++ case 35: {
++ // lbzu
++ setRegister(rt, U64(readBU(ea)));
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 40:
++ // lhz
++ setRegister(rt, U64(readHU(ea, instr)));
++ break;
++ case 41: {
++ // lhzu
++ setRegister(rt, U64(readHU(ea, instr)));
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 42:
++ // lha (half-word, sign-extended)
++ setRegister(rt, (int64_t)readH(ea, instr));
++ break;
++ case 43: {
++ // lhau
++ setRegister(rt, (int64_t)readH(ea, instr));
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 48: {
++ // lfs: load float single, widen to double in FPR (NaN-preserving;
++ // matches Power ISA `lfs` which uses xscvspdpn semantics)
++ if (handleWasmSegFault(ea, 4)) break;
++ float val = *reinterpret_cast<float*>(ea);
++ setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++ break;
++ }
++ case 49: {
++ // lfsu
++ if (handleWasmSegFault(ea, 4)) break;
++ float val = *reinterpret_cast<float*>(ea);
++ setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 50: {
++ // lfd: load float double
++ double val = readD(ea, instr);
++ setFpuRegisterDouble(rt, val);
++ break;
++ }
++ case 51: {
++ // lfdu
++ double val = readD(ea, instr);
++ setFpuRegisterDouble(rt, val);
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeDFormLoad: unhandled opcode %u", opcode);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// decodeDFormStore: stw(36), stwu(37), stb(38), sth(44), stfs(52), stfd(54)
++// and update variants
++
++void Simulator::decodeDFormStore(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++ uint32_t rs = instr->rsValue();
++ int16_t si = instr->imm16Value();
++
++ // For stores, the effective address calculation differs for update forms:
++ // - Non-update: EA = (RA|0) + D
++ // - Update: EA = RA + D (RA must not be 0)
++ bool isUpdate = false;
++ switch (opcode) {
++ case 37: case 39: case 45: case 53: case 55:
++ isUpdate = true;
++ break;
++ }
++
++ uint64_t ea;
++ if (isUpdate) {
++ ea = getRegister(instr->raValue()) + (int64_t)si;
++ } else {
++ ea = DFormEA(this, instr, si);
++ }
++
++ switch (opcode) {
++ case 36:
++ // stw
++ writeW(ea, I32(getRegister(rs)), instr);
++ break;
++ case 38:
++ // stb
++ writeB(ea, (uint8_t)(getRegister(rs) & 0xFF));
++ break;
++ case 39:
++ // stbu
++ writeB(ea, (uint8_t)(getRegister(rs) & 0xFF));
++ setRegister(instr->raValue(), ea);
++ break;
++ case 44:
++ // sth
++ writeH(ea, U16(getRegister(rs)), instr);
++ break;
++ case 45:
++ // sthu
++ writeH(ea, U16(getRegister(rs)), instr);
++ setRegister(instr->raValue(), ea);
++ break;
++ case 52: {
++ // stfs: convert double in FPR to single and store (NaN-preserving;
++ // matches Power ISA `stfs` which uses xscvdpspn semantics)
++ double dval = getFpuRegisterDouble(rs);
++ float fval = demoteDoublePreservingNaN(dval);
++ if (handleWasmSegFault(ea, 4)) break;
++ *reinterpret_cast<float*>(ea) = fval;
++ LLBit_ = false;
++ break;
++ }
++ case 53: {
++ // stfsu
++ double dval = getFpuRegisterDouble(rs);
++ float fval = demoteDoublePreservingNaN(dval);
++ if (handleWasmSegFault(ea, 4)) break;
++ *reinterpret_cast<float*>(ea) = fval;
++ LLBit_ = false;
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 54:
++ // stfd
++ writeD(ea, getFpuRegisterDouble(rs), instr);
++ break;
++ case 55:
++ // stfdu
++ writeD(ea, getFpuRegisterDouble(rs), instr);
++ setRegister(instr->raValue(), ea);
++ break;
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeDFormStore: unhandled opcode %u", opcode);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// decodeDSForm: ld(58/0), lwa(58/2), std(62/0), stdu(62/1)
++
++void Simulator::decodeDSForm(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++ uint32_t rt = instr->rtValue();
++ int16_t ds = instr->ds14Value();
++ uint32_t xo = instr->bits(1, 0);
++
++ if (opcode == 58) {
++ uint64_t ea = DSFormEA(this, instr, ds);
++ switch (xo) {
++ case 0:
++ // ld
++ setRegister(rt, readDW(ea, instr));
++ break;
++ case 1: {
++ // ldu
++ setRegister(rt, readDW(ea, instr));
++ setRegister(instr->raValue(), ea);
++ break;
++ }
++ case 2:
++ // lwa (load word algebraic, sign-extended to 64)
++ setRegister(rt, (int64_t)readW(ea, instr));
++ break;
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: opcode 58, xo=%u", xo);
++ }
++ } else if (opcode == 62) {
++ // For std/stdu, EA uses RA directly (no RA|0 rule).
++ uint64_t ea;
++ if (xo == 1) {
++ // stdu: update form
++ ea = getRegister(instr->raValue()) + (int64_t)ds;
++ } else {
++ ea = DSFormEA(this, instr, ds);
++ }
++ switch (xo) {
++ case 0:
++ // std
++ writeDW(ea, getRegister(rt), instr);
++ break;
++ case 1:
++ // stdu
++ writeDW(ea, getRegister(rt), instr);
++ setRegister(instr->raValue(), ea);
++ break;
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: opcode 62, xo=%u", xo);
++ }
++ } else {
++ MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: unhandled opcode %u", opcode);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// decodeXForm: Major opcode 31 (X-form, XO-form, etc.)
++// This is the largest decoder covering most ALU, indexed load/store, SPR,
++// and atomic instructions.
++
++void Simulator::decodeXForm(SimInstruction* instr) {
++ uint32_t xo = instr->xoValue();
++ uint32_t rt = instr->rtValue();
++ uint32_t ra = instr->raValue();
++ uint32_t rb = instr->rbValue();
++ bool rc = instr->rcBit();
++
++ // Many instructions share major opcode 31. Switch on extended opcode.
++ // For XO-form with OE=1, the xoValue() includes bit 10, so
++ // addo (266 | 512 = 778) etc. are separate cases.
++
++ // First check for isel which uses bits 1-5 = 15 (XO = 15 in bits 1..5).
++ if ((xo & 0x1F) == 15) {
++ // isel: if CR[BC] then RT=RA else RT=RB
++ // BC is in bits 6..10 (the rc field position).
++ uint32_t bc = instr->rcValue();
++ uint32_t crField = bc / 4;
++ uint32_t crBit = bc % 4;
++ uint8_t crFieldVal = getCRField(crField);
++ // PPC CR field bits: bit3=LT(8), bit2=GT(4), bit1=EQ(2), bit0=SO(1)
++ // Bit numbering within field: 0=LT, 1=GT, 2=EQ, 3=SO
++ bool bitSet;
++ switch (crBit) {
++ case 0: bitSet = (crFieldVal & kCRFieldLT) != 0; break;
++ case 1: bitSet = (crFieldVal & kCRFieldGT) != 0; break;
++ case 2: bitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++ case 3: bitSet = (crFieldVal & kCRFieldSO) != 0; break;
++ default: bitSet = false; break;
++ }
++ int64_t raVal = (ra == 0) ? 0 : getRegister(ra);
++ int64_t rbVal = getRegister(rb);
++ setRegister(rt, bitSet ? raVal : rbVal);
++ return;
++ }
++
++ switch (xo) {
++ // --- Arithmetic ---
++ case 266: {
++ // add
++ int64_t result = getRegister(ra) + getRegister(rb);
++ setRegister(rt, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 778: {
++ // addo
++ int64_t ra_val = getRegister(ra);
++ int64_t rb_val = getRegister(rb);
++ int64_t result = ra_val + rb_val;
++ setRegister(rt, result);
++ // Overflow if signs of inputs are same but result sign differs.
++ bool ov = ((ra_val ^ result) & (rb_val ^ result)) < 0;
++ setXEROV(ov);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 10: {
++ // addc
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ uint64_t result = ra_val + rb_val;
++ setRegister(rt, I64(result));
++ setXERCA(result < ra_val);
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 138: {
++ // adde: RT = RA + RB + CA
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++ uint64_t result = ra_val + rb_val + ca;
++ setRegister(rt, I64(result));
++ // Carry-out: when ca==0, only the ra+rb wrap matters; when ca==1,
++ // an additional wrap occurs iff result <= ra_val.
++ bool newCA = ca ? (result <= ra_val) : (result < ra_val);
++ setXERCA(newCA);
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 234: {
++ // addme: RT = RA + CA - 1
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++ uint64_t result = ra_val + ca + ~0ULL; // + CA + (-1)
++ setRegister(rt, I64(result));
++ // CA if carry out of (RA + CA + 0xFFFFFFFFFFFFFFFF)
++ bool newCA = (ra_val != 0) || (ca != 0);
++ setXERCA(newCA);
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 202: {
++ // addze: RT = RA + CA
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++ uint64_t result = ra_val + ca;
++ setRegister(rt, I64(result));
++ setXERCA(result < ra_val);
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 40: {
++ // subf: RT = RB - RA
++ int64_t result = getRegister(rb) - getRegister(ra);
++ setRegister(rt, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 552: {
++ // subfo: RT = RB - RA, set OV
++ int64_t ra_val = getRegister(ra);
++ int64_t rb_val = getRegister(rb);
++ int64_t result = rb_val - ra_val;
++ setRegister(rt, result);
++ bool ov = ((rb_val ^ ra_val) & (rb_val ^ result)) < 0;
++ setXEROV(ov);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 8: {
++ // subfc: RT = ~RA + RB + 1
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ uint64_t result = ~ra_val + rb_val + 1;
++ setRegister(rt, I64(result));
++ // CA = no borrow = (RB >= RA unsigned)
++ setXERCA(rb_val >= ra_val);
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 136: {
++ // subfe: RT = ~RA + RB + CA
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++ uint64_t result = ~ra_val + rb_val + ca;
++ setRegister(rt, I64(result));
++ __uint128_t wide = (__uint128_t)(~ra_val) + (__uint128_t)rb_val + ca;
++ setXERCA((wide >> 64) != 0);
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 232: {
++ // subfze: RT = ~RA + CA
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t ca = getXERCA() ? 1ULL : 0ULL;
++ uint64_t result = ~ra_val + ca;
++ setRegister(rt, I64(result));
++ setXERCA(ca > ra_val); // CA if ~RA + CA overflows
++ if (rc) updateCR0(I64(result));
++ break;
++ }
++ case 104: {
++ // neg: RT = -RA
++ int64_t result = -getRegister(ra);
++ setRegister(rt, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++
++ // --- Multiply ---
++ case 233: {
++ // mulld: RT = RA * RB (low 64 bits)
++ int64_t result = getRegister(ra) * getRegister(rb);
++ setRegister(rt, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 745: {
++ // mulldo: RT = RA * RB, set OV
++ int64_t ra_val = getRegister(ra);
++ int64_t rb_val = getRegister(rb);
++ int64_t result = ra_val * rb_val;
++ setRegister(rt, result);
++ // OV if high part of full 128-bit product is not all-sign.
++ int64_t hi = MultiplyHighSigned(ra_val, rb_val);
++ bool ov = (hi != (result >> 63));
++ setXEROV(ov);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 235: {
++ // mullw: RT = sign_ext(RA[32:63] * RB[32:63])
++ int64_t result = (int64_t)I32(getRegister(ra)) *
++ (int64_t)I32(getRegister(rb));
++ setRegister(rt, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 747: {
++ // mullwo
++ int64_t ra_val = I32(getRegister(ra));
++ int64_t rb_val = I32(getRegister(rb));
++ int64_t result = ra_val * rb_val;
++ setRegister(rt, result);
++ bool ov = (result != (int64_t)I32(result));
++ setXEROV(ov);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 73: {
++ // mulhd: RT = high 64 bits of RA * RB (signed)
++ setRegister(rt, MultiplyHighSigned(getRegister(ra), getRegister(rb)));
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 9: {
++ // mulhdu: RT = high 64 bits of RA * RB (unsigned)
++ setRegister(rt, I64(MultiplyHighUnsigned(U64(getRegister(ra)),
++ U64(getRegister(rb)))));
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 75: {
++ // mulhw: RT = high 32 bits of (RA[32:63] * RB[32:63]), signed
++ int64_t result =
++ (int64_t)I32(getRegister(ra)) * (int64_t)I32(getRegister(rb));
++ setRegister(rt, result >> 32);
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 11: {
++ // mulhwu: RT = high 32 bits, unsigned
++ uint64_t result =
++ (uint64_t)U32(getRegister(ra)) * (uint64_t)U32(getRegister(rb));
++ setRegister(rt, I64(result >> 32));
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++
++ // --- Divide ---
++ case 489: {
++ // divd: RT = RA / RB (signed, 64-bit)
++ int64_t ra_val = getRegister(ra);
++ int64_t rb_val = getRegister(rb);
++ if (rb_val == 0 || (ra_val == INT64_MIN && rb_val == -1)) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, ra_val / rb_val);
++ }
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 1001: {
++ // divdo
++ int64_t ra_val = getRegister(ra);
++ int64_t rb_val = getRegister(rb);
++ bool ov = (rb_val == 0) || (ra_val == INT64_MIN && rb_val == -1);
++ if (ov) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, ra_val / rb_val);
++ }
++ setXEROV(ov);
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 457: {
++ // divdu: unsigned 64-bit divide
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ if (rb_val == 0) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, I64(ra_val / rb_val));
++ }
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 969: {
++ // divduo
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ bool ov = (rb_val == 0);
++ if (ov) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, I64(ra_val / rb_val));
++ }
++ setXEROV(ov);
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 491: {
++ // divw: signed 32-bit divide
++ int32_t ra_val = I32(getRegister(ra));
++ int32_t rb_val = I32(getRegister(rb));
++ if (rb_val == 0 || (ra_val == INT32_MIN && rb_val == -1)) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, (int64_t)(ra_val / rb_val));
++ }
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 1003: {
++ // divwo
++ int32_t ra_val = I32(getRegister(ra));
++ int32_t rb_val = I32(getRegister(rb));
++ bool ov = (rb_val == 0) || (ra_val == INT32_MIN && rb_val == -1);
++ if (ov) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, (int64_t)(ra_val / rb_val));
++ }
++ setXEROV(ov);
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 459: {
++ // divwu: unsigned 32-bit divide
++ uint32_t ra_val = U32(getRegister(ra));
++ uint32_t rb_val = U32(getRegister(rb));
++ if (rb_val == 0) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, (int64_t)(ra_val / rb_val));
++ }
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++ case 971: {
++ // divwuo
++ uint32_t ra_val = U32(getRegister(ra));
++ uint32_t rb_val = U32(getRegister(rb));
++ bool ov = (rb_val == 0);
++ if (ov) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, (int64_t)(ra_val / rb_val));
++ }
++ setXEROV(ov);
++ if (rc) updateCR0(getRegister(rt));
++ break;
++ }
++
++ // --- POWER9 modulo (ISA 3.0) ---
++ // Result of "undefined" division (rb_val == 0, or signed INT_MIN / -1)
++ // is implementation-defined per Power ISA; matching the divX behaviour
++ // above, we yield 0 in those cases. Rc has no encoding for these ops.
++ case 779: {
++ // modsw: RT = RA % RB (signed, 32-bit)
++ int32_t ra_val = I32(getRegister(ra));
++ int32_t rb_val = I32(getRegister(rb));
++ if (rb_val == 0 || (ra_val == INT32_MIN && rb_val == -1)) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, (int64_t)(ra_val % rb_val));
++ }
++ break;
++ }
++ case 267: {
++ // moduw: RT = RA % RB (unsigned, 32-bit)
++ uint32_t ra_val = U32(getRegister(ra));
++ uint32_t rb_val = U32(getRegister(rb));
++ if (rb_val == 0) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, (int64_t)(ra_val % rb_val));
++ }
++ break;
++ }
++ case 777: {
++ // modsd: RT = RA % RB (signed, 64-bit)
++ int64_t ra_val = getRegister(ra);
++ int64_t rb_val = getRegister(rb);
++ if (rb_val == 0 || (ra_val == INT64_MIN && rb_val == -1)) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, ra_val % rb_val);
++ }
++ break;
++ }
++ case 265: {
++ // modud: RT = RA % RB (unsigned, 64-bit)
++ uint64_t ra_val = U64(getRegister(ra));
++ uint64_t rb_val = U64(getRegister(rb));
++ if (rb_val == 0) {
++ setRegister(rt, 0);
++ } else {
++ setRegister(rt, I64(ra_val % rb_val));
++ }
++ break;
++ }
++
++ // --- Logical ---
++ case 28: {
++ // and: RA = RS & RB
++ int64_t result = getRegister(rt) & getRegister(rb);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 60: {
++ // andc: RA = RS & ~RB
++ int64_t result = getRegister(rt) & ~getRegister(rb);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 444: {
++ // or: RA = RS | RB
++ int64_t result = getRegister(rt) | getRegister(rb);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 412: {
++ // orc: RA = RS | ~RB
++ int64_t result = getRegister(rt) | ~getRegister(rb);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 316: {
++ // xor: RA = RS ^ RB
++ int64_t result = getRegister(rt) ^ getRegister(rb);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 476: {
++ // nand: RA = ~(RS & RB)
++ int64_t result = ~(getRegister(rt) & getRegister(rb));
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 124: {
++ // nor: RA = ~(RS | RB)
++ int64_t result = ~(getRegister(rt) | getRegister(rb));
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 284: {
++ // eqv: RA = ~(RS ^ RB)
++ int64_t result = ~(getRegister(rt) ^ getRegister(rb));
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++
++ // --- Shifts ---
++ case 27: {
++ // sld: RA = RS << RB[58:63] if RB[57]==0, else RA=0
++ uint64_t shift = U64(getRegister(rb));
++ uint64_t rs_val = U64(getRegister(rt));
++ int64_t result;
++ if (shift & 0x40) {
++ result = 0;
++ } else {
++ result = I64(rs_val << (shift & 0x3F));
++ }
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 24: {
++ // slw: RA = RS[32:63] << RB[59:63] if RB[58]==0, else RA=0 (32-bit)
++ uint32_t shift = U32(getRegister(rb));
++ uint32_t rs_val = U32(getRegister(rt));
++ uint32_t result;
++ if (shift & 0x20) {
++ result = 0;
++ } else {
++ result = rs_val << (shift & 0x1F);
++ }
++ setRegister(ra, (int64_t)(int32_t)result);
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 539: {
++ // srd: RA = RS >> RB[58:63] if RB[57]==0, else RA=0 (logical)
++ uint64_t shift = U64(getRegister(rb));
++ uint64_t rs_val = U64(getRegister(rt));
++ int64_t result;
++ if (shift & 0x40) {
++ result = 0;
++ } else {
++ result = I64(rs_val >> (shift & 0x3F));
++ }
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 536: {
++ // srw: RA = RS[32:63] >> RB[59:63] logical (32-bit)
++ uint32_t shift = U32(getRegister(rb));
++ uint32_t rs_val = U32(getRegister(rt));
++ uint32_t result;
++ if (shift & 0x20) {
++ result = 0;
++ } else {
++ result = rs_val >> (shift & 0x1F);
++ }
++ setRegister(ra, (int64_t)(int32_t)result);
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 794: {
++ // srad: RA = RS >> RB[58:63] arithmetic (64-bit), set CA
++ uint64_t shift = U64(getRegister(rb));
++ int64_t rs_val = getRegister(rt);
++ int64_t result;
++ bool carry;
++ if (shift & 0x40) {
++ result = rs_val >> 63; // all sign bits
++ carry = (rs_val < 0);
++ } else {
++ uint32_t sh = shift & 0x3F;
++ result = rs_val >> sh;
++ // CA = 1 if RS is negative and any 1-bits were shifted out.
++ carry = (rs_val < 0) && ((rs_val & ((1ULL << sh) - 1)) != 0);
++ }
++ setRegister(ra, result);
++ setXERCA(carry);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 792: {
++ // sraw: RA = RS[32:63] >> RB[59:63] arithmetic (32-bit), set CA
++ uint32_t shift = U32(getRegister(rb));
++ int32_t rs_val = I32(getRegister(rt));
++ int32_t result;
++ bool carry;
++ if (shift & 0x20) {
++ result = rs_val >> 31;
++ carry = (rs_val < 0);
++ } else {
++ uint32_t sh = shift & 0x1F;
++ result = rs_val >> sh;
++ carry = (rs_val < 0) && ((rs_val & ((1U << sh) - 1)) != 0);
++ }
++ setRegister(ra, (int64_t)result);
++ setXERCA(carry);
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 826:
++ case 827: {
++ // sradi RA, RS, SH: RA = EXTS(RS) >> sh arithmetic (64-bit), set CA.
++ // XS-form, XO=413 (9-bit, bits 21-29), sh[5] at bit 30, Rc at bit 31.
++ // Our xoValue() extracts bits 10:1 (10 bits)
++ // which yields 413*2 + sh[5] = 826 (sh[5]=0) or 827 (sh[5]=1).
++ // sh[0:4] at instruction bits 15:11 (= raValue field position, but
++ // for this XS-form they're the SH[0:4] subfield).
++ uint32_t sh = instr->bits(15, 11) | (instr->bit(1) << 5);
++ int64_t rs_val = getRegister(rt);
++ int64_t result = (sh == 0) ? rs_val : (rs_val >> sh);
++ // CA := rs_val < 0 && any bits shifted out are 1.
++ bool carry = (rs_val < 0) && sh > 0 &&
++ ((U64(rs_val) & ((1ULL << sh) - 1)) != 0);
++ setRegister(ra, result);
++ setXERCA(carry);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 824: {
++ // srawi: RA = RS[32:63] >> SH arithmetic (32-bit), set CA
++ uint32_t sh = instr->bits(15, 11);
++ int32_t rs_val = I32(getRegister(rt));
++ int32_t result = rs_val >> sh;
++ bool carry = (rs_val < 0) && sh > 0 &&
++ ((U32(rs_val) & ((1U << sh) - 1)) != 0);
++ setRegister(ra, (int64_t)result);
++ setXERCA(carry);
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++
++ // --- Extend / count ---
++ case 954: {
++ // extsb: RA = sign_ext(RS[56:63])
++ int64_t result = (int64_t)(int8_t)(getRegister(rt) & 0xFF);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 922: {
++ // extsh: RA = sign_ext(RS[48:63])
++ int64_t result = (int64_t)(int16_t)(getRegister(rt) & 0xFFFF);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 986: {
++ // extsw: RA = sign_ext(RS[32:63])
++ int64_t result = (int64_t)(int32_t)(getRegister(rt) & 0xFFFFFFFF);
++ setRegister(ra, result);
++ if (rc) updateCR0(result);
++ break;
++ }
++ case 58: {
++ // cntlzd: RA = count leading zeros of RS (64-bit)
++ setRegister(ra, CountLeadingZeros64(U64(getRegister(rt))));
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 26: {
++ // cntlzw: RA = count leading zeros of RS[32:63] (32-bit)
++ setRegister(ra, CountLeadingZeros32(U32(getRegister(rt))));
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 570: {
++ // cnttzd
++ setRegister(ra, CountTrailingZeros64(U64(getRegister(rt))));
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 538: {
++ // cnttzw
++ setRegister(ra, CountTrailingZeros32(U32(getRegister(rt))));
++ if (rc) updateCR0(getRegister(ra));
++ break;
++ }
++ case 506: {
++ // popcntd
++ setRegister(ra, PopCount64(U64(getRegister(rt))));
++ break;
++ }
++ case 378: {
++ // popcntw: popcount each 32-bit half independently, sum in each half
++ uint64_t val = U64(getRegister(rt));
++ uint32_t lo = PopCount32(U32(val));
++ uint32_t hi = PopCount32(U32(val >> 32));
++ setRegister(ra, I64(((uint64_t)hi << 32) | lo));
++ break;
++ }
++ case 122: {
++ // popcntb: popcount each byte independently
++ setRegister(ra, I64(PopCountPerByte(U64(getRegister(rt)))));
++ break;
++ }
++ case 187: {
++ // brd (POWER10): RA = byte-reverse(RS) full 64-bit doubleword.
++ setRegister(ra, I64(__builtin_bswap64(U64(getRegister(rt)))));
++ break;
++ }
++ case 219: {
++ // brh (POWER10): byte-reverse each of the 4 halfwords in RS.
++ uint64_t v = U64(getRegister(rt));
++ uint64_t out = ((v & 0xFF00FF00FF00FF00ULL) >> 8) |
++ ((v & 0x00FF00FF00FF00FFULL) << 8);
++ setRegister(ra, I64(out));
++ break;
++ }
++ case 155: {
++ // brw (POWER10): byte-reverse each of the 2 words in RS.
++ uint64_t v = U64(getRegister(rt));
++ uint64_t out = ((uint64_t)__builtin_bswap32((uint32_t)(v >> 32)) << 32) |
++ (uint64_t)__builtin_bswap32((uint32_t)v);
++ setRegister(ra, I64(out));
++ break;
++ }
++
++ // --- Compare (X-form) ---
++ case 0: {
++ // cmp (cmpw/cmpd): signed compare
++ uint32_t bf = instr->bfValue();
++ bool l = instr->lBit();
++ if (l) {
++ setCRFieldCmp(bf, getRegister(ra), getRegister(rb));
++ } else {
++ setCRFieldCmp(bf, (int64_t)I32(getRegister(ra)),
++ (int64_t)I32(getRegister(rb)));
++ }
++ break;
++ }
++ case 32: {
++ // cmpl (cmplw/cmpld): unsigned compare
++ uint32_t bf = instr->bfValue();
++ bool l = instr->lBit();
++ if (l) {
++ setCRFieldCmpU(bf, U64(getRegister(ra)), U64(getRegister(rb)));
++ } else {
++ setCRFieldCmpU(bf, (uint64_t)U32(getRegister(ra)),
++ (uint64_t)U32(getRegister(rb)));
++ }
++ break;
++ }
++
++ // --- Trap ---
++ case 4: {
++ // tw: Trap Word. The JIT uses this for debugging / tagging.
++ // In the simulator we just treat it as a NOP (the JIT uses tagged
++ // trap words that are never actually reached during normal execution,
++ // they serve as metadata for the patcher).
++ break;
++ }
++
++ // --- SPR ---
++ case 339: {
++ // mfspr: RT = SPR
++ // SPR encoding: spr[4:0] at bits 16..20, spr[9:5] at bits 11..15
++ uint32_t spr_lo = instr->raValue(); // bits 16..20
++ uint32_t spr_hi = instr->rbValue(); // bits 11..15
++ uint32_t spr = (spr_lo) | (spr_hi << 5);
++ switch (spr) {
++ case 8: // LR
++ setRegister(rt, getLR());
++ break;
++ case 9: // CTR
++ setRegister(rt, getCTR());
++ break;
++ case 1: // XER
++ setRegister(rt, I64(getXER()));
++ break;
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("mfspr: unhandled SPR %u", spr);
++ }
++ break;
++ }
++ case 467: {
++ // mtspr: SPR = RS
++ uint32_t spr_lo = instr->raValue();
++ uint32_t spr_hi = instr->rbValue();
++ uint32_t spr = (spr_lo) | (spr_hi << 5);
++ int64_t val = getRegister(rt);
++ switch (spr) {
++ case 8: // LR
++ setLR(val);
++ break;
++ case 9: // CTR
++ setCTR(val);
++ break;
++ case 1: // XER
++ setXER(U64(val));
++ break;
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("mtspr: unhandled SPR %u", spr);
++ }
++ break;
++ }
++ case 19: {
++ // mfocrf: read one CR field selected by the FXM bitmask into RT.
++ // (Plain mfcr shares this XO with FXM=0; we model both by reading
++ // the full CR — the JIT only emits mfocrf and the bits outside the
++ // selected field are spec'd "undefined", so reading the full CR is
++ // a valid implementation.)
++ setRegister(rt, (int64_t)getCR());
++ break;
++ }
++ case 144: {
++ // mtcrf: move to CR fields
++ // FXM field is in bits 12..19.
++ uint32_t fxm = instr->bits(19, 12);
++ uint32_t rs_val = U32(getRegister(rt));
++ uint32_t cr = getCR();
++ for (int i = 0; i < 8; i++) {
++ if (fxm & (0x80 >> i)) {
++ uint32_t shift = 4 * (7 - i);
++ cr = (cr & ~(0xFu << shift)) | (rs_val & (0xFu << shift));
++ }
++ }
++ setCR(cr);
++ break;
++ }
++ case 576: {
++ // mcrxrx: move XER[OV,OV32,CA,CA32] to CR field BF
++ uint32_t bf = instr->bfValue();
++ uint8_t field = 0;
++ if (getXEROV()) field |= 0x8;
++ // OV32 at bit 19 of XER
++ if ((getXER() >> kXEROV32Bit) & 1) field |= 0x4;
++ if (getXERCA()) field |= 0x2;
++ if ((getXER() >> kXERCA32Bit) & 1) field |= 0x1;
++ setCRField(bf, field);
++ break;
++ }
++ case 384:
++ case 416: {
++ // POWER10 setbc/setbcr: RT = (CR[BI]==N) ? 1 : 0
++ // BI at bits 11..15; xo=384 (setbc, N=1), xo=416 (setbcr, N=0).
++ uint32_t bi = instr->raValue();
++ uint32_t crField = bi / 4;
++ uint32_t crBit = bi % 4;
++ uint8_t crFieldVal = getCRField(crField);
++ bool bitSet;
++ switch (crBit) {
++ case 0: bitSet = (crFieldVal & kCRFieldLT) != 0; break;
++ case 1: bitSet = (crFieldVal & kCRFieldGT) != 0; break;
++ case 2: bitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++ case 3: bitSet = (crFieldVal & kCRFieldSO) != 0; break;
++ default: bitSet = false; break;
++ }
++ bool want = (xo == 384) ? bitSet : !bitSet;
++ setRegister(rt, want ? 1 : 0);
++ break;
++ }
++
++ // --- Indexed loads ---
++ case 21: {
++ // ldx: RT = [RA|0 + RB], 8 bytes
++ uint64_t ea = XFormEA(this, instr);
++ setRegister(rt, readDW(ea, instr));
++ break;
++ }
++ case 53: {
++ // ldux: RT = [RA + RB], update RA
++ uint64_t ea = XFormEAUpdate(this, instr);
++ setRegister(rt, readDW(ea, instr));
++ setRegister(ra, ea);
++ break;
++ }
++ case 23: {
++ // lwzx: RT = zero_ext([RA|0 + RB], 4 bytes)
++ uint64_t ea = XFormEA(this, instr);
++ setRegister(rt, U64(readWU(ea, instr)));
++ break;
++ }
++ case 341: {
++ // lwax: RT = sign_ext([RA|0 + RB], 4 bytes)
++ uint64_t ea = XFormEA(this, instr);
++ setRegister(rt, (int64_t)readW(ea, instr));
++ break;
++ }
++ case 87: {
++ // lbzx
++ uint64_t ea = XFormEA(this, instr);
++ setRegister(rt, U64(readBU(ea)));
++ break;
++ }
++ case 279: {
++ // lhzx
++ uint64_t ea = XFormEA(this, instr);
++ setRegister(rt, U64(readHU(ea, instr)));
++ break;
++ }
++ case 343: {
++ // lhax
++ uint64_t ea = XFormEA(this, instr);
++ setRegister(rt, (int64_t)readH(ea, instr));
++ break;
++ }
++ case 535: {
++ // lfsx: load float single indexed, widen to double (NaN-preserving)
++ uint64_t ea = XFormEA(this, instr);
++ if (!handleWasmSegFault(ea, 4)) {
++ float val = *reinterpret_cast<float*>(ea);
++ setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++ }
++ break;
++ }
++ case 599: {
++ // lfdx: load float double indexed
++ uint64_t ea = XFormEA(this, instr);
++ setFpuRegisterDouble(rt, readD(ea, instr));
++ break;
++ }
++ case 855: {
++ // lfiwax: load float as integer word algebraic
++ uint64_t ea = XFormEA(this, instr);
++ int32_t val = readW(ea, instr);
++ setFpuRegister(rt, (int64_t)val);
++ break;
++ }
++ case 887: {
++ // lfiwzx: load float as integer word zero
++ uint64_t ea = XFormEA(this, instr);
++ uint32_t val = readWU(ea, instr);
++ setFpuRegister(rt, (int64_t)(uint64_t)val);
++ break;
++ }
++
++ // --- Indexed stores ---
++ case 149: {
++ // stdx
++ uint64_t ea = XFormEA(this, instr);
++ writeDW(ea, getRegister(rt), instr);
++ break;
++ }
++ case 151: {
++ // stwx
++ uint64_t ea = XFormEA(this, instr);
++ writeW(ea, I32(getRegister(rt)), instr);
++ break;
++ }
++ case 215: {
++ // stbx
++ uint64_t ea = XFormEA(this, instr);
++ writeB(ea, (uint8_t)(getRegister(rt) & 0xFF));
++ break;
++ }
++ case 407: {
++ // sthx
++ uint64_t ea = XFormEA(this, instr);
++ writeH(ea, U16(getRegister(rt)), instr);
++ break;
++ }
++ case 663: {
++ // stfsx: store float single indexed (NaN-preserving)
++ uint64_t ea = XFormEA(this, instr);
++ if (!handleWasmSegFault(ea, 4)) {
++ float fval = demoteDoublePreservingNaN(getFpuRegisterDouble(rt));
++ *reinterpret_cast<float*>(ea) = fval;
++ LLBit_ = false;
++ }
++ break;
++ }
++ case 727: {
++ // stfdx: store float double indexed
++ uint64_t ea = XFormEA(this, instr);
++ writeD(ea, getFpuRegisterDouble(rt), instr);
++ break;
++ }
++
++ // --- Byte-reversed stores ---
++ case 662: {
++ // stwbrx
++ uint64_t ea = XFormEA(this, instr);
++ uint32_t val = U32(getRegister(rt));
++ writeW(ea, (int32_t)__builtin_bswap32(val), instr);
++ break;
++ }
++
++ // --- Atomic load/store ---
++ //
++ // Load-reserve and store-conditional. Sub-word variants
++ // (lbarx/lharx/stbcx./sthcx.) were added in ISA v2.06 (POWER7+).
++ // Word/doubleword variants (lwarx/stwcx./ldarx/stdcx.) go back
++ // to the base ISA.
++ case 52: {
++ // lbarx RT, RA, RB, EH
++ uint64_t ea = XFormEA(this, instr);
++ uint8_t val = loadLinkedB(ea, instr);
++ setRegister(rt, (int64_t)val);
++ break;
++ }
++ case 116: {
++ // lharx RT, RA, RB, EH
++ uint64_t ea = XFormEA(this, instr);
++ uint16_t val = loadLinkedH(ea, instr);
++ setRegister(rt, (int64_t)val);
++ break;
++ }
++ case 694: {
++ // stbcx. RS, RA, RB: always Rc=1.
++ uint64_t ea = XFormEA(this, instr);
++ uint8_t val = uint8_t(getRegister(rt));
++ int result = storeConditionalB(ea, val, instr);
++ if (result) {
++ setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++ } else {
++ setCRField(0, kCRFieldSO * getXERSO());
++ }
++ break;
++ }
++ case 726: {
++ // sthcx. RS, RA, RB: always Rc=1.
++ uint64_t ea = XFormEA(this, instr);
++ uint16_t val = uint16_t(getRegister(rt));
++ int result = storeConditionalH(ea, val, instr);
++ if (result) {
++ setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++ } else {
++ setCRField(0, kCRFieldSO * getXERSO());
++ }
++ break;
++ }
++ case 20: {
++ // lwarx
++ uint64_t ea = XFormEA(this, instr);
++ int32_t val = loadLinkedW(ea, instr);
++ setRegister(rt, (int64_t)val);
++ break;
++ }
++ case 150: {
++ // stwcx.
++ uint64_t ea = XFormEA(this, instr);
++ int32_t val = I32(getRegister(rt));
++ int result = storeConditionalW(ea, val, instr);
++ // stwcx. always updates CR0: EQ if store succeeded, else clear.
++ if (result) {
++ setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++ } else {
++ setCRField(0, kCRFieldSO * getXERSO());
++ }
++ break;
++ }
++ case 84: {
++ // ldarx
++ uint64_t ea = XFormEA(this, instr);
++ int64_t val = loadLinkedD(ea, instr);
++ setRegister(rt, val);
++ break;
++ }
++ case 214: {
++ // stdcx.
++ uint64_t ea = XFormEA(this, instr);
++ int64_t val = getRegister(rt);
++ int result = storeConditionalD(ea, val, instr);
++ if (result) {
++ setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
++ } else {
++ setCRField(0, kCRFieldSO * getXERSO());
++ }
++ break;
++ }
++
++ // --- Synchronization ---
++ case 598:
++ // sync / lwsync / ptesync: no-op in simulator
++ break;
++ case 854:
++ // eieio: no-op in simulator
++ break;
++
++ // --- GPR <-> VSR move (major opcode 31, XX1-form) ---
++ //
++ // Two sub-encodings:
++ // mtvsr* XT,RA{,RB}: XX1Form — XT at bits 25:21 (5) + TX at bit 0 (1);
++ // RA at bits 20:16; RB (if any) at bits 15:11.
++ // mfvsr* RA,XS: XX1FormMfvsr — XS at bits 25:21 (5) + SX at bit 0 (1);
++ // RA (GPR dest) at bits 20:16.
++ //
++ // The original decoder treated "rsValue()" (bits 25:21 = VSR field) as a
++ // GPR index — doubly wrong: the GPR side lives at bits 20:16 (= raValue())
++ // and the VSR side is 6 bits (5-bit field + extension bit at bit 0). Fixed
++ // here and extended for the full VSR namespace (0-63).
++ // The ISA names each field in BE. "XT.DW0" is the BE doubleword which on
++ // PPC64LE register storage lives at LE bytes 8-15 (our bytes[] is LE-natural:
++ // bytes[0] = lowest address). With `mtvsrd / mfvsrd / mtvsrdd / mfvsrld
++ // / stxvx`: mtvsrd of 0x1122334455667788 produces `00 00 00 00 00 00 00 00
++ // 88 77 66 55 44 33 22 11` in memory (LE bytes 8-15 hold the GPR bits with
++ // LSB at byte 8). Matching semantics here means the sim respects
++ // the full Power ISA, not a self-consistent LE-reversed
++ // convention.
++ case 51: {
++ // mfvsrd RA, XS: GPR[RA] = XS.DW0 = LE bytes 8..15.
++ int xs = int(instr->rtValue() | (instr->bit(0) << 5)); // T + SX(TX)
++ uint8_t bytes[16];
++ getVSR128(xs, bytes);
++ int64_t val;
++ memcpy(&val, bytes + 8, 8);
++ setRegister(instr->raValue(), val);
++ break;
++ }
++ case 211: {
++ // mtvsrwa XT, RA: XT.DW0 = sign_ext_64(RA[32:63]); XT.DW1 = 0.
++ // POWER8+ (ISA 2.07). Combines extsw + mtvsrd. LE layout: bytes
++ // 8-15 ← sign-extended low 32 of RA; bytes 0-7 ← 0.
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t bytes[16];
++ int64_t val = (int64_t)(int32_t)getRegister(instr->raValue());
++ memset(bytes, 0, 8);
++ memcpy(bytes + 8, &val, 8);
++ setVSR128(xt, bytes);
++ break;
++ }
++ case 179: {
++ // mtvsrd XT, RA: XT.DW0 = RA; XT.DW1 = 0.
++ // LE layout: bytes 8-15 ← RA, bytes 0-7 ← 0.
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t bytes[16];
++ int64_t val = getRegister(instr->raValue());
++ memset(bytes, 0, 8);
++ memcpy(bytes + 8, &val, 8);
++ setVSR128(xt, bytes);
++ break;
++ }
++ case 243: {
++ // mtvsrwz XT, RA: XT.DW0 = zero_ext(RA[32:63]); XT.DW1 = 0.
++ // The 32-bit value lives in the low 32 bits of DW0 = BE word 1,
++ // which on LE storage is LE bytes 8..11 (LE word 2); LE bytes
++ // 12..15 = 0 (upper half of DW0 = BE word 0 = zero-extended).
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t bytes[16];
++ uint32_t lo = U32(getRegister(instr->raValue()));
++ memset(bytes, 0, 16);
++ bytes[8] = (uint8_t)(lo);
++ bytes[9] = (uint8_t)(lo >> 8);
++ bytes[10] = (uint8_t)(lo >> 16);
++ bytes[11] = (uint8_t)(lo >> 24);
++ setVSR128(xt, bytes);
++ break;
++ }
++ case 307: {
++ // mfvsrld RA, XS: GPR[RA] = XS.DW1 = LE bytes 0..7.
++ // POWER9.
++ int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t bytes[16];
++ getVSR128(xs, bytes);
++ int64_t val;
++ memcpy(&val, bytes, 8);
++ setRegister(instr->raValue(), val);
++ break;
++ }
++ case 403: {
++ // mtvsrws XT, RA (POWER9): splat low 32 bits of RA into all four
++ // word elements of XT. The same 32-bit value appears in lanes 0..3,
++ // so the byte layout is identical in LE and BE —
++ // bytes 0..15 = lo | lo | lo | lo.
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t bytes[16];
++ uint32_t lo = U32(getRegister(instr->raValue()));
++ uint64_t val = ((uint64_t)lo << 32) | lo;
++ memcpy(bytes, &val, 8);
++ memcpy(bytes + 8, &val, 8);
++ setVSR128(xt, bytes);
++ break;
++ }
++ case 435: {
++ // mtvsrdd XT, RA, RB: XT.DW0 = RA; XT.DW1 = RB. POWER9.
++ // LE: bytes 8-15 ← RA, bytes 0-7 ← RB.
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t bytes[16];
++ int64_t dw0 = getRegister(instr->raValue());
++ int64_t dw1 = getRegister(instr->rbValue());
++ memcpy(bytes, &dw1, 8);
++ memcpy(bytes + 8, &dw0, 8);
++ setVSR128(xt, bytes);
++ break;
++ }
++
++ // --- VMX vector memory (major opcode 31) ---
++ //
++ // lvx / stvx / lvxl / stvxl.
++ // EA = (RA|0) + RB; EA = EA & ~0xF (alignment)
++ // lvx: VRT[0:127] <- MEM(EA, 16) bytes[0] = *(EA+0)
++ // stvx: MEM(EA, 16) <- VRS[0:127] *(EA+0) = bytes[0]
++ // lvxl / stvxl are identical in effect to lvx / stvx (the "l" form
++ // hints "least recently used"; semantically indistinguishable).
++ case 103: {
++ // lvx: VRT = MEM(EA & ~0xF, 16 bytes)
++ uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++ if (handleWasmSegFault(ea, 16)) break;
++ memcpy(VRregisters_[rt], reinterpret_cast<const void*>(ea), 16);
++ break;
++ }
++ case 231: {
++ // stvx: MEM(EA & ~0xF, 16 bytes) = VRS
++ uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++ if (handleWasmSegFault(ea, 16)) break;
++ memcpy(reinterpret_cast<void*>(ea), VRregisters_[rt], 16);
++ break;
++ }
++ case 359: {
++ // lvxl: semantically identical to lvx
++ uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++ if (handleWasmSegFault(ea, 16)) break;
++ memcpy(VRregisters_[rt], reinterpret_cast<const void*>(ea), 16);
++ break;
++ }
++ case 487: {
++ // stvxl: semantically identical to stvx
++ uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
++ if (handleWasmSegFault(ea, 16)) break;
++ memcpy(reinterpret_cast<void*>(ea), VRregisters_[rt], 16);
++ break;
++ }
++
++ // --- VSX vector memory indexed (major opcode 31) ---
++ //
++ // These ops take a 6-bit VSR register,
++ // encoded as 5-bit T/S + 1-bit TX/SX extension at instruction LSB
++ // bit 0 (= our instr->bit(0)). EA = (RA|0) + RB. 16-byte access,
++ // not forced-aligned (hardware may handle misaligned via sub-access
++ // or alignment interrupt per impl).
++ //
++ // Byte-order note: lxvx/stxvx perform a natural 16-byte LE
++ // memcpy. lxvd2x/stxvd2x on real PPC64 LE hardware load/store
++ // doublewords in BE-pair order — i.e. lxvd2x places memory bytes
++ // 0-7 in the register's BE-DW0 (= LE bytes 8-15) and bytes 8-15
++ // in BE-DW1 (= LE bytes 0-7). The JIT brackets every wasm SIMD
++ // load/store with a compensating xxpermdi DM=2 so the net effect
++ // is a natural LE byte order. The constant pool emits the same
++ // lxvd2x + xxpermdi sequence (per PatchConstantPoolLoad) but
++ // assumes the hardware semantics, not a plain memcpy. So the sim
++ // must match real-hardware lxvd2x/stxvd2x semantics including the
++ // BE-DW byte order — otherwise the post-load xxpermdi unswaps
++ // bytes that were never swapped, and constant-pool Simd128 loads
++ // (e.g. shuffle masks) come out with halves transposed.
++ case 268: {
++ // lxvx: XT = MEM((RA|0)+RB, 16)
++ uint64_t ea = XFormEA(this, instr);
++ if (handleWasmSegFault(ea, 16)) break;
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t buf[16];
++ memcpy(buf, reinterpret_cast<const void*>(ea), 16);
++ setVSR128(xt, buf);
++ break;
++ }
++ case 396: {
++ // stxvx: MEM((RA|0)+RB, 16) = XS
++ uint64_t ea = XFormEA(this, instr);
++ if (handleWasmSegFault(ea, 16)) break;
++ int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t buf[16];
++ getVSR128(xs, buf);
++ memcpy(reinterpret_cast<void*>(ea), buf, 16);
++ break;
++ }
++ case 813: {
++ // lxsihzx XT, RA, RB: P9 (ISA 3.0). Load halfword to VSR & zero,
++ // indexed. MEM(EA, 2) (LE-natural halfword) is placed in dw[0]
++ // low 16 bits; the rest of the VSR is zeroed. In sim LE-byte
++ // storage, that is bytes[8..9] (low byte at bytes[8]).
++ uint64_t ea = XFormEA(this, instr);
++ if (handleWasmSegFault(ea, 2)) break;
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint16_t halfword = readH(ea, instr);
++ uint8_t buf[16];
++ memset(buf, 0, 16);
++ buf[8] = (uint8_t)(halfword & 0xFF);
++ buf[9] = (uint8_t)((halfword >> 8) & 0xFF);
++ setVSR128(xt, buf);
++ break;
++ }
++ case 941: {
++ // stxsihx XS, RA, RB: P9 (ISA 3.0). Store halfword from VSR,
++ // indexed. dw[0] low 16 bits (sim bytes[8..9] in host-LE order)
++ // are written as a halfword at MEM(EA, 2).
++ uint64_t ea = XFormEA(this, instr);
++ if (handleWasmSegFault(ea, 2)) break;
++ int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t buf[16];
++ getVSR128(xs, buf);
++ uint16_t halfword =
++ (uint16_t)buf[8] | ((uint16_t)buf[9] << 8);
++ writeH(ea, halfword, instr);
++ break;
++ }
++ case 844: {
++ // lxvd2x: XT = MEM((RA|0)+RB, 16) with BE-DW byte ordering.
++ // Memory bytes 0-7 land in BE-DW0 (= LE bytes 8-15); memory
++ // bytes 8-15 land in BE-DW1 (= LE bytes 0-7).
++ uint64_t ea = XFormEA(this, instr);
++ if (handleWasmSegFault(ea, 16)) break;
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t mem[16], buf[16];
++ memcpy(mem, reinterpret_cast<const void*>(ea), 16);
++ memcpy(buf, mem + 8, 8);
++ memcpy(buf + 8, mem, 8);
++ setVSR128(xt, buf);
++ break;
++ }
++ case 972: {
++ // stxvd2x: MEM((RA|0)+RB, 16) = XS with BE-DW byte ordering.
++ // Inverse of lxvd2x: register LE bytes 0-7 → memory bytes 8-15;
++ // LE bytes 8-15 → memory bytes 0-7.
++ uint64_t ea = XFormEA(this, instr);
++ if (handleWasmSegFault(ea, 16)) break;
++ int xs = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t buf[16], mem[16];
++ getVSR128(xs, buf);
++ memcpy(mem, buf + 8, 8);
++ memcpy(mem + 8, buf, 8);
++ memcpy(reinterpret_cast<void*>(ea), mem, 16);
++ break;
++ }
++
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodeXForm: unimplemented XO=%u (instruction 0x%08x)", xo,
++ instr->instructionBits());
++ }
++}
++
++// -----------------------------------------------------------------------------
++// decodeRotateMask: rlwinm(21), rlwnm(23), rlwimi(20),
++// rldicl(30), rldicr(30), rldic(30), rldimi(30), rldcl(30), rldcr(30)
++
++void Simulator::decodeRotateMask(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++
++ if (opcode == 21) {
++ // rlwinm: RA = ROTL32(RS,SH) & MASK(MB,ME), Rc
++ uint32_t rs_val = U32(getRegister(instr->rsValue()));
++ uint32_t sh = instr->mSHValue();
++ uint32_t mb = instr->mMBValue();
++ uint32_t me = instr->mMEValue();
++ uint32_t rotated = RotateLeft32(rs_val, sh);
++ uint32_t mask = MASK32(mb, me);
++ int64_t result = (int64_t)(uint64_t)(rotated & mask);
++ setRegister(instr->raValue(), result);
++ if (instr->rcBit()) updateCR0(result);
++ } else if (opcode == 23) {
++ // rlwnm: RA = ROTL32(RS,RB[27:31]) & MASK(MB,ME), Rc
++ uint32_t rs_val = U32(getRegister(instr->rsValue()));
++ uint32_t sh = U32(getRegister(instr->rbValue())) & 0x1F;
++ uint32_t mb = instr->mMBValue();
++ uint32_t me = instr->mMEValue();
++ uint32_t rotated = RotateLeft32(rs_val, sh);
++ uint32_t mask = MASK32(mb, me);
++ int64_t result = (int64_t)(uint64_t)(rotated & mask);
++ setRegister(instr->raValue(), result);
++ if (instr->rcBit()) updateCR0(result);
++ } else if (opcode == 20) {
++ // rlwimi: RA = (ROTL32(RS,SH) & MASK) | (RA & ~MASK), Rc
++ uint32_t rs_val = U32(getRegister(instr->rsValue()));
++ uint32_t sh = instr->mSHValue();
++ uint32_t mb = instr->mMBValue();
++ uint32_t me = instr->mMEValue();
++ uint32_t rotated = RotateLeft32(rs_val, sh);
++ uint32_t mask = MASK32(mb, me);
++ uint32_t ra_val = U32(getRegister(instr->raValue()));
++ int64_t result = (int64_t)(uint64_t)((rotated & mask) | (ra_val & ~mask));
++ setRegister(instr->raValue(), result);
++ if (instr->rcBit()) updateCR0(result);
++ } else if (opcode == 30) {
++ // MD-form / MDS-form: 64-bit rotate/mask
++ uint32_t rs = instr->rsValue();
++ uint64_t rs_val = U64(getRegister(rs));
++ uint32_t ra_reg = instr->raValue();
++
++ // Determine which sub-opcode: bits 2..4 for MD-form, bit 4 for MDS.
++ // MD: bits 2..4
++ // MDS: bit 4 (rldcl has bit4=0, bit3..2=00 with bit1=1; rldcr has
++ // bit4=0, bit3..2=01 with bit1=1). Actually:
++ // rldicl: 30 | MD-XO=0 (bits 2..4 = 000), bit1=0
++ // rldicr: 30 | MD-XO=1 (bits 2..4 = 001), bit1=0
++ // rldic: 30 | MD-XO=2 (bits 2..4 = 010), bit1=0
++ // rldimi: 30 | MD-XO=3 (bits 2..4 = 011), bit1=0
++ // rldcl: 30 | MDS, bit4=0, bit3..1=000, bit0=Rc => bits 1..4=1000
++ // Actually rldcl: bits 1..4 = 1000, i.e. bit(4)=1,bit(3)=0,
++ // bit(2)=0,bit(1)=0
++ // rldcr: 30 | MDS, bits 1..4 = 1001
++ //
++ // Let's check bit 4 first: if bit(4)==1, it's MDS-form (rldcl/rldcr).
++ if (instr->bit(4)) {
++ // MDS-form: shift amount from RB register
++ uint32_t sh = U32(getRegister(instr->rbValue())) & 0x3F;
++ uint64_t rotated = RotateLeft64(rs_val, sh);
++ uint32_t mb = instr->mdsMBValue();
++
++ if (!instr->bit(1)) {
++ // rldcl: RA = ROTL64(RS, RB[58:63]) & MASK(mb, 63)
++ uint64_t mask = MASK64(mb, 63);
++ int64_t result = I64(rotated & mask);
++ setRegister(ra_reg, result);
++ if (instr->rcBit()) updateCR0(result);
++ } else {
++ // rldcr: RA = ROTL64(RS, RB[58:63]) & MASK(0, me)
++ uint32_t me = instr->mdsMBValue();
++ uint64_t mask = MASK64(0, me);
++ int64_t result = I64(rotated & mask);
++ setRegister(ra_reg, result);
++ if (instr->rcBit()) updateCR0(result);
++ }
++ } else {
++ // MD-form
++ uint32_t sh = instr->mdSHValue();
++ uint64_t rotated = RotateLeft64(rs_val, sh);
++ uint32_t xo_md = instr->bits(3, 2);
++
++ switch (xo_md) {
++ case 0: {
++ // rldicl: RA = ROTL64(RS, SH) & MASK(mb, 63)
++ uint32_t mb = instr->mdMBValue();
++ uint64_t mask = MASK64(mb, 63);
++ int64_t result = I64(rotated & mask);
++ setRegister(ra_reg, result);
++ if (instr->rcBit()) updateCR0(result);
++ break;
++ }
++ case 1: {
++ // rldicr: RA = ROTL64(RS, SH) & MASK(0, me)
++ uint32_t me = instr->mdMEValue();
++ uint64_t mask = MASK64(0, me);
++ int64_t result = I64(rotated & mask);
++ setRegister(ra_reg, result);
++ if (instr->rcBit()) updateCR0(result);
++ break;
++ }
++ case 2: {
++ // rldic: RA = ROTL64(RS, SH) & MASK(mb, ~SH)
++ // Actually: MASK(mb, 63-SH)
++ uint32_t mb = instr->mdMBValue();
++ uint64_t mask = MASK64(mb, 63 - sh);
++ int64_t result = I64(rotated & mask);
++ setRegister(ra_reg, result);
++ if (instr->rcBit()) updateCR0(result);
++ break;
++ }
++ case 3: {
++ // rldimi: RA = (ROTL64(RS,SH) & MASK) | (RA & ~MASK)
++ uint32_t mb = instr->mdMBValue();
++ uint64_t mask = MASK64(mb, 63 - sh);
++ uint64_t ra_val = U64(getRegister(ra_reg));
++ int64_t result = I64((rotated & mask) | (ra_val & ~mask));
++ setRegister(ra_reg, result);
++ if (instr->rcBit()) updateCR0(result);
++ break;
++ }
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeRotateMask: MD xo=%u", xo_md);
++ }
++ }
++ } else {
++ MOZ_CRASH_UNSAFE_PRINTF("decodeRotateMask: opcode=%u", opcode);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// CR-bit accessors used by the XL-form CR-logic ops (crand, crandc, cror,
++// crorc, crxor, creqv). Bit index is in BIF*4+x form: field=b/4, bit=b%4
++// where 0=LT, 1=GT, 2=EQ, 3=SO.
++static inline uint8_t CRBitMask(uint32_t bitInField) {
++ switch (bitInField) {
++ case 0: return kCRFieldLT;
++ case 1: return kCRFieldGT;
++ case 2: return kCRFieldEQ;
++ case 3: return kCRFieldSO;
++ }
++ return 0;
++}
++
++static inline bool GetCRBit(Simulator& s, uint32_t b) {
++ return (s.getCRField(b / 4) & CRBitMask(b % 4)) != 0;
++}
++
++static inline void SetCRBit(Simulator& s, uint32_t b, bool val) {
++ uint8_t fv = s.getCRField(b / 4);
++ uint8_t mask = CRBitMask(b % 4);
++ s.setCRField(b / 4, val ? (fv | mask) : (fv & ~mask));
++}
++
++// -----------------------------------------------------------------------------
++// decodeBranch: b(18), bc(16), XL-form(19)
++
++void Simulator::decodeBranch(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++
++ if (opcode == 18) {
++ // b / bl: I-form unconditional branch
++ int32_t offset = instr->li26Value();
++ bool lk = instr->lkBit();
++ bool aa = instr->aaBit();
++
++ int64_t target;
++ if (aa) {
++ target = (int64_t)offset;
++ } else {
++ target = get_pc() + (int64_t)offset;
++ }
++
++ if (lk) {
++ setLR(get_pc() + SimInstruction::kInstrSize);
++ }
++
++ set_pc(target);
++ return;
++ }
++
++ if (opcode == 16) {
++ // bc / bcl: B-form conditional branch
++ uint32_t bo = instr->boValue();
++ uint32_t bi = instr->biValue();
++ int32_t bd = instr->bd16Value();
++ bool lk = instr->lkBit();
++ bool aa = instr->aaBit();
++
++ // Decrement CTR if BO[2] (bit 2 of BO, which is bo & 0x04) is clear.
++ if (!(bo & 0x04)) {
++ setCTR(getCTR() - 1);
++ }
++
++ // Evaluate CTR condition.
++ bool ctr_ok = (bo & 0x04) ||
++ ((getCTR() != 0) ^ ((bo & 0x02) != 0));
++
++ // Evaluate CR condition.
++ uint32_t crField = bi / 4;
++ uint32_t crBit = bi % 4;
++ uint8_t crFieldVal = getCRField(crField);
++ bool crBitSet;
++ switch (crBit) {
++ case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
++ case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
++ case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++ case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
++ default: crBitSet = false; break;
++ }
++ bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
++
++ if (ctr_ok && cond_ok) {
++ int64_t target;
++ if (aa) {
++ target = (int64_t)bd;
++ } else {
++ target = get_pc() + (int64_t)bd;
++ }
++ if (lk) {
++ setLR(get_pc() + SimInstruction::kInstrSize);
++ }
++ set_pc(target);
++ } else {
++ // Branch not taken.
++ set_pc(get_pc() + SimInstruction::kInstrSize);
++ }
++ return;
++ }
++
++ if (opcode == 19) {
++ // XL-form: bclr, bcctr, crand, crandc, cror, crorc, crxor, creqv,
++ // mcrf, isync
++ uint32_t xl = instr->xlValue();
++
++ switch (xl) {
++ case 16: {
++ // bclr: conditional branch to LR
++ uint32_t bo = instr->boValue();
++ uint32_t bi = instr->biValue();
++ bool lk = instr->lkBit();
++
++ if (!(bo & 0x04)) {
++ setCTR(getCTR() - 1);
++ }
++
++ bool ctr_ok = (bo & 0x04) ||
++ ((getCTR() != 0) ^ ((bo & 0x02) != 0));
++
++ uint32_t crField = bi / 4;
++ uint32_t crBit = bi % 4;
++ uint8_t crFieldVal = getCRField(crField);
++ bool crBitSet;
++ switch (crBit) {
++ case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
++ case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
++ case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++ case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
++ default: crBitSet = false; break;
++ }
++ bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
++
++ if (ctr_ok && cond_ok) {
++ int64_t target = getLR() & ~3LL;
++ if (lk) {
++ setLR(get_pc() + SimInstruction::kInstrSize);
++ }
++ set_pc(target);
++ } else {
++ set_pc(get_pc() + SimInstruction::kInstrSize);
++ }
++ break;
++ }
++ case 528: {
++ // bcctr: conditional branch to CTR
++ uint32_t bo = instr->boValue();
++ uint32_t bi = instr->biValue();
++ bool lk = instr->lkBit();
++
++ // CTR is not decremented for bcctr.
++ uint32_t crField = bi / 4;
++ uint32_t crBit = bi % 4;
++ uint8_t crFieldVal = getCRField(crField);
++ bool crBitSet;
++ switch (crBit) {
++ case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
++ case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
++ case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
++ case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
++ default: crBitSet = false; break;
++ }
++ bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
++
++ if (cond_ok) {
++ int64_t target = getCTR() & ~3LL;
++ if (lk) {
++ setLR(get_pc() + SimInstruction::kInstrSize);
++ }
++ set_pc(target);
++ } else {
++ set_pc(get_pc() + SimInstruction::kInstrSize);
++ }
++ break;
++ }
++ case 257: {
++ // crand: CR[BT] = CR[BA] & CR[BB]
++ uint32_t bt = instr->rtValue();
++ uint32_t ba = instr->raValue();
++ uint32_t bb = instr->rbValue();
++ SetCRBit(*this, bt, GetCRBit(*this, ba) && GetCRBit(*this, bb));
++ break;
++ }
++ case 129: {
++ // crandc: CR[BT] = CR[BA] & ~CR[BB]
++ uint32_t bt = instr->rtValue();
++ uint32_t ba = instr->raValue();
++ uint32_t bb = instr->rbValue();
++ SetCRBit(*this, bt, GetCRBit(*this, ba) && !GetCRBit(*this, bb));
++ break;
++ }
++ case 449: {
++ // cror: CR[BT] = CR[BA] | CR[BB]
++ uint32_t bt = instr->rtValue();
++ uint32_t ba = instr->raValue();
++ uint32_t bb = instr->rbValue();
++ SetCRBit(*this, bt, GetCRBit(*this, ba) || GetCRBit(*this, bb));
++ break;
++ }
++ case 417: {
++ // crorc: CR[BT] = CR[BA] | ~CR[BB]
++ uint32_t bt = instr->rtValue();
++ uint32_t ba = instr->raValue();
++ uint32_t bb = instr->rbValue();
++ SetCRBit(*this, bt, GetCRBit(*this, ba) || !GetCRBit(*this, bb));
++ break;
++ }
++ case 193: {
++ // crxor: CR[BT] = CR[BA] ^ CR[BB]
++ uint32_t bt = instr->rtValue();
++ uint32_t ba = instr->raValue();
++ uint32_t bb = instr->rbValue();
++ SetCRBit(*this, bt, GetCRBit(*this, ba) ^ GetCRBit(*this, bb));
++ break;
++ }
++ case 289: {
++ // creqv: CR[BT] = ~(CR[BA] ^ CR[BB])
++ uint32_t bt = instr->rtValue();
++ uint32_t ba = instr->raValue();
++ uint32_t bb = instr->rbValue();
++ SetCRBit(*this, bt, !(GetCRBit(*this, ba) ^ GetCRBit(*this, bb)));
++ break;
++ }
++ case 150: {
++ // isync: no-op in simulator
++ break;
++ }
++ case 370: {
++ // PPC_stop (0x4C0002E4) decoded as XL-form opcode 19, XL=370.
++ // This is our kCallRedirInstr. Handle via softwareInterrupt.
++ softwareInterrupt(instr);
++ break;
++ }
++ case 2: {
++ // POWER9 addpcis rT, D (DX-form). Computes rT = (CIA + 4) +
++ // (sext16(D) << 16). The 16-bit signed displacement D is split
++ // across three sub-fields:
++ // d0 = bits LE 6..15 (10 bits) — D[15:6]
++ // d1 = bits LE 16..20 (5 bits) — D[5:1]
++ // d2 = bit LE 0 (1 bit) — D[0]
++ // (Mirrors the encoder in Assembler-ppc64.cpp:as_addpcis.)
++ uint32_t rt = instr->rtValue();
++ uint32_t d0 = instr->bits(15, 6);
++ uint32_t d1 = instr->bits(20, 16);
++ uint32_t d2 = instr->bit(0);
++ int16_t D = (int16_t)((d0 << 6) | (d1 << 1) | d2);
++ int64_t cia = reinterpret_cast<int64_t>(instr);
++ setRegister(rt, cia + SimInstruction::kInstrSize +
++ (static_cast<int64_t>(D) << 16));
++ break;
++ }
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeBranch: XL opcode 19, xl=%u", xl);
++ }
++ return;
++ }
++
++ MOZ_CRASH_UNSAFE_PRINTF("decodeBranch: opcode=%u", opcode);
++}
++
++// -----------------------------------------------------------------------------
++// decodeFP: Major opcodes 59 (A-form single) and 63 (X-form / A-form double)
++
++void Simulator::decodeFP(SimInstruction* instr) {
++ uint32_t opcode = instr->opcode();
++ uint32_t rt = instr->rtValue(); // FRT
++ uint32_t ra = instr->raValue(); // FRA
++ uint32_t rb = instr->rbValue(); // FRB
++ uint32_t rc_reg = instr->rcValue(); // FRC (A-form)
++
++ if (opcode == 63) {
++ // X-form and A-form double-precision instructions.
++ // For A-form, the sub-opcode is in bits 1..5.
++ // For X-form, the sub-opcode is in bits 1..10.
++ uint32_t xo_a = instr->bits(5, 1); // A-form sub-opcode
++ uint32_t xo_x = instr->bits(10, 1); // X-form sub-opcode
++
++ // Try A-form first (5-bit sub-opcode in bits 1..5).
++ switch (xo_a) {
++ case 21: {
++ // fadd
++ double result = getFpuRegisterDouble(ra) + getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 20: {
++ // fsub
++ double result = getFpuRegisterDouble(ra) - getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 25: {
++ // fmul: FRT = FRA * FRC (note: FRC, not FRB!)
++ double result = getFpuRegisterDouble(ra) * getFpuRegisterDouble(rc_reg);
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 18: {
++ // fdiv
++ double result = getFpuRegisterDouble(ra) / getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 22: {
++ // fsqrt
++ double result = sqrt(getFpuRegisterDouble(rb));
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 29: {
++ // fmadd: FRT = FRA * FRC + FRB
++ double result = std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ getFpuRegisterDouble(rb));
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 30: {
++ // fnmsub: FRT = -(FRA * FRC - FRB)
++ double result = -(std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ -getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 28: {
++ // fmsub: FRT = FRA * FRC - FRB
++ double result = std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ -getFpuRegisterDouble(rb));
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 31: {
++ // fnmadd: FRT = -(FRA * FRC + FRB)
++ double result = -(std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ case 23: {
++ // fsel: FRT = (FRA >= 0) ? FRC : FRB
++ double fra = getFpuRegisterDouble(ra);
++ setFpuRegisterDouble(rt, (fra >= 0.0) ? getFpuRegisterDouble(rc_reg)
++ : getFpuRegisterDouble(rb));
++ return;
++ }
++ case 26: {
++ // frsqrte: FRT = 1.0 / sqrt(FRB) (estimate)
++ double result = 1.0 / sqrt(getFpuRegisterDouble(rb));
++ setFpuRegisterDouble(rt, result);
++ return;
++ }
++ }
++
++ // X-form (10-bit sub-opcode).
++ switch (xo_x) {
++ case 72: {
++ // fmr: FRT = FRB
++ setFpuRegisterDouble(rt, getFpuRegisterDouble(rb));
++ break;
++ }
++ case 40: {
++ // fneg: FRT = -FRB
++ setFpuRegisterDouble(rt, -getFpuRegisterDouble(rb));
++ break;
++ }
++ case 264: {
++ // fabs: FRT = |FRB|
++ setFpuRegisterDouble(rt, fabs(getFpuRegisterDouble(rb)));
++ break;
++ }
++ case 136: {
++ // fnabs: FRT = -|FRB|
++ setFpuRegisterDouble(rt, -fabs(getFpuRegisterDouble(rb)));
++ break;
++ }
++ case 8: {
++ // fcpsgn: FRT = sign(FRA) || magnitude(FRB)
++ double fra = getFpuRegisterDouble(ra);
++ double frb = getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, std::copysign(frb, fra));
++ break;
++ }
++ case 0: {
++ // fcmpu: compare FRA, FRB unordered
++ uint32_t bf = instr->bfValue();
++ double fra = getFpuRegisterDouble(ra);
++ double frb = getFpuRegisterDouble(rb);
++ uint8_t field = 0;
++ if (std::isnan(fra) || std::isnan(frb)) {
++ field = kCRFieldSO;
++ } else if (fra < frb) {
++ field = kCRFieldLT;
++ } else if (fra > frb) {
++ field = kCRFieldGT;
++ } else {
++ field = kCRFieldEQ;
++ }
++ setCRField(bf, field);
++ break;
++ }
++ case 32: {
++ // fcmpo: compare FRA, FRB ordered
++ uint32_t bf = instr->bfValue();
++ double fra = getFpuRegisterDouble(ra);
++ double frb = getFpuRegisterDouble(rb);
++ uint8_t field = 0;
++ if (std::isnan(fra) || std::isnan(frb)) {
++ field = kCRFieldSO;
++ } else if (fra < frb) {
++ field = kCRFieldLT;
++ } else if (fra > frb) {
++ field = kCRFieldGT;
++ } else {
++ field = kCRFieldEQ;
++ }
++ setCRField(bf, field);
++ break;
++ }
++ // For fctid* and fctiw* the ISA specifies that bit 23 of FPSCR (VXCVI,
++ // "invalid op for integer convert") is set when the source is NaN, +Inf,
++ // -Inf, or out of the destination's range. Wasm's out-of-range trap
++ // sequence is `mtfsb0 23; fctidz; mfvsrd; mcrfs cr0,5; bt SOBit,trap`,
++ // so the simulator MUST update VXCVI here for the trap to fire. With
++ // FPSCR_ in the low-half PPC layout (PPC bit N → int64 bit (31-N)),
++ // VXCVI lives at int64 bit (31-23) = 8.
++ case 814: {
++ // fctid: convert double to int64 (current rounding)
++ double frb = getFpuRegisterDouble(rb);
++ int64_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = INT64_MIN;
++ invalid = true;
++ } else if (frb >= -(double)INT64_MIN || frb < (double)INT64_MIN) {
++ result = (frb < 0) ? INT64_MIN : INT64_MAX;
++ invalid = true;
++ } else {
++ switch (FPSCR_ & kFPSCRRNMask) {
++ case RN: result = (int64_t)llrint(frb); break;
++ case RZ: result = (int64_t)frb; break;
++ case RP: result = (int64_t)ceil(frb); break;
++ case RM: result = (int64_t)floor(frb); break;
++ default: result = (int64_t)frb; break;
++ }
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, result);
++ break;
++ }
++ case 815: {
++ // fctidz: convert double to int64 (round toward zero)
++ double frb = getFpuRegisterDouble(rb);
++ int64_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = INT64_MIN;
++ invalid = true;
++ } else if (frb >= -(double)INT64_MIN) {
++ result = INT64_MAX;
++ invalid = true;
++ } else if (frb < (double)INT64_MIN) {
++ result = INT64_MIN;
++ invalid = true;
++ } else {
++ result = (int64_t)frb;
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, result);
++ break;
++ }
++ case 942: {
++ // fctidu: convert double to uint64 (current rounding).
++ // VXCVI is signaled when source is NaN, ±Inf, or the rounded value
++ // is outside [0, 2^64-1]. Notably,
++ // a negative source whose rounded value is 0 (e.g. -0.4 in RN, or
++ // any value in (-1, 0) in RZ) is NOT invalid.
++ double frb = getFpuRegisterDouble(rb);
++ uint64_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = 0;
++ invalid = true;
++ } else if (frb >= -2.0 * (double)INT64_MIN /* 2^64 */) {
++ result = UINT64_MAX;
++ invalid = true;
++ } else {
++ double rounded;
++ switch (FPSCR_ & kFPSCRRNMask) {
++ case RN: rounded = nearbyint(frb); break;
++ case RZ: rounded = trunc(frb); break;
++ case RP: rounded = ceil(frb); break;
++ case RM: rounded = floor(frb); break;
++ default: rounded = trunc(frb); break;
++ }
++ if (rounded < 0.0) {
++ result = 0;
++ invalid = true;
++ } else {
++ result = (uint64_t)rounded;
++ }
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, I64(result));
++ break;
++ }
++ case 943: {
++ // fctiduz: convert double to uint64 (round toward zero).
++ // Same VXCVI rule as fctidu but rounding is fixed to truncate
++ // toward zero. Source in (-1, 0) truncates to 0 — VALID.
++ double frb = getFpuRegisterDouble(rb);
++ uint64_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = 0;
++ invalid = true;
++ } else if (frb >= -2.0 * (double)INT64_MIN /* 2^64 */) {
++ result = UINT64_MAX;
++ invalid = true;
++ } else if (frb <= -1.0) {
++ // Truncated value is negative — invalid for unsigned.
++ result = 0;
++ invalid = true;
++ } else {
++ // Source is in (-1, 2^64); truncation toward zero yields a value
++ // in [0, 2^64).
++ result = (uint64_t)trunc(frb);
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, I64(result));
++ break;
++ }
++ case 14: {
++ // fctiw: convert double to int32 (current rounding).
++ // Invalid range: rounded value < INT32_MIN or > INT32_MAX. The
++ // double-precision boundary on the negative side is INT32_MIN-1 =
++ // -2^31-1 = -2147483649.0 (exactly representable; doubles in
++ // (-2^31-1, -2^31) all round-to-nearest to -2^31 which is valid).
++ double frb = getFpuRegisterDouble(rb);
++ int32_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = INT32_MIN;
++ invalid = true;
++ } else {
++ double rounded;
++ switch (FPSCR_ & kFPSCRRNMask) {
++ case RN: rounded = nearbyint(frb); break;
++ case RZ: rounded = trunc(frb); break;
++ case RP: rounded = ceil(frb); break;
++ case RM: rounded = floor(frb); break;
++ default: rounded = trunc(frb); break;
++ }
++ if (rounded > (double)INT32_MAX) {
++ result = INT32_MAX;
++ invalid = true;
++ } else if (rounded < (double)INT32_MIN) {
++ result = INT32_MIN;
++ invalid = true;
++ } else {
++ result = (int32_t)rounded;
++ }
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, (int64_t)result);
++ break;
++ }
++ case 15: {
++ // fctiwz: convert double to int32 (round toward zero).
++ // Truncation of a value in (-2^31-1, INT32_MIN) toward zero gives
++ // INT32_MIN — valid. Only `frb <= -2^31-1` (i.e. `frb < INT32_MIN-1+1`
++ // = `frb < -2147483648` ... wait, simplest: check truncated value in
++ // range AFTER truncation.)
++ double frb = getFpuRegisterDouble(rb);
++ int32_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = INT32_MIN;
++ invalid = true;
++ } else {
++ double truncated = trunc(frb);
++ if (truncated > (double)INT32_MAX) {
++ result = INT32_MAX;
++ invalid = true;
++ } else if (truncated < (double)INT32_MIN) {
++ result = INT32_MIN;
++ invalid = true;
++ } else {
++ result = (int32_t)truncated;
++ }
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, (int64_t)result);
++ break;
++ }
++ case 142: {
++ // fctiwu: convert double to uint32 (current rounding). The check is
++ // on the ROUNDED value: VXCVI iff rounded < 0 or rounded > UINT32_MAX.
++ double frb = getFpuRegisterDouble(rb);
++ uint32_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = 0;
++ invalid = true;
++ } else {
++ double rounded;
++ switch (FPSCR_ & kFPSCRRNMask) {
++ case RN: rounded = nearbyint(frb); break;
++ case RZ: rounded = trunc(frb); break;
++ case RP: rounded = ceil(frb); break;
++ case RM: rounded = floor(frb); break;
++ default: rounded = trunc(frb); break;
++ }
++ if (rounded < 0.0) {
++ result = 0;
++ invalid = true;
++ } else if (rounded > (double)UINT32_MAX) {
++ result = UINT32_MAX;
++ invalid = true;
++ } else {
++ result = (uint32_t)rounded;
++ }
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, (int64_t)(uint64_t)result);
++ break;
++ }
++ case 143: {
++ // fctiwuz: convert double to uint32 (round toward zero).
++ // Source in (-1, 0) truncates to 0 — VALID.
++ double frb = getFpuRegisterDouble(rb);
++ uint32_t result;
++ bool invalid = false;
++ if (std::isnan(frb)) {
++ result = 0;
++ invalid = true;
++ } else {
++ double truncated = trunc(frb);
++ if (truncated > (double)UINT32_MAX) {
++ result = UINT32_MAX;
++ invalid = true;
++ } else if (truncated < 0.0) {
++ result = 0;
++ invalid = true;
++ } else {
++ result = (uint32_t)truncated;
++ }
++ }
++ if (invalid) FPSCR_ |= (1ULL << 8); /* VXCVI: PPC bit 23 in low-half layout */
++ setFpuRegister(rt, (int64_t)(uint64_t)result);
++ break;
++ }
++ case 846: {
++ // fcfid: convert int64 in FPR to double
++ int64_t val = getFpuRegister(rb);
++ setFpuRegisterDouble(rt, (double)val);
++ break;
++ }
++ case 974: {
++ // fcfidu: convert uint64 in FPR to double
++ uint64_t val = U64(getFpuRegister(rb));
++ setFpuRegisterDouble(rt, (double)val);
++ break;
++ }
++ case 12: {
++ // frsp: round double to single precision (then re-extend in FPR).
++ // sNaN inputs are quieted (the result payload MSB is set).
++ // wasm f32.demote_f64 lowers to this op when
++ // not using xscvdpsp directly.
++ double frb = getFpuRegisterDouble(rb);
++ float result = demoteDoublePreservingNaN(frb);
++ uint32_t fbits;
++ memcpy(&fbits, &result, sizeof(fbits));
++ if ((fbits & 0x7F800000u) == 0x7F800000u &&
++ (fbits & 0x007FFFFFu) != 0) {
++ fbits |= 0x00400000u;
++ memcpy(&result, &fbits, sizeof(result));
++ }
++ setFpuRegisterDouble(rt, promoteFloatPreservingNaN(result));
++ break;
++ }
++ case 392: {
++ // frin: round to nearest integer (ties away from zero)
++ double frb = getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, round(frb));
++ break;
++ }
++ case 424: {
++ // friz: round toward zero
++ double frb = getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, trunc(frb));
++ break;
++ }
++ case 456: {
++ // frip: round toward +infinity (ceil). XO=456.
++ double frb = getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, ceil(frb));
++ break;
++ }
++ case 488: {
++ // frim: round toward -infinity (floor). XO=488.
++ double frb = getFpuRegisterDouble(rb);
++ setFpuRegisterDouble(rt, floor(frb));
++ break;
++ }
++ case 583: {
++ // mffs: FRT = FPSCR (as double bit pattern)
++ setFpuRegister(rt, I64(FPSCR_));
++ break;
++ }
++ // FPSCR is treated as a 32-bit register stored in the low 32 bits of
++ // FPSCR_ (uint64_t), with PPC bit numbering: PPC bit N (where bit 0 is
++ // the MSB) lives at int64 bit (31-N). Field F (4 bits) covers PPC bits
++ // 4F..4F+3 → int64 bit-LSB (28-4F) to bit-MSB (31-4F). This matches
++ // mcrfs, mtfsfi, kFPSCRRNMask (which checks bits 30-31 PPC = int64 bits
++ // 0-1), and mffs (which copies FPSCR into FPR bits 32..63 PPC = int64
++ // bits 0..31). Earlier mtfsb0/mtfsb1 used (63-bt) which placed bits in
++ // the high half of FPSCR_ where mcrfs etc. would never see them — so
++ // the wasm trap sequence `mtfsb0 23; fctidz; mcrfs cr0,5; bt SO,oolEntry`
++ // could not detect VXCVI.
++ case 70: {
++ // mtfsb0: clear FPSCR bit. XO=70.
++ // (Cases 38 and 70 had the labels swapped, so wasm's
++ // `mtfsb0 23; fctidz; mcrfs cr0,5; bt SO,trap` sequence accidentally
++ // SET VXCVI before the convert ran, causing every fctid* to trap.)
++ uint32_t bt = instr->rtValue();
++ FPSCR_ &= ~(1ULL << (31 - bt));
++ break;
++ }
++ case 64: {
++ // mcrfs: copy FPSCR field to CR field
++ uint32_t bf = instr->bfValue();
++ uint32_t bfa = instr->bits(20, 18);
++ uint32_t shift = 4 * (7 - bfa);
++ uint8_t val = (FPSCR_ >> shift) & 0xF;
++ setCRField(bf, val);
++ break;
++ }
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodeFP: opcode 63, xo_x=%u (instruction 0x%08x)", xo_x,
++ instr->instructionBits());
++ }
++ } else if (opcode == 59) {
++ // A-form single-precision instructions.
++ uint32_t xo_a = instr->bits(5, 1);
++
++ switch (xo_a) {
++ case 21: {
++ // fadds
++ double result = (double)((float)(getFpuRegisterDouble(ra) +
++ getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 20: {
++ // fsubs
++ double result = (double)((float)(getFpuRegisterDouble(ra) -
++ getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 25: {
++ // fmuls: FRT = (float)(FRA * FRC)
++ double result = (double)((float)(getFpuRegisterDouble(ra) *
++ getFpuRegisterDouble(rc_reg)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 18: {
++ // fdivs
++ double result = (double)((float)(getFpuRegisterDouble(ra) /
++ getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 22: {
++ // fsqrts
++ double result = (double)sqrtf((float)getFpuRegisterDouble(rb));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 29: {
++ // fmadds
++ double result = (double)((float)std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 30: {
++ // fnmsubs
++ double result = (double)(-(float)std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ -getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 28: {
++ // fmsubs
++ double result = (double)((float)std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ -getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ case 31: {
++ // fnmadds
++ double result = (double)(-(float)std::fma(getFpuRegisterDouble(ra),
++ getFpuRegisterDouble(rc_reg),
++ getFpuRegisterDouble(rb)));
++ setFpuRegisterDouble(rt, result);
++ break;
++ }
++ default: {
++ // Try X-form sub-opcodes for opcode 59 (e.g., fcfids, fcfidus).
++ uint32_t xo_x = instr->bits(10, 1);
++ switch (xo_x) {
++ case 846: {
++ // fcfids: convert int64 to float single (result stored as double)
++ int64_t val = getFpuRegister(rb);
++ setFpuRegisterDouble(rt, (double)(float)val);
++ break;
++ }
++ case 974: {
++ // fcfidus: convert uint64 to float single
++ uint64_t val = U64(getFpuRegister(rb));
++ setFpuRegisterDouble(rt, (double)(float)val);
++ break;
++ }
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodeFP: opcode 59, xo_a=%u xo_x=%u", xo_a, xo_x);
++ }
++ break;
++ }
++ }
++ } else {
++ MOZ_CRASH_UNSAFE_PRINTF("decodeFP: opcode=%u", opcode);
++ }
++}
++
++// -----------------------------------------------------------------------------
++// decodeVMX: Major opcode 4 (AltiVec/VMX vector ops on VR0-VR31).
++//
++// VR-form (VX-form): bits 0-5 = primary opcode (4), bits 6-10 = VRT,
++// bits 11-15 = VRA, bits 16-20 = VRB, bits 21-31 = XO (11 bits).
++// XO extracted via `instructionBits() & 0x7FF`.
++//
++// Helpers below pack/unpack each VR via the VRregisters_ byte storage
++// (16 bytes, big-endian PPC numbering: bytes[0] is the most-significant
++// byte of the architectural register, but on PPC64 LE wasm the lane
++// ordering is what the JIT expects). All ops here use byte-level
++// accessors for consistency with the existing VMX memory ops.
++
++void Simulator::decodeVMX(SimInstruction* instr) {
++ uint32_t xo = instr->instructionBits() & 0x7FFu;
++ uint32_t vrt = instr->rtValue(); // bits 6..10
++ uint32_t vra = instr->raValue(); // bits 11..15
++ uint32_t vrb = instr->rbValue(); // bits 16..20
++ uint32_t uimm = instr->raValue(); // VA-form: 5-bit immediate at bits 11..15
++
++ uint8_t a[16], b[16], r[16];
++ getVRBytes(vra, a);
++ getVRBytes(vrb, b);
++
++ // Helpers for treating the byte storage as typed lane arrays.
++ // The PPC64LE wasm SIMD lowering stores each lane's bytes in
++ // little-endian order, so lane i of an N-byte element occupies bytes
++ // (i*N) .. (i*N + N - 1) with the LSB at byte (i*N). For example,
++ // a v128.const i32x4 0x12345678 has bytes [78 56 34 12 …].
++ #define LANE_U8(buf, i) ((uint8_t)(buf)[(i)])
++ #define LANE_S8(buf, i) ((int8_t)(buf)[(i)])
++ #define LANE_U16(buf, i) \
++ ((uint16_t)((uint16_t)(buf)[(i) * 2] | \
++ ((uint16_t)(buf)[(i) * 2 + 1] << 8)))
++ #define LANE_S16(buf, i) ((int16_t)LANE_U16(buf, i))
++ #define LANE_U32(buf, i) \
++ ((uint32_t)((uint32_t)(buf)[(i) * 4] | \
++ ((uint32_t)(buf)[(i) * 4 + 1] << 8) | \
++ ((uint32_t)(buf)[(i) * 4 + 2] << 16) | \
++ ((uint32_t)(buf)[(i) * 4 + 3] << 24)))
++ #define LANE_S32(buf, i) ((int32_t)LANE_U32(buf, i))
++ #define LANE_U64(buf, i) \
++ ((uint64_t)((uint64_t)(buf)[(i) * 8] | \
++ ((uint64_t)(buf)[(i) * 8 + 1] << 8) | \
++ ((uint64_t)(buf)[(i) * 8 + 2] << 16) | \
++ ((uint64_t)(buf)[(i) * 8 + 3] << 24) | \
++ ((uint64_t)(buf)[(i) * 8 + 4] << 32) | \
++ ((uint64_t)(buf)[(i) * 8 + 5] << 40) | \
++ ((uint64_t)(buf)[(i) * 8 + 6] << 48) | \
++ ((uint64_t)(buf)[(i) * 8 + 7] << 56)))
++ #define LANE_S64(buf, i) ((int64_t)LANE_U64(buf, i))
++ #define SET_LANE_U8(buf, i, v) do { (buf)[(i)] = (uint8_t)(v); } while (0)
++ #define SET_LANE_U16(buf, i, v) do { \
++ (buf)[(i) * 2] = (uint8_t)((uint16_t)(v) & 0xFF); \
++ (buf)[(i) * 2 + 1] = (uint8_t)(((uint16_t)(v) >> 8) & 0xFF); \
++ } while (0)
++ #define SET_LANE_U32(buf, i, v) do { \
++ (buf)[(i) * 4] = (uint8_t)((uint32_t)(v) & 0xFF); \
++ (buf)[(i) * 4 + 1] = (uint8_t)(((uint32_t)(v) >> 8) & 0xFF); \
++ (buf)[(i) * 4 + 2] = (uint8_t)(((uint32_t)(v) >> 16) & 0xFF); \
++ (buf)[(i) * 4 + 3] = (uint8_t)(((uint32_t)(v) >> 24) & 0xFF); \
++ } while (0)
++ #define SET_LANE_U64(buf, i, v) do { \
++ (buf)[(i) * 8] = (uint8_t)((uint64_t)(v) & 0xFF); \
++ (buf)[(i) * 8 + 1] = (uint8_t)(((uint64_t)(v) >> 8) & 0xFF); \
++ (buf)[(i) * 8 + 2] = (uint8_t)(((uint64_t)(v) >> 16) & 0xFF); \
++ (buf)[(i) * 8 + 3] = (uint8_t)(((uint64_t)(v) >> 24) & 0xFF); \
++ (buf)[(i) * 8 + 4] = (uint8_t)(((uint64_t)(v) >> 32) & 0xFF); \
++ (buf)[(i) * 8 + 5] = (uint8_t)(((uint64_t)(v) >> 40) & 0xFF); \
++ (buf)[(i) * 8 + 6] = (uint8_t)(((uint64_t)(v) >> 48) & 0xFF); \
++ (buf)[(i) * 8 + 7] = (uint8_t)(((uint64_t)(v) >> 56) & 0xFF); \
++ } while (0)
++
++ // --- VA-form pre-dispatch ---
++ //
++ // VA-form has a 6-bit XO at bits 26-31 and a 5-bit VRC at bits 21-25.
++ // decodeVMX's 11-bit XO mask conflates VRC with
++ // XO, so a plain `switch (xo)` over 11-bit values only matches when
++ // VRC == 0. Peel off the three VA-form ops actually used by the JIT
++ // (vmladduhm, vsel, vperm) before the main switch so any VRC value
++ // works. vsldoi (XO=44) is VX-form with SH at bits 22-25, not VA —
++ // handled in the switch below.
++ {
++ uint32_t va_xo = xo & 0x3Fu;
++ if (va_xo == 32 || va_xo == 33 || va_xo == 34 || va_xo == 38 ||
++ va_xo == 40 || va_xo == 42 || va_xo == 43) {
++ uint32_t vrc = (instr->instructionBits() >> 6) & 0x1F;
++ uint8_t cv[16];
++ getVRBytes(vrc, cv);
++ if (va_xo == 32) {
++ // vmhaddshs VT,VA,VB,VC : VT[i] = sat_s16(
++ // (s32)VA.h[i] * (s32)VB.h[i] >> 15 + (s32)VC.h[i])
++ // (no rounding term — use vmhraddshs for the rounded form).
++ for (int i = 0; i < 8; i++) {
++ int32_t prod = (int32_t)LANE_S16(a, i) * (int32_t)LANE_S16(b, i);
++ int32_t sum = (prod >> 15) + (int32_t)LANE_S16(cv, i);
++ if (sum > INT16_MAX) sum = INT16_MAX;
++ if (sum < INT16_MIN) sum = INT16_MIN;
++ SET_LANE_U16(r, i, (uint16_t)(int16_t)sum);
++ }
++ } else if (va_xo == 33) {
++ // vmhraddshs VT,VA,VB,VC : rounded Q15 multiply-add-saturate.
++ // VT[i] = sat_s16(((s32)VA.h[i] * (s32)VB.h[i] + 0x4000)
++ // >> 15 + (s32)VC.h[i])
++ // Used by wasm i16x8.q15mulr_sat_s (VC is zero).
++ for (int i = 0; i < 8; i++) {
++ int32_t prod = (int32_t)LANE_S16(a, i) * (int32_t)LANE_S16(b, i);
++ int32_t sum = ((prod + 0x4000) >> 15) + (int32_t)LANE_S16(cv, i);
++ if (sum > INT16_MAX) sum = INT16_MAX;
++ if (sum < INT16_MIN) sum = INT16_MIN;
++ SET_LANE_U16(r, i, (uint16_t)(int16_t)sum);
++ }
++ } else if (va_xo == 34) {
++ // vmladduhm VT,VA,VB,VC : VT = low16(VA*VB + VC)
++ for (int i = 0; i < 8; i++) {
++ uint16_t prod = LANE_U16(a, i) * LANE_U16(b, i);
++ SET_LANE_U16(r, i, prod + LANE_U16(cv, i));
++ }
++ } else if (va_xo == 40) {
++ // vmsumshm VT,VA,VB,VC : pairwise multiply-sum of signed halfwords
++ // into i32 lanes, modulo i32 wrap.
++ // VT.i32[k] = VC.i32[k] + VA.i16[2k]*VB.i16[2k]
++ // + VA.i16[2k+1]*VB.i16[2k+1]
++ // Used by wasm i32x4.dot_i16x8_s with VC = 0, and by
++ // i32x4.extadd_pairwise_i16x8_s with VB = splat(1) and VC = 0.
++ for (int k = 0; k < 4; k++) {
++ int32_t a0 = (int32_t)LANE_S16(a, 2 * k);
++ int32_t a1 = (int32_t)LANE_S16(a, 2 * k + 1);
++ int32_t b0 = (int32_t)LANE_S16(b, 2 * k);
++ int32_t b1 = (int32_t)LANE_S16(b, 2 * k + 1);
++ int32_t c = LANE_S32(cv, k);
++ int32_t result = (int32_t)((uint32_t)c + (uint32_t)(a0 * b0) +
++ (uint32_t)(a1 * b1));
++ SET_LANE_U32(r, k, (uint32_t)result);
++ }
++ } else if (va_xo == 38) {
++ // vmsumuhm VT,VA,VB,VC : same as vmsumshm but unsigned halfwords.
++ // VT.u32[k] = VC.u32[k] + VA.u16[2k]*VB.u16[2k]
++ // + VA.u16[2k+1]*VB.u16[2k+1]
++ // Used by wasm i32x4.extadd_pairwise_i16x8_u with VB = splat(1)
++ // and VC = 0.
++ for (int k = 0; k < 4; k++) {
++ uint32_t a0 = (uint32_t)LANE_U16(a, 2 * k);
++ uint32_t a1 = (uint32_t)LANE_U16(a, 2 * k + 1);
++ uint32_t b0 = (uint32_t)LANE_U16(b, 2 * k);
++ uint32_t b1 = (uint32_t)LANE_U16(b, 2 * k + 1);
++ uint32_t c = LANE_U32(cv, k);
++ uint32_t result = c + a0 * b0 + a1 * b1;
++ SET_LANE_U32(r, k, result);
++ }
++ } else if (va_xo == 42) {
++ // vsel VT,VA,VB,VC : VT[i] = (VC[i] & VB[i]) | (~VC[i] & VA[i])
++ for (int i = 0; i < 16; i++) {
++ r[i] = (uint8_t)((cv[i] & b[i]) | (~cv[i] & a[i]));
++ }
++ } else {
++ // vperm VT,VA,VB,VC; empirical LE:
++ // r[LE_i] = (VC[LE_i] < 16) ? VA[LE_(15-VC[i])]
++ // : VB[LE_(31-VC[i])]
++ for (int i = 0; i < 16; i++) {
++ uint8_t idx = cv[i] & 0x1F;
++ r[i] = (idx < 16) ? a[15 - idx] : b[31 - idx];
++ }
++ }
++ setVRBytes(vrt, r);
++ goto vmx_done;
++ }
++ }
++
++ switch (xo) {
++ // === Integer add (modulo) ===
++ case 0: // vaddubm
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) + LANE_U8(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 64: // vadduhm
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) + LANE_U16(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 128: // vadduwm
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, LANE_U32(a, i) + LANE_U32(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 192: // vaddudm
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, LANE_U64(a, i) + LANE_U64(b, i));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Integer sub (modulo) ===
++ case 1024: // vsububm
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) - LANE_U8(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 1088: // vsubuhm
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) - LANE_U16(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 1152: // vsubuwm
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, LANE_U32(a, i) - LANE_U32(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 1216: // vsubudm
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, LANE_U64(a, i) - LANE_U64(b, i));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Integer add (saturating, signed) ===
++ case 768: // vaddsbs
++ for (int i = 0; i < 16; i++) {
++ int s = (int)LANE_S8(a, i) + (int)LANE_S8(b, i);
++ if (s > INT8_MAX) s = INT8_MAX;
++ if (s < INT8_MIN) s = INT8_MIN;
++ SET_LANE_U8(r, i, (uint8_t)s);
++ }
++ setVRBytes(vrt, r); break;
++ case 832: // vaddshs
++ for (int i = 0; i < 8; i++) {
++ int s = (int)LANE_S16(a, i) + (int)LANE_S16(b, i);
++ if (s > INT16_MAX) s = INT16_MAX;
++ if (s < INT16_MIN) s = INT16_MIN;
++ SET_LANE_U16(r, i, (uint16_t)s);
++ }
++ setVRBytes(vrt, r); break;
++ case 896: // vaddsws
++ for (int i = 0; i < 4; i++) {
++ int64_t s = (int64_t)LANE_S32(a, i) + (int64_t)LANE_S32(b, i);
++ if (s > INT32_MAX) s = INT32_MAX;
++ if (s < INT32_MIN) s = INT32_MIN;
++ SET_LANE_U32(r, i, (uint32_t)s);
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Integer add (saturating, unsigned) ===
++ case 512: // vaddubs
++ for (int i = 0; i < 16; i++) {
++ unsigned s = (unsigned)LANE_U8(a, i) + (unsigned)LANE_U8(b, i);
++ if (s > UINT8_MAX) s = UINT8_MAX;
++ SET_LANE_U8(r, i, (uint8_t)s);
++ }
++ setVRBytes(vrt, r); break;
++ case 576: // vadduhs
++ for (int i = 0; i < 8; i++) {
++ unsigned s = (unsigned)LANE_U16(a, i) + (unsigned)LANE_U16(b, i);
++ if (s > UINT16_MAX) s = UINT16_MAX;
++ SET_LANE_U16(r, i, (uint16_t)s);
++ }
++ setVRBytes(vrt, r); break;
++ case 640: // vadduws
++ for (int i = 0; i < 4; i++) {
++ uint64_t s = (uint64_t)LANE_U32(a, i) + (uint64_t)LANE_U32(b, i);
++ if (s > UINT32_MAX) s = UINT32_MAX;
++ SET_LANE_U32(r, i, (uint32_t)s);
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Integer sub (saturating, signed) ===
++ case 1792: // vsubsbs
++ for (int i = 0; i < 16; i++) {
++ int s = (int)LANE_S8(a, i) - (int)LANE_S8(b, i);
++ if (s > INT8_MAX) s = INT8_MAX;
++ if (s < INT8_MIN) s = INT8_MIN;
++ SET_LANE_U8(r, i, (uint8_t)s);
++ }
++ setVRBytes(vrt, r); break;
++ case 1856: // vsubshs
++ for (int i = 0; i < 8; i++) {
++ int s = (int)LANE_S16(a, i) - (int)LANE_S16(b, i);
++ if (s > INT16_MAX) s = INT16_MAX;
++ if (s < INT16_MIN) s = INT16_MIN;
++ SET_LANE_U16(r, i, (uint16_t)s);
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Integer sub (saturating, unsigned) ===
++ case 1536: // vsububs
++ for (int i = 0; i < 16; i++) {
++ int s = (int)LANE_U8(a, i) - (int)LANE_U8(b, i);
++ if (s < 0) s = 0;
++ SET_LANE_U8(r, i, (uint8_t)s);
++ }
++ setVRBytes(vrt, r); break;
++ case 1600: // vsubuhs
++ for (int i = 0; i < 8; i++) {
++ int s = (int)LANE_U16(a, i) - (int)LANE_U16(b, i);
++ if (s < 0) s = 0;
++ SET_LANE_U16(r, i, (uint16_t)s);
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Average unsigned (rounded: (a+b+1)>>1) ===
++ case 1026: // vavgub
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i,
++ ((unsigned)LANE_U8(a, i) + LANE_U8(b, i) + 1) >> 1);
++ }
++ setVRBytes(vrt, r); break;
++ case 1090: // vavguh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i,
++ ((unsigned)LANE_U16(a, i) + LANE_U16(b, i) + 1) >> 1);
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Vector multiply per-lane (i32x4.mul) ===
++ case 137: { // vmuluwm: per-lane i32 multiply (low 32 bits)
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, LANE_U32(a, i) * LANE_U32(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === POWER10 vmulld: per-lane i64 multiply (low 64 bits) ===
++ case 457: {
++ for (int i = 0; i < 2; i++) {
++ uint64_t av = 0, bv = 0;
++ for (int j = 0; j < 8; j++) {
++ av |= ((uint64_t)a[i * 8 + j]) << (j * 8);
++ bv |= ((uint64_t)b[i * 8 + j]) << (j * 8);
++ }
++ uint64_t prod = av * bv; // low 64 bits, modulo wrap
++ for (int j = 0; j < 8; j++) {
++ r[i * 8 + j] = (uint8_t)(prod >> (j * 8));
++ }
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === vmule/vmulo* (multiply even/odd lanes, widening) ===
++ //
++ // All XO values below were verified by disassembling the
++ // PPC_vmule*/PPC_vmulo* constants from Assembler-ppc64.h with
++ // `as -mppc64 -mlittle` + `objdump -Mpower9 -d`. The previous
++ // version had all 12 XO labels swapped with each other's semantic
++ // pair (so the JIT's vmulesb was decoded as vmulosb and vice
++ // versa), causing i8x16→i16x8 extmul to produce wrong halfwords.
++ //
++ // PPC_vmuloub = 0x10000008 → XO=8 vmuloub (LE even-byte pairs)
++ // PPC_vmulouh = 0x10000048 → XO=72 vmulouh
++ // PPC_vmulouw = 0x10000088 → XO=136 vmulouw
++ // PPC_vmulosb = 0x10000108 → XO=264 vmulosb
++ // PPC_vmulosh = 0x10000148 → XO=328 vmulosh
++ // PPC_vmulosw = 0x10000188 → XO=392 vmulosw
++ // PPC_vmuleub = 0x10000208 → XO=520 vmuleub (LE odd-byte pairs)
++ // PPC_vmuleuh = 0x10000248 → XO=584 vmuleuh
++ // PPC_vmuleuw = 0x10000288 → XO=648 vmuleuw
++ // PPC_vmulesb = 0x10000308 → XO=776 vmulesb
++ // PPC_vmulesh = 0x10000348 → XO=840 vmulesh
++ // PPC_vmulesw = 0x10000388 → XO=904 vmulesw
++ //
++ // Lane indexing on LE storage: "BE-even byte i" is stored at LE
++ // byte index (15 - 2i); since our LANE_S8 uses LE byte index, the
++ // "BE-even" = "LE-odd" mapping gives `2*i + 1` for vmule, `2*i`
++ // for vmulo. The JIT's extmul helpers emit `vmulesb + vmulosb +
++ // vmrglh` to pack both halves; getting the semantics swapped here
++ // produces the right result register but with the halves in the
++ // wrong merge order, breaking extmul.
++ case 776: { // vmulesb: signed BE-even byte → halfword (8 results)
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i,
++ (int16_t)LANE_S8(a, 2 * i + 1) *
++ (int16_t)LANE_S8(b, 2 * i + 1));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 520: { // vmuleub: unsigned BE-even byte → halfword
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i,
++ (uint16_t)LANE_U8(a, 2 * i + 1) *
++ (uint16_t)LANE_U8(b, 2 * i + 1));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 840: { // vmulesh: signed BE-even halfword → word
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ (int32_t)LANE_S16(a, 2 * i + 1) *
++ (int32_t)LANE_S16(b, 2 * i + 1));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 584: { // vmuleuh: unsigned BE-even halfword → word
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ (uint32_t)LANE_U16(a, 2 * i + 1) *
++ (uint32_t)LANE_U16(b, 2 * i + 1));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 904: { // vmulesw: signed BE-even word → dword (POWER8)
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ (int64_t)LANE_S32(a, 2 * i + 1) *
++ (int64_t)LANE_S32(b, 2 * i + 1));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 648: { // vmuleuw: unsigned BE-even word → dword (POWER8)
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ (uint64_t)LANE_U32(a, 2 * i + 1) *
++ (uint64_t)LANE_U32(b, 2 * i + 1));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 264: { // vmulosb: signed BE-odd byte → halfword
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i,
++ (int16_t)LANE_S8(a, 2 * i) *
++ (int16_t)LANE_S8(b, 2 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 8: { // vmuloub: unsigned BE-odd byte
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i,
++ (uint16_t)LANE_U8(a, 2 * i) *
++ (uint16_t)LANE_U8(b, 2 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 328: { // vmulosh: signed BE-odd halfword → word
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ (int32_t)LANE_S16(a, 2 * i) *
++ (int32_t)LANE_S16(b, 2 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 72: { // vmulouh: unsigned BE-odd halfword → word
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ (uint32_t)LANE_U16(a, 2 * i) *
++ (uint32_t)LANE_U16(b, 2 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 392: { // vmulosw: signed BE-odd word
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ (int64_t)LANE_S32(a, 2 * i) *
++ (int64_t)LANE_S32(b, 2 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 136: { // vmulouw: unsigned BE-odd word
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ (uint64_t)LANE_U32(a, 2 * i) *
++ (uint64_t)LANE_U32(b, 2 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === Per-lane rotate left (vrl{b,h,w,d}) ===
++ case 4: // vrlb
++ for (int i = 0; i < 16; i++) {
++ uint8_t v = LANE_U8(a, i);
++ uint32_t s = LANE_U8(b, i) & 7;
++ SET_LANE_U8(r, i, (uint8_t)((v << s) | (v >> ((8 - s) & 7))));
++ }
++ setVRBytes(vrt, r); break;
++ case 68: // vrlh
++ for (int i = 0; i < 8; i++) {
++ uint16_t v = LANE_U16(a, i);
++ uint32_t s = LANE_U16(b, i) & 15;
++ SET_LANE_U16(r, i, (uint16_t)((v << s) | (v >> ((16 - s) & 15))));
++ }
++ setVRBytes(vrt, r); break;
++ case 132: // vrlw
++ for (int i = 0; i < 4; i++) {
++ uint32_t v = LANE_U32(a, i);
++ uint32_t s = LANE_U32(b, i) & 31;
++ SET_LANE_U32(r, i, (v << s) | (v >> ((32 - s) & 31)));
++ }
++ setVRBytes(vrt, r); break;
++ case 196: // vrld
++ for (int i = 0; i < 2; i++) {
++ uint64_t v = LANE_U64(a, i);
++ uint32_t s = LANE_U64(b, i) & 63;
++ SET_LANE_U64(r, i, (v << s) | (v >> ((64 - s) & 63)));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Min / Max signed ===
++ case 258: // vmaxsb
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, std::max(LANE_S8(a, i), LANE_S8(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 322: // vmaxsh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, std::max(LANE_S16(a, i), LANE_S16(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 386: // vmaxsw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, std::max(LANE_S32(a, i), LANE_S32(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 450: // vmaxsd
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, std::max(LANE_S64(a, i), LANE_S64(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 770: // vminsb
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, std::min(LANE_S8(a, i), LANE_S8(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 834: // vminsh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, std::min(LANE_S16(a, i), LANE_S16(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 898: // vminsw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, std::min(LANE_S32(a, i), LANE_S32(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 962: // vminsd
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, std::min(LANE_S64(a, i), LANE_S64(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Min / Max unsigned ===
++ case 2: // vmaxub
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, std::max(LANE_U8(a, i), LANE_U8(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 66: // vmaxuh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, std::max(LANE_U16(a, i), LANE_U16(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 130: // vmaxuw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, std::max(LANE_U32(a, i), LANE_U32(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 194: // vmaxud
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, std::max(LANE_U64(a, i), LANE_U64(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 514: // vminub
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, std::min(LANE_U8(a, i), LANE_U8(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 578: // vminuh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, std::min(LANE_U16(a, i), LANE_U16(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 642: // vminuw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, std::min(LANE_U32(a, i), LANE_U32(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ case 706: // vminud
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, std::min(LANE_U64(a, i), LANE_U64(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Vector compare (eq, gt signed, gt unsigned, ne POWER9) ===
++ //
++ // All vcmp* ops set per-lane all-1s on true, all-0s on false. The
++ // record form (Rc=1, XO MSB bit set; XO_rec = XO_base + 1024) must
++ // additionally write CR6:
++ // CR6.LT = 1 iff ALL lanes are true;
++ // CR6.GT = 0 (always);
++ // CR6.EQ = 1 iff NO lane is true;
++ // CR6.SO = 0 (always).
++ // `i8x16.all_true` etc. in wasm rely on CR6.EQ via `mfocrf cr6`; the
++ // previous simulator implementation left CR6 untouched, so the
++ // predicate was always wrong.
++ //
++ // Helper: count true lanes by looking at byte 0 of each lane (all
++ // bytes within a "true" lane are 0xFF so byte 0 is a sound proxy).
++ #define VCMP_DONE(lanes_, lane_bytes_) \
++ do { \
++ setVRBytes(vrt, r); \
++ if (xo >= 1024) { \
++ int numTrue_ = 0; \
++ for (int i_ = 0; i_ < (lanes_); i_++) { \
++ if (r[i_ * (lane_bytes_)] == 0xFF) numTrue_++; \
++ } \
++ uint8_t field_ = 0; \
++ if (numTrue_ == (lanes_)) field_ |= kCRFieldLT; \
++ if (numTrue_ == 0) field_ |= kCRFieldEQ; \
++ setCRField(6, field_); \
++ } \
++ } while (0)
++
++ case 6: // vcmpequb (Rc=0)
++ case 1030: // vcmpequb. (record, CR6 updated)
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) == LANE_U8(b, i) ? 0xFF : 0);
++ }
++ VCMP_DONE(16, 1); break;
++ case 70: // vcmpequh
++ case 1094: // vcmpequh.
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) == LANE_U16(b, i) ? 0xFFFF : 0);
++ }
++ VCMP_DONE(8, 2); break;
++ case 134: // vcmpequw
++ case 1158: // vcmpequw.
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ LANE_U32(a, i) == LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
++ }
++ VCMP_DONE(4, 4); break;
++ case 199: // vcmpequd
++ case 1223: // vcmpequd.
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ LANE_U64(a, i) == LANE_U64(b, i)
++ ? UINT64_MAX
++ : 0);
++ }
++ VCMP_DONE(2, 8); break;
++
++ // === Compare greater-than signed ===
++ case 774: // vcmpgtsb
++ case 1798: // vcmpgtsb.
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_S8(a, i) > LANE_S8(b, i) ? 0xFF : 0);
++ }
++ VCMP_DONE(16, 1); break;
++ case 838: // vcmpgtsh
++ case 1862: // vcmpgtsh.
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_S16(a, i) > LANE_S16(b, i) ? 0xFFFF : 0);
++ }
++ VCMP_DONE(8, 2); break;
++ case 902: // vcmpgtsw
++ case 1926: // vcmpgtsw.
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ LANE_S32(a, i) > LANE_S32(b, i) ? 0xFFFFFFFFu : 0);
++ }
++ VCMP_DONE(4, 4); break;
++ case 967: // vcmpgtsd
++ case 1991: // vcmpgtsd.
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ LANE_S64(a, i) > LANE_S64(b, i) ? UINT64_MAX : 0);
++ }
++ VCMP_DONE(2, 8); break;
++
++ // === Compare greater-than unsigned ===
++ case 518: // vcmpgtub
++ case 1542: // vcmpgtub.
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) > LANE_U8(b, i) ? 0xFF : 0);
++ }
++ VCMP_DONE(16, 1); break;
++ case 582: // vcmpgtuh
++ case 1606: // vcmpgtuh.
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) > LANE_U16(b, i) ? 0xFFFF : 0);
++ }
++ VCMP_DONE(8, 2); break;
++ case 646: // vcmpgtuw
++ case 1670: // vcmpgtuw.
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ LANE_U32(a, i) > LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
++ }
++ VCMP_DONE(4, 4); break;
++ case 711: // vcmpgtud
++ case 1735: // vcmpgtud.
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ LANE_U64(a, i) > LANE_U64(b, i) ? UINT64_MAX : 0);
++ }
++ VCMP_DONE(2, 8); break;
++
++ // === Splat from immediate (5-bit signed splat into all lanes) ===
++ // ISA defines UIM in BE element numbering. For LE storage, BE element i = LE element (N-1-i).
++ case 524: // vspltb: VRT[*] = VRB[BE-byte-UIM]; uimm from VRA field (bits 11..15)
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(b, 15 - (uimm & 0xF)));
++ }
++ setVRBytes(vrt, r); break;
++ case 588: // vsplth
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(b, 7 - (uimm & 0x7)));
++ }
++ setVRBytes(vrt, r); break;
++ case 652: // vspltw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, LANE_U32(b, 3 - (uimm & 0x3)));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Splat 5-bit signed immediate to all byte lanes ===
++ case 780: { // vspltisb VRT, SIMM5
++ int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
++ if (simm5 & 0x10) simm5 |= ~0x1F;
++ uint8_t b = (uint8_t)(int8_t)simm5;
++ memset(r, b, 16);
++ setVRBytes(vrt, r); break;
++ }
++
++ // === Splat 5-bit signed immediate to all halfword lanes ===
++ case 844: { // vspltish VRT, SIMM5
++ // SIMM5 occupies bits 11..15 of the instruction (VRA field). It
++ // is sign-extended to 16 bits and replicated across all 8 halfword
++ // lanes of VRT. Range: [-16, 15].
++ int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
++ if (simm5 & 0x10) simm5 |= ~0x1F; // sign-extend bit 4
++ int16_t hw = (int16_t)simm5;
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, (uint16_t)hw);
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === Splat 5-bit signed immediate to all word lanes ===
++ case 908: { // vspltisw VRT, SIMM5
++ int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
++ if (simm5 & 0x10) simm5 |= ~0x1F;
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, (uint32_t)simm5);
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === Merge (interleave) ===
++ //
++ // The ISA defines vmrgh* / vmrgl* in BE numbering; the
++ // empirical LE storage behaviour is:
++ // vmrgh* VT,VA,VB: for i in 0..N/2-1,
++ // VT.lane_LE[2i] = VB.lane_LE[(N/2) + i]
++ // VT.lane_LE[2i+1] = VA.lane_LE[(N/2) + i]
++ // vmrgl* VT,VA,VB: for i in 0..N/2-1,
++ // VT.lane_LE[2i] = VB.lane_LE[i]
++ // VT.lane_LE[2i+1] = VA.lane_LE[i]
++ // i.e. the VB operand goes to the even result positions (reversed
++ // from what a naïve BE reading would suggest) and the "high" form
++ // selects the upper-half of LE storage.
++ //
++ // Previous implementation had both the operand order swapped AND
++ // the high/low halves swapped (consistent with each other, so
++ // JIT-only-visible ops that round-tripped through vmrg* happened
++ // to produce the right answer, but wasm-visible extmul exposed
++ // the bug).
++ case 12: // vmrghb
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U8(r, 2 * i, LANE_U8(b, 8 + i));
++ SET_LANE_U8(r, 2 * i + 1, LANE_U8(a, 8 + i));
++ }
++ setVRBytes(vrt, r); break;
++ case 76: // vmrghh
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U16(r, 2 * i, LANE_U16(b, 4 + i));
++ SET_LANE_U16(r, 2 * i + 1, LANE_U16(a, 4 + i));
++ }
++ setVRBytes(vrt, r); break;
++ case 140: // vmrghw
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U32(r, 2 * i, LANE_U32(b, 2 + i));
++ SET_LANE_U32(r, 2 * i + 1, LANE_U32(a, 2 + i));
++ }
++ setVRBytes(vrt, r); break;
++ case 268: // vmrglb
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U8(r, 2 * i, LANE_U8(b, i));
++ SET_LANE_U8(r, 2 * i + 1, LANE_U8(a, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 332: // vmrglh
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U16(r, 2 * i, LANE_U16(b, i));
++ SET_LANE_U16(r, 2 * i + 1, LANE_U16(a, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 396: // vmrglw
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U32(r, 2 * i, LANE_U32(b, i));
++ SET_LANE_U32(r, 2 * i + 1, LANE_U32(a, i));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Per-lane shift left (count from VRB, low N bits per element) ===
++ case 260: // vslb
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) << (LANE_U8(b, i) & 7));
++ }
++ setVRBytes(vrt, r); break;
++ case 324: // vslh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) << (LANE_U16(b, i) & 15));
++ }
++ setVRBytes(vrt, r); break;
++ case 388: // vslw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, LANE_U32(a, i) << (LANE_U32(b, i) & 31));
++ }
++ setVRBytes(vrt, r); break;
++ case 1476: // vsld
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, LANE_U64(a, i) << (LANE_U64(b, i) & 63));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Per-lane shift right unsigned ===
++ case 516: // vsrb
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) >> (LANE_U8(b, i) & 7));
++ }
++ setVRBytes(vrt, r); break;
++ case 580: // vsrh
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) >> (LANE_U16(b, i) & 15));
++ }
++ setVRBytes(vrt, r); break;
++ case 644: // vsrw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, LANE_U32(a, i) >> (LANE_U32(b, i) & 31));
++ }
++ setVRBytes(vrt, r); break;
++ case 1732: // vsrd
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, LANE_U64(a, i) >> (LANE_U64(b, i) & 63));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Per-lane shift right algebraic (signed) ===
++ case 772: // vsrab
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i,
++ (uint8_t)(LANE_S8(a, i) >> (LANE_U8(b, i) & 7)));
++ }
++ setVRBytes(vrt, r); break;
++ case 836: // vsrah
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i,
++ (uint16_t)(LANE_S16(a, i) >> (LANE_U16(b, i) & 15)));
++ }
++ setVRBytes(vrt, r); break;
++ case 900: // vsraw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ (uint32_t)(LANE_S32(a, i) >> (LANE_U32(b, i) & 31)));
++ }
++ setVRBytes(vrt, r); break;
++ case 964: // vsrad
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i,
++ (uint64_t)(LANE_S64(a, i) >> (LANE_U64(b, i) & 63)));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === POWER9 per-lane integer negate (subop in VRA field) ===
++ // PPC_vnegw = 0x10060602 → XO=0x602=1538, VRA=6
++ // PPC_vnegd = 0x10070602 → XO=0x602=1538, VRA=7
++ case 1538:
++ if (vra == 6) { // vnegw
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, (uint32_t)(-LANE_S32(b, i)));
++ }
++ } else if (vra == 7) { // vnegd
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, (uint64_t)(-LANE_S64(b, i)));
++ }
++ } else {
++ MOZ_CRASH_UNSAFE_PRINTF("decodeVMX XO=1538: unknown subop %u", vra);
++ }
++ setVRBytes(vrt, r); break;
++
++ // === POWER10 vextract{b,h,w,d}m (XO=1602=0x642) ===
++ // RT (GPR) gets the wasm-spec bitmask in low 16/8/4/2 bits. UIM at
++ // bits 11..15 (= sim `vra`) selects lane width: 8=byte, 9=halfword,
++ // 10=word, 11=doubleword.
++ case 1602: {
++ uint64_t result = 0;
++ switch (vra) {
++ case 8: // vextractbm: 16 byte lanes
++ for (int i = 0; i < 16; i++) {
++ if (b[i] & 0x80) result |= (1ULL << i);
++ }
++ break;
++ case 9: // vextracthm: 8 halfword lanes; MSB lives at byte 2i+1
++ for (int i = 0; i < 8; i++) {
++ if (b[2 * i + 1] & 0x80) result |= (1ULL << i);
++ }
++ break;
++ case 10: // vextractwm: 4 word lanes; MSB at byte 4i+3
++ for (int i = 0; i < 4; i++) {
++ if (b[4 * i + 3] & 0x80) result |= (1ULL << i);
++ }
++ break;
++ case 11: // vextractdm: 2 dword lanes; MSB at byte 8i+7
++ for (int i = 0; i < 2; i++) {
++ if (b[8 * i + 7] & 0x80) result |= (1ULL << i);
++ }
++ break;
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF("decodeVMX XO=1602: unknown UIM %u", vra);
++ }
++ // vrt is the GPR target (RT field at bits 6..10).
++ setRegister(int(vrt), int64_t(result));
++ goto vmx_done; // Skip the trailing setVRBytes used by VR-targeting ops.
++ }
++
++ // === POWER9 vinsertb (XO=781) / vinserth (XO=845) ===
++ // Insert byte/halfword from a VR (NOT a GPR) at an immediate byte
++ // position UIM (BE).
++ // vinsertb: VRT.byte[UIM] (BE) ← VRB.byte[7] (BE)
++ // vinserth: VRT.byte[UIM] (BE) ← VRB.byte[6] (BE)
++ // VRT.byte[UIM+1] (BE) ← VRB.byte[7] (BE)
++ // BE byte i ↔ LE byte (15-i). So VRB.byte[6] (BE) = LE byte 9 of
++ // VRB, VRB.byte[7] (BE) = LE byte 8. (Byte-pair order matters.)
++ case 781: // vinsertb
++ case 845: { // vinserth
++ getVRBytes(vrt, r); // start from current VRT
++ if (xo == 845) {
++ // vinserth: copy 2-byte halfword (BE bytes 6..7 of VRB).
++ r[15 - uimm] = b[9]; // BE byte UIM ← VRB BE byte 6
++ r[14 - uimm] = b[8]; // BE byte UIM+1 ← VRB BE byte 7
++ } else {
++ // vinsertb: copy a single byte (BE byte 7 of VRB).
++ r[15 - uimm] = b[8]; // BE byte UIM ← VRB BE byte 7
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === POWER9 vextractub (XO=525) / vextractuh (XO=589) ===
++ // Extract one byte/halfword from VRB at immediate BE position UIM
++ // and place it at BE byte 7 of VRT, with all other bytes of VRT
++ // zeroed. Companion to vinsertb/h; chooses an immediate BE position
++ // and lands the result at the low byte of VRT (= low byte of mfvsrd).
++ // vextractub: VRT.byte[7] (BE) ← VRB.byte[UIM] (BE), rest = 0
++ // vextractuh: VRT.byte[6] (BE) ← VRB.byte[UIM] (BE)
++ // VRT.byte[7] (BE) ← VRB.byte[UIM+1] (BE), rest = 0
++ case 525: // vextractub
++ case 589: { // vextractuh
++ memset(r, 0, sizeof(r));
++ if (xo == 589) {
++ r[9] = b[15 - uimm]; // VRT BE byte 6 ← VRB BE byte UIM
++ r[8] = b[14 - uimm]; // VRT BE byte 7 ← VRB BE byte UIM+1
++ } else {
++ r[8] = b[15 - uimm]; // VRT BE byte 7 ← VRB BE byte UIM
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === POWER10 vinsbrx (XO=783) / vinshrx (XO=847) ===
++ // Right-indexed (LE-natural) byte/halfword insert from GPR. RA's
++ // low 4 bits supply the byte position (mod 16); for vinshrx the
++ // position is also masked to even (& 0xE) so the halfword is
++ // 2-byte aligned. RB's low 8 / 16 bits are inserted; other bytes
++ // of VRT are unchanged. RA and RB are GPRs (NOT VRs) — sim's
++ // pre-fetched `a` and `b` from getVRBytes are unused here.
++ case 783: // vinsbrx
++ case 847: { // vinshrx
++ uint64_t ra_val = U64(getRegister(int(vra)));
++ uint64_t rb_val = U64(getRegister(int(vrb)));
++ getVRBytes(vrt, r); // start from current VRT
++ const bool isHalf = (xo == 847);
++ const uint32_t pos = isHalf ? uint32_t(ra_val & 0xEULL)
++ : uint32_t(ra_val & 0xFULL);
++ r[pos] = (uint8_t)(rb_val & 0xFFULL);
++ if (isHalf) {
++ r[pos + 1] = (uint8_t)((rb_val >> 8) & 0xFFULL);
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === POWER10 vinsw (XO=207) / vinsd (XO=463) ===
++ // VRT[UIM*8:UIM*8+N-1] (BE bits) ← RB low N bits, where N = 32 or 64.
++ // RB is a GPR (the `vrb` field at sim bits 15..11). UIM is at sim
++ // bits 20..16 (= the `uimm` / `vra` decode). Other bytes of VRT are
++ // unchanged, so we read VRT first then patch UIM..UIM+(N/8-1).
++ case 207: // vinsw
++ case 463: { // vinsd
++ uint64_t rb_val = U64(getRegister(int(vrb)));
++ getVRBytes(vrt, r); // start from current VRT
++ const int width = (xo == 463) ? 8 : 4; // bytes
++ // BE byte UIM+i of VRT = LE byte (15 - UIM - i).
++ // For vinsd, RB.dword[0] (BE) = bits 56..63 of rb_val (host LSB end
++ // of the GPR — recall U64() puts the canonical 64-bit value in a
++ // host uint64_t with bit 63 = MSB).
++ // For vinsw, source is RB[32:63] = low 32 bits of rb_val.
++ uint64_t src = (width == 8) ? rb_val : (rb_val & 0xFFFFFFFFULL);
++ const int srcMsbShift = (width * 8) - 8; // 56 or 24
++ for (int i = 0; i < width; i++) {
++ r[15 - uimm - i] = (uint8_t)(src >> (srcMsbShift - 8 * i));
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === POWER8+ vbpermq (XO=1356=0x54C): per-byte bit permute ===
++ // For each i in 0..15, take VRB BE-byte i (= sim b[15-i]); if its
++ // high bit is set, perm[i]=0; else perm[i] = bit at BE position
++ // (low 7 bits) of VRA. ISA says perm[0..15] go into VRT.dw[1] low
++ // 16 bits, but on real LE silicon the bitmap is observable in dw[0]
++ // low 16 bits — i.e., recoverable via mfvsrd. Match that observable
++ // behaviour: write the bitmap into sim bytes[8..9] (where mfvsrd
++ // reads dw[0] from), zero the rest.
++ case 1356: {
++ uint8_t perm[16];
++ for (int k = 0; k < 16; k++) {
++ uint8_t ctl = b[15 - k];
++ if (ctl & 0x80) {
++ perm[k] = 0;
++ } else {
++ int p = ctl & 0x7F;
++ int le_idx = 15 - (p / 8);
++ int bit_in_byte = 7 - (p % 8);
++ perm[k] = (a[le_idx] >> bit_in_byte) & 1;
++ }
++ }
++ uint8_t lo = 0, hi = 0;
++ for (int k = 0; k < 8; k++) hi = (hi << 1) | perm[k];
++ for (int k = 8; k < 16; k++) lo = (lo << 1) | perm[k];
++ for (int i = 0; i < 16; i++) r[i] = 0;
++ r[8] = lo;
++ r[9] = hi;
++ setVRBytes(vrt, r); break;
++ }
++
++ // VA-form ops vmladduhm (XO=34), vsel (XO=42), vperm (XO=43) are
++ // peeled off in the pre-dispatch above (see "VA-form pre-dispatch"
++ // comment near the top of this function), since the 11-bit XO
++ // mask conflates VRC into the case label.
++
++ // === Unpack high signed (BE-numbering = LE indices 8..15) ===
++ // vupkhsb: VRT[i] = sign_extend_to_16(VRA[i+0..7]). On LE storage with
++ // BE-named "high" being the low-indexed bytes, vupkhsb sign-extends the
++ // low 8 bytes of VRA into 8 halfwords. PPC64LE wasm calls these the
++ // "high" lanes per PPC convention; the JIT compensates internally via
++ // the vupklsb/vupkhsb swap documented in MacroAssembler-ppc64-inl.h.
++ case 526: // vupkhsb (high signed byte → halfword)
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, (uint16_t)(int16_t)LANE_S8(b, 8 + i));
++ }
++ setVRBytes(vrt, r); break;
++ case 590: // vupkhsh (high signed halfword → word)
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, (uint32_t)(int32_t)LANE_S16(b, 4 + i));
++ }
++ setVRBytes(vrt, r); break;
++ case 1614: // vupkhsw (high signed word → dword) POWER8+
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, (uint64_t)(int64_t)LANE_S32(b, 2 + i));
++ }
++ setVRBytes(vrt, r); break;
++ case 654: // vupklsb (low signed byte → halfword) — PPC LE: takes high lanes
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, (uint16_t)(int16_t)LANE_S8(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 718: // vupklsh (low signed halfword → word)
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i, (uint32_t)(int32_t)LANE_S16(b, i));
++ }
++ setVRBytes(vrt, r); break;
++ case 1742: // vupklsw (low signed word → dword)
++ for (int i = 0; i < 2; i++) {
++ SET_LANE_U64(r, i, (uint64_t)(int64_t)LANE_S32(b, i));
++ }
++ setVRBytes(vrt, r); break;
++
++ // === Pack (saturate or modulo) ===
++ //
++ // vpk* definitions are BE-specified:
++ // VT.byte[0..7] = saturate(VA.halfword[0..7]), VT.byte[8..15] =
++ // saturate(VB.halfword[0..7]) (BE-numbered throughout). On
++ // PPC64LE register storage that inverts to: LE bytes 0-7 = VB's
++ // saturated halfwords, LE bytes 8-15 = VA's.
++ //
++ // vpkshus = XO 270 (s16 → u8 sat)
++ // vpkshss = XO 398 (s16 → s8 sat)
++ // vpkswus = XO 334 (s32 → u16 sat)
++ // vpkswss = XO 462 (s32 → s16 sat)
++ // The sim previously had three of these four labels rotated
++ // (270=vpkshss, 334=vpkshus, 398=vpkswus) so every i8x16/i16x8
++ // narrow_* call silently used the wrong saturation kind or
++ // lane width — vpkshss was completely absent.
++ case 398: { // vpkshss (signed halfword → signed byte)
++ for (int i = 0; i < 8; i++) {
++ int v = LANE_S16(b, i);
++ if (v > INT8_MAX) v = INT8_MAX;
++ if (v < INT8_MIN) v = INT8_MIN;
++ SET_LANE_U8(r, i, (uint8_t)(int8_t)v);
++ }
++ for (int i = 0; i < 8; i++) {
++ int v = LANE_S16(a, i);
++ if (v > INT8_MAX) v = INT8_MAX;
++ if (v < INT8_MIN) v = INT8_MIN;
++ SET_LANE_U8(r, 8 + i, (uint8_t)(int8_t)v);
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 462: { // vpkswss (signed word → signed halfword)
++ for (int i = 0; i < 4; i++) {
++ int64_t v = LANE_S32(b, i);
++ if (v > INT16_MAX) v = INT16_MAX;
++ if (v < INT16_MIN) v = INT16_MIN;
++ SET_LANE_U16(r, i, (uint16_t)(int16_t)v);
++ }
++ for (int i = 0; i < 4; i++) {
++ int64_t v = LANE_S32(a, i);
++ if (v > INT16_MAX) v = INT16_MAX;
++ if (v < INT16_MIN) v = INT16_MIN;
++ SET_LANE_U16(r, 4 + i, (uint16_t)(int16_t)v);
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 270: { // vpkshus (signed halfword → unsigned byte, sat)
++ for (int i = 0; i < 8; i++) {
++ int v = LANE_S16(b, i);
++ if (v > UINT8_MAX) v = UINT8_MAX;
++ if (v < 0) v = 0;
++ SET_LANE_U8(r, i, (uint8_t)v);
++ }
++ for (int i = 0; i < 8; i++) {
++ int v = LANE_S16(a, i);
++ if (v > UINT8_MAX) v = UINT8_MAX;
++ if (v < 0) v = 0;
++ SET_LANE_U8(r, 8 + i, (uint8_t)v);
++ }
++ setVRBytes(vrt, r); break;
++ }
++ case 334: { // vpkswus (signed word → unsigned halfword, sat)
++ for (int i = 0; i < 4; i++) {
++ int64_t v = LANE_S32(b, i);
++ if (v > UINT16_MAX) v = UINT16_MAX;
++ if (v < 0) v = 0;
++ SET_LANE_U16(r, i, (uint16_t)v);
++ }
++ for (int i = 0; i < 4; i++) {
++ int64_t v = LANE_S32(a, i);
++ if (v > UINT16_MAX) v = UINT16_MAX;
++ if (v < 0) v = 0;
++ SET_LANE_U16(r, 4 + i, (uint16_t)v);
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === POWER9 compare not-equal (vcmpne{b,h,w}) — Rc=0 and Rc=1 ===
++ case 7: // vcmpneb
++ case 1031: // vcmpneb.
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, LANE_U8(a, i) != LANE_U8(b, i) ? 0xFF : 0);
++ }
++ VCMP_DONE(16, 1); break;
++ case 71: // vcmpneh
++ case 1095: // vcmpneh.
++ for (int i = 0; i < 8; i++) {
++ SET_LANE_U16(r, i, LANE_U16(a, i) != LANE_U16(b, i) ? 0xFFFF : 0);
++ }
++ VCMP_DONE(8, 2); break;
++ case 135: // vcmpnew
++ case 1159: // vcmpnew.
++ for (int i = 0; i < 4; i++) {
++ SET_LANE_U32(r, i,
++ LANE_U32(a, i) != LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
++ }
++ VCMP_DONE(4, 4); break;
++ #undef VCMP_DONE
++
++ // === Population count per byte (POWER8) ===
++ case 1795: { // vpopcntb (XO 0x703 = 1795). VRA field unused.
++ for (int i = 0; i < 16; i++) {
++ SET_LANE_U8(r, i, (uint8_t)__builtin_popcount(LANE_U8(b, i)));
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++ // === vsldoi: VRT = (VRA || VRB) shifted left by SH bytes (SH at bits 22..25) ===
++ case 44: case 45: case 46: case 47: {
++ // SH is at bits 22..25 (PPC) → LSB bits 6..9 of the instruction →
++ // (instructionBits >> 6) & 0xF. Our XO mask already bottoms-out at
++ // bit 0, so extract from the raw instruction.
++ uint32_t sh = (instr->instructionBits() >> 6) & 0xF;
++ uint8_t cat[32];
++ memcpy(cat, a, 16);
++ memcpy(cat + 16, b, 16);
++ for (int i = 0; i < 16; i++) {
++ r[i] = cat[sh + i];
++ }
++ setVRBytes(vrt, r); break;
++ }
++
++
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodeVMX: unimplemented XO=%u (instruction 0x%08x)", xo,
++ instr->instructionBits());
++ }
++
++vmx_done:
++ #undef LANE_U8
++ #undef LANE_S8
++ #undef LANE_U16
++ #undef LANE_S16
++ #undef LANE_U32
++ #undef LANE_S32
++ #undef LANE_U64
++ #undef LANE_S64
++ #undef SET_LANE_U8
++ #undef SET_LANE_U16
++ #undef SET_LANE_U32
++ #undef SET_LANE_U64
++ ; // empty stmt for label
++}
++
++// -----------------------------------------------------------------------------
++// decodeVSX: Major opcode 60 (XX1-form, XX2-form)
++// mfvsrd, mtvsrd, mtvsrwz, mtvsrws, xscvdpsp, xscvdpspn, xscvspdp,
++// xscvspdpn, xxbrd
++
++void Simulator::decodeVSX(SimInstruction* instr) {
++ // VSX major opcode 60 covers XX1/XX2/XX3/XX4 forms. We dispatch XX4
++ // (xxsel) first because its XO is only 2 bits (at ISA 26-27 = sim
++ // bits 5-4), and the XC register field at ISA 21-25 would otherwise
++ // produce 32 different 9-bit XO values to enumerate in the switch.
++ // Peel off any instruction with XX4 XO=3 (xxsel). No XX2/XX3 op currently
++ // emitted by the JIT has sim bits (5,4) == 3.
++ if (instr->bits(5, 4) == 3) {
++ // xxsel XT,XA,XB,XC (VA-like XX4-form).
++ // XT[i] = (XA[i] & ~XC[i]) | (XB[i] & XC[i])
++ // Register fields: XA/XB/XT per-byte; XC at ISA bits 21-25 (sim
++ // bits 10-6) with CX extension at ISA bit 28 (sim bit 3).
++ int xa = int(instr->raValue() | (instr->bit(2) << 5));
++ int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ int xc = int(instr->bits(10, 6) | (instr->bit(3) << 5));
++ uint8_t ab[16], bb[16], cb[16], result[16];
++ getVSR128(xa, ab);
++ getVSR128(xb, bb);
++ getVSR128(xc, cb);
++ for (int i = 0; i < 16; i++) {
++ result[i] = (uint8_t)((ab[i] & ~cb[i]) | (bb[i] & cb[i]));
++ }
++ setVSR128(xt, result);
++ return;
++ }
++
++ // The remaining forms (XX1/XX2/XX3) share a 9-bit XO at ISA bits
++ // 21-29 (sim bits 10-2). For XX3 this is (8-bit XO << 1) | AX; for
++ // XX2 the full 9 bits are the XO (no AX field).
++ uint32_t xo = instr->bits(10, 2);
++ uint32_t rt = instr->rtValue();
++ uint32_t rb = instr->rbValue();
++
++ switch (xo) {
++ // xscvdpsp / xscvdpspn / xscvspdp / xscvspdpn / xxbrd are
++ // XX2-form: XT/XB are each 6-bit (5-bit field + TX/BX extension at
++ // sim bits 0/1). Post-Phase-2 the JIT emits these with Simd128
++ // targets (encoding 32-63), which require the extension bit to
++ // select VR-space instead of FPR-space. The previous code used
++ // only the 5-bit field, so any VR-space target silently clobbered
++ // FPR 0..31 and the post-splat fbits in splatX4 never reached the
++ // vector lanes.
++ case 265: {
++ // xscvdpsp: double→single with sNaN quieting. The ISA says
++ // result lands at XT[0:31] (BE word 0 = LE bytes 12..15) and
++ // XT[32:127] is "undefined". Real POWER9 silicon actually
++ // duplicates the result into BE word 1 as well, so the bytes
++ // at LE 8..11 hold the same single. The JIT's
++ // replaceLaneFloat32x4 lowering depends on this: it follows
++ // xscvdpspn with `xxinsertw …, 12`, which reads XB.word[1]
++ // (LE bytes 8..11). Zeroing those bytes here would silently
++ // lose the single under sim. Mirror HW.
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16];
++ getVSR128(xb, bb);
++ // Source double at BE DW0 = LE bytes 8..15 of xb.
++ uint64_t dbits = 0;
++ for (int i = 0; i < 8; i++) dbits |= ((uint64_t)bb[8 + i]) << (i * 8);
++ double frb;
++ memcpy(&frb, &dbits, sizeof(frb));
++ float result = demoteDoublePreservingNaN(frb);
++ uint32_t fbits;
++ memcpy(&fbits, &result, sizeof(fbits));
++ if ((fbits & 0x7F800000u) == 0x7F800000u && (fbits & 0x007FFFFFu) != 0) {
++ fbits |= 0x00400000u;
++ }
++ uint8_t out[16];
++ memset(out, 0, 8);
++ // BE word 1 (LE 8..11) and BE word 0 (LE 12..15) both = fbits.
++ for (int off : {8, 12}) {
++ out[off] = (uint8_t)(fbits);
++ out[off + 1] = (uint8_t)(fbits >> 8);
++ out[off + 2] = (uint8_t)(fbits >> 16);
++ out[off + 3] = (uint8_t)(fbits >> 24);
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 267: {
++ // xscvdpspn: same as xscvdpsp but non-signaling. Same HW-observed
++ // word-1 duplication (see xscvdpsp comment above).
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16];
++ getVSR128(xb, bb);
++ uint64_t dbits = 0;
++ for (int i = 0; i < 8; i++) dbits |= ((uint64_t)bb[8 + i]) << (i * 8);
++ double frb;
++ memcpy(&frb, &dbits, sizeof(frb));
++ float result = demoteDoublePreservingNaN(frb);
++ uint32_t fbits;
++ memcpy(&fbits, &result, sizeof(fbits));
++ uint8_t out[16];
++ memset(out, 0, 8);
++ for (int off : {8, 12}) {
++ out[off] = (uint8_t)(fbits);
++ out[off + 1] = (uint8_t)(fbits >> 8);
++ out[off + 2] = (uint8_t)(fbits >> 16);
++ out[off + 3] = (uint8_t)(fbits >> 24);
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 393: {
++ // xvcvdpsp: convert two doubles to two singles, replicating each
++ // result across its dword. BE words = [s(BE_dw0), s(BE_dw0),
++ // s(BE_dw1), s(BE_dw1)]. SIGNALING form per ISA: sNaN inputs are
++ // quieted (high-order fraction bit set in result).
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], out[16];
++ getVSR128(xb, bb);
++ uint32_t fbits[2];
++ // BE_dw0 = LE bytes 8..15, BE_dw1 = LE bytes 0..7.
++ for (int dw = 0; dw < 2; dw++) {
++ int leOff = (dw == 0) ? 8 : 0;
++ uint64_t dbits = 0;
++ for (int i = 0; i < 8; i++) {
++ dbits |= ((uint64_t)bb[leOff + i]) << (i * 8);
++ }
++ double frb;
++ memcpy(&frb, &dbits, sizeof(frb));
++ float result = demoteDoublePreservingNaN(frb);
++ memcpy(&fbits[dw], &result, sizeof(uint32_t));
++ if ((fbits[dw] & 0x7F800000u) == 0x7F800000u &&
++ (fbits[dw] & 0x007FFFFFu) != 0) {
++ fbits[dw] |= 0x00400000u; // quiet sNaN result
++ }
++ }
++ // LE words: [s(dw1), s(dw1), s(dw0), s(dw0)]
++ // (LE word 0 = BE word 3 = s(dw1); LE word 3 = BE word 0 = s(dw0)).
++ uint32_t leWords[4] = {fbits[1], fbits[1], fbits[0], fbits[0]};
++ for (int w = 0; w < 4; w++) {
++ out[w * 4] = (uint8_t)leWords[w];
++ out[w * 4 + 1] = (uint8_t)(leWords[w] >> 8);
++ out[w * 4 + 2] = (uint8_t)(leWords[w] >> 16);
++ out[w * 4 + 3] = (uint8_t)(leWords[w] >> 24);
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 216: // xvcvdpsxws: double → signed word, saturating, RTZ (vector)
++ case 200: { // xvcvdpuxws: double → unsigned word, saturating, RTZ (vector)
++ // src1 := XB.dword_BE[0]; src2 := XB.dword_BE[1]
++ // r1 := ConvertDPtoSat(src1); r2 := ConvertDPtoSat(src2)
++ // XT.word_BE[0] := r1; XT.word_BE[1] := r1 (replicated)
++ // XT.word_BE[2] := r2; XT.word_BE[3] := r2 (replicated)
++ // Saturation: signed clamps to [INT32_MIN, INT32_MAX] with NaN→INT32_MIN;
++ // unsigned clamps to [0, UINT32_MAX] with NaN→0 and neg→0.
++ // BE_dw0 = LE bytes 8..15; BE_dw1 = LE bytes 0..7.
++ bool isSigned = (xo == 216);
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], out[16];
++ getVSR128(xb, bb);
++ const int srcOffsets[2] = {8, 0}; // BE_dw0 (LE 8..15), BE_dw1 (LE 0..7)
++ uint32_t results[2];
++ for (int lane = 0; lane < 2; lane++) {
++ uint64_t dbits = 0;
++ for (int j = 0; j < 8; j++) {
++ dbits |= ((uint64_t)bb[srcOffsets[lane] + j]) << (j * 8);
++ }
++ double dval;
++ memcpy(&dval, &dbits, sizeof(dval));
++ if (std::isnan(dval)) {
++ results[lane] = isSigned ? 0x80000000u : 0u;
++ } else if (isSigned) {
++ if (dval >= 2147483647.0) {
++ results[lane] = 0x7FFFFFFFu;
++ } else if (dval <= -2147483648.0) {
++ results[lane] = 0x80000000u;
++ } else {
++ results[lane] = (uint32_t)(int32_t)dval; // RTZ
++ }
++ } else { // unsigned
++ if (dval <= 0.0) {
++ results[lane] = 0u;
++ } else if (dval >= 4294967295.0) {
++ results[lane] = 0xFFFFFFFFu;
++ } else {
++ results[lane] = (uint32_t)dval; // RTZ
++ }
++ }
++ }
++ // Replicated layout: BE words [r1, r1, r2, r2]; in LE bytes
++ // [r2, r2, r1, r1] (LE word 0 = BE word 3 = r2, LE word 3 = BE word 0 = r1).
++ uint32_t leWords[4] = {results[1], results[1], results[0], results[0]};
++ for (int w = 0; w < 4; w++) {
++ out[w * 4] = (uint8_t)leWords[w];
++ out[w * 4 + 1] = (uint8_t)(leWords[w] >> 8);
++ out[w * 4 + 2] = (uint8_t)(leWords[w] >> 16);
++ out[w * 4 + 3] = (uint8_t)(leWords[w] >> 24);
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 248: // xvcvsxwdp: signed word → double (vector)
++ case 232: { // xvcvuxwdp: unsigned word → double (vector)
++ // src1 := XB.word_BE[0]; src2 := XB.word_BE[2]
++ // XT.dword_BE[0] := Convert(src1); XT.dword_BE[1] := Convert(src2)
++ // BE word 0 = LE bytes 12..15; BE word 2 = LE bytes 4..7.
++ // Output BE dword 0 = LE bytes 8..15; BE dword 1 = LE bytes 0..7.
++ // No NaN handling needed (integer source).
++ bool isSigned = (xo == 248);
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], out[16];
++ getVSR128(xb, bb);
++ const int srcOffsets[2] = {12, 4};
++ const int dstOffsets[2] = {8, 0};
++ for (int lane = 0; lane < 2; lane++) {
++ uint32_t bits = (uint32_t)bb[srcOffsets[lane]] |
++ ((uint32_t)bb[srcOffsets[lane] + 1] << 8) |
++ ((uint32_t)bb[srcOffsets[lane] + 2] << 16) |
++ ((uint32_t)bb[srcOffsets[lane] + 3] << 24);
++ double dval = isSigned ? (double)(int32_t)bits : (double)bits;
++ uint64_t dbits;
++ memcpy(&dbits, &dval, sizeof(dbits));
++ for (int i = 0; i < 8; i++) {
++ out[dstOffsets[lane] + i] = (uint8_t)(dbits >> (i * 8));
++ }
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 457: {
++ // xvcvspdp: convert two singles to two doubles. SIGNALING form
++ // per ISA: sNaN inputs are quieted in the result (bit 51 set).
++ // src1 := XB.word_BE[0]; src2 := XB.word_BE[2]
++ // XT.dword_BE[0] := ConvertSPtoDP(src1)
++ // XT.dword_BE[1] := ConvertSPtoDP(src2)
++ // BE word 0 = LE bytes 12..15; BE word 2 = LE bytes 4..7.
++ // Output BE dword 0 = LE bytes 8..15; BE dword 1 = LE bytes 0..7.
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], out[16];
++ getVSR128(xb, bb);
++ // src1 from BE word 0 (LE 12..15), output dword at LE 8..15.
++ // src2 from BE word 2 (LE 4..7), output dword at LE 0..7.
++ const int srcOffsets[2] = {12, 4}; // LE byte offsets of word_BE[0], word_BE[2]
++ const int dstOffsets[2] = {8, 0}; // LE byte offsets of dword_BE[0], dword_BE[1]
++ for (int lane = 0; lane < 2; lane++) {
++ uint32_t fbits = (uint32_t)bb[srcOffsets[lane]] |
++ ((uint32_t)bb[srcOffsets[lane] + 1] << 8) |
++ ((uint32_t)bb[srcOffsets[lane] + 2] << 16) |
++ ((uint32_t)bb[srcOffsets[lane] + 3] << 24);
++ float fval;
++ memcpy(&fval, &fbits, sizeof(fval));
++ double dval = promoteFloatPreservingNaN(fval);
++ uint64_t dbits;
++ memcpy(&dbits, &dval, sizeof(dbits));
++ if ((dbits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
++ (dbits & 0x000FFFFFFFFFFFFFULL) != 0) {
++ dbits |= 0x0008000000000000ULL; // quiet sNaN result
++ }
++ for (int i = 0; i < 8; i++) {
++ out[dstOffsets[lane] + i] = (uint8_t)(dbits >> (i * 8));
++ }
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 329: {
++ // xscvspdp: single→double from BE word 0 of XB. SIGNALING form;
++ // an sNaN input yields a qNaN result with the high-order
++ // fraction bit (quiet bit) set.
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16];
++ getVSR128(xb, bb);
++ // BE word 0 = LE bytes 12..15 of xb.
++ uint32_t fbits = (uint32_t)bb[12] |
++ ((uint32_t)bb[13] << 8) |
++ ((uint32_t)bb[14] << 16) |
++ ((uint32_t)bb[15] << 24);
++ float fval;
++ memcpy(&fval, &fbits, sizeof(fval));
++ double dval = promoteFloatPreservingNaN(fval);
++ uint64_t dbits;
++ memcpy(&dbits, &dval, sizeof(dbits));
++ // Quiet any NaN result (signaling form): set bit 51 of mantissa.
++ if ((dbits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
++ (dbits & 0x000FFFFFFFFFFFFFULL) != 0) {
++ dbits |= 0x0008000000000000ULL;
++ }
++ uint8_t out[16];
++ memset(out, 0, 8);
++ for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(dbits >> (i * 8));
++ setVSR128(xt, out);
++ break;
++ }
++ case 331: {
++ // xscvspdpn: non-signaling variant of xscvspdp.
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16];
++ getVSR128(xb, bb);
++ uint32_t fbits = (uint32_t)bb[12] |
++ ((uint32_t)bb[13] << 8) |
++ ((uint32_t)bb[14] << 16) |
++ ((uint32_t)bb[15] << 24);
++ float fval;
++ memcpy(&fval, &fbits, sizeof(fval));
++ double dval = promoteFloatPreservingNaN(fval);
++ uint64_t dbits;
++ memcpy(&dbits, &dval, sizeof(dbits));
++ uint8_t out[16];
++ memset(out, 0, 8);
++ for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(dbits >> (i * 8));
++ setVSR128(xt, out);
++ break;
++ }
++ case 347: {
++ // POWER9 XX2-form ops sharing XO=347; disambiguated by the 5-bit
++ // A immediate (sim bits 20..16):
++ // A=0 -> xsxexpdp (extract biased exponent into 11 LSBs of XT.dw0)
++ // A=16 -> xscvhpdp (FP16 -> FP64)
++ // A=17 -> xscvdphp (FP64 -> FP16)
++ // Half placement: the FP16 value lives at LE bytes 8..9 of
++ // the VSR (= BE bits 48..63 of
++ // dword[0]), with the rest of dword[0] zeroed. This matches the
++ // lxsihzx layout already used by the JIT.
++ uint32_t aImm = (instr->instructionBits() >> 16) & 0x1F;
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], out[16];
++ getVSR128(xb, bb);
++ memset(out, 0, 16);
++ if (aImm == 17) {
++ // xscvdphp: read FP64 from BE 0..63 of XB (LE bytes 8..15),
++ // convert to FP16, place at LE bytes 8..9 of XT.
++ double d;
++ memcpy(&d, bb + 8, 8);
++ uint16_t h = js::float16(d).toRawBits();
++ out[8] = (uint8_t)(h & 0xFF);
++ out[9] = (uint8_t)((h >> 8) & 0xFF);
++ } else if (aImm == 16) {
++ // xscvhpdp: read FP16 from LE bytes 8..9 of XB, convert to FP64,
++ // place at LE bytes 8..15 of XT.
++ uint16_t h = (uint16_t)bb[8] | ((uint16_t)bb[9] << 8);
++ double d = static_cast<double>(js::float16::fromRawBits(h));
++ memcpy(out + 8, &d, 8);
++ } else if (aImm == 0) {
++ // xsxexpdp: read FP64 from LE bytes 8..15 of XB, extract biased
++ // exponent (bits 1..11 of the IEEE-754 double = bits 52..62 of
++ // the 64-bit pattern), place into XT.dw0 with rest zeroed.
++ uint64_t bits = 0;
++ for (int i = 0; i < 8; i++) bits |= uint64_t(bb[8 + i]) << (i * 8);
++ uint64_t exp = (bits >> 52) & 0x7FF;
++ for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(exp >> (i * 8));
++ } else {
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodeVSX XO=347 with unexpected A=%u (instr 0x%08x)",
++ aImm, instr->instructionBits());
++ }
++ setVSR128(xt, out);
++ break;
++ }
++ case 475: {
++ // xxbrd: byte-reverse each doubleword.
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], out[16];
++ getVSR128(xb, bb);
++ for (int i = 0; i < 8; i++) out[i] = bb[7 - i];
++ for (int i = 0; i < 8; i++) out[8 + i] = bb[15 - i];
++ setVSR128(xt, out);
++ break;
++ }
++
++ // === XX3-form scalar: xsmaxjdp / xsminjdp (POWER9) ===
++ //
++ // xs{max,min}jdp XT, XA, XB. Scalar inputs at BE bits 0..63 of
++ // XA / XB (= LE bytes 8..15); result lands at BE 0..63 of XT
++ // (upper bits "undefined" per ISA).
++ //
++ // Semantics match ECMA-262 Math.{max,min} / wasm f64.{max,min}:
++ // - NaN: if A is NaN return A; else if B is NaN return B. sNaN
++ // payload preserved bit-for-bit (NOT quieted).
++ // - ±0 tie: signed-zero ordering. xsmaxjdp returns +0 for any
++ // mix of (-0, +0); xsminjdp returns -0.
++ // - Otherwise: standard IEEE max / min.
++ case 288: case 289: // xsmaxjdp (XO8=144 → 9-bit 288/289)
++ case 304: case 305: { // xsminjdp (XO8=152 → 9-bit 304/305)
++ int xa = int(instr->raValue() | (instr->bit(2) << 5));
++ int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t ab[16], bb[16], out[16];
++ getVSR128(xa, ab);
++ getVSR128(xb, bb);
++ double a, b;
++ memcpy(&a, ab + 8, 8);
++ memcpy(&b, bb + 8, 8);
++ bool isMax = (xo >> 1) == 144;
++ double r;
++ if (std::isnan(a)) {
++ r = a;
++ } else if (std::isnan(b)) {
++ r = b;
++ } else if (a == 0.0 && b == 0.0) {
++ // Signed-zero ordering: max picks +0, min picks -0.
++ if (isMax) {
++ r = std::signbit(a) ? b : a;
++ } else {
++ r = std::signbit(a) ? a : b;
++ }
++ } else {
++ r = isMax ? std::max(a, b) : std::min(a, b);
++ }
++ memset(out, 0, 8);
++ memcpy(out + 8, &r, 8);
++ setVSR128(xt, out);
++ break;
++ }
++
++ // --- VSX XX3-form: xxpermdi ---
++ //
++ // xxpermdi XT, XA, XB, DM:
++ // XT.DW0 = XA.DW(DM[0])
++ // XT.DW1 = XB.DW(DM[1])
++ // In BE, DW0 is MSB-side, DW1 is LSB-side. On PPC64LE register
++ // storage, DW0 = LE bytes 8-15 and DW1 = LE bytes 0-7. The sim's
++ // previous implementation used the reversed "DW0 = LE 0-7"
++ // convention which cancelled for self-swap round-trips but
++ // produced wrong halves when chained with ISA-correct ops
++ // (mtvsrd, xxspltw, mfvsrd).
++ case 20: case 21: // xxpermdi DM=0
++ case 84: case 85: // xxpermdi DM=1
++ case 148: case 149: // xxpermdi DM=2 (= xxswapd when XA==XB)
++ case 212: case 213: { // xxpermdi DM=3
++ uint8_t dm_hi = (xo >> 7) & 1; // DM[0]
++ uint8_t dm_lo = (xo >> 6) & 1; // DM[1]
++ int xa = int(instr->raValue() | (instr->bit(2) << 5));
++ int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t xa_bytes[16], xb_bytes[16], result[16];
++ getVSR128(xa, xa_bytes);
++ getVSR128(xb, xb_bytes);
++ // DW0 in LE storage is bytes 8-15; DW1 is bytes 0-7.
++ // XT.DW0 (result[8..15]) = XA.DW(dm_hi)
++ // XT.DW1 (result[0..7]) = XB.DW(dm_lo)
++ // DW(0) is at LE 8, DW(1) is at LE 0.
++ memcpy(result + 8, xa_bytes + (dm_hi ? 0 : 8), 8);
++ memcpy(result, xb_bytes + (dm_lo ? 0 : 8), 8);
++ setVSR128(xt, result);
++ break;
++ }
++
++ // --- VSX logical (XX3-form, primary opcode 60) ---
++ //
++ // Each takes two 6-bit VSR sources XA/XB and writes 6-bit VSR
++ // destination XT. 8-bit ISA XO at bits 21-28; our
++ // 9-bit XO extraction (bits 10:2) includes the AX bit at position 0,
++ // so each op appears as two consecutive values (AX=0 and AX=1).
++ //
++ // xxland XT,XA,XB XO=130 (9-bit: 260, 261) XT = XA & XB
++ // xxlandc XT,XA,XB XO=138 (276, 277) XT = XA & ~XB
++ // xxlor XT,XA,XB XO=146 (292, 293) XT = XA | XB
++ // xxlxor XT,XA,XB XO=154 (308, 309) XT = XA ^ XB
++ // xxlnor XT,XA,XB XO=162 (324, 325) XT = ~(XA | XB)
++ // xxlorc XT,XA,XB XO=170 (340, 341) XT = XA | ~XB
++ // xxlnand XT,XA,XB XO=178 (356, 357) XT = ~(XA & XB)
++ // xxleqv XT,XA,XB XO=186 (372, 373) XT = ~(XA ^ XB)
++ //
++ // The encoding constants in Assembler-ppc64.h match: PPC_xxlor=0xF0000490
++ // has bits 4,7,10 set in its base (XO=146 in the 8-bit field), which
++ // under the simulator's 9-bit extraction gives 2*146=292 (AX=0 default).
++ case 260: case 261: // xxland
++ case 276: case 277: // xxlandc
++ case 292: case 293: // xxlor
++ case 308: case 309: // xxlxor
++ case 324: case 325: // xxlnor
++ case 340: case 341: // xxlorc
++ case 356: case 357: // xxlnand
++ case 372: case 373: // xxleqv
++ {
++ int xa = int(instr->raValue() | (instr->bit(2) << 5));
++ int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t a_bytes[16], b_bytes[16], result[16];
++ getVSR128(xa, a_bytes);
++ getVSR128(xb, b_bytes);
++ // Dispatch on the 8-bit ISA XO (ignoring AX bit at position 0).
++ uint32_t xo8 = xo >> 1;
++ for (int i = 0; i < 16; i++) {
++ uint8_t a = a_bytes[i], b = b_bytes[i];
++ switch (xo8) {
++ case 130: result[i] = a & b; break; // xxland
++ case 138: result[i] = a & ~b; break; // xxlandc
++ case 146: result[i] = a | b; break; // xxlor
++ case 154: result[i] = a ^ b; break; // xxlxor
++ case 162: result[i] = (uint8_t)~(a | b); break; // xxlnor
++ case 170: result[i] = a | (uint8_t)~b; break; // xxlorc
++ case 178: result[i] = (uint8_t)~(a & b); break; // xxlnand
++ case 186: result[i] = (uint8_t)~(a ^ b); break; // xxleqv
++ }
++ }
++ setVSR128(xt, result);
++ break;
++ }
++
++ // === XX2-form: xxspltw (splat word from VRB[UIM] to all 4 lanes) ===
++ //
++ // xxspltw: UIM selects one of four words in BE numbering. UIM=0
++ // → BE word 0 (MSB side of the 128 bits). On PPC64LE register
++ // storage that maps to LE word (3 - UIM). With the input
++ // {0x11111111, 0x22222222, 0x33333333, 0x44444444}: UIM=0
++ // splats 0x44444444 (= LE word 3), UIM=3 splats 0x11111111
++ // (= LE word 0). The JIT emits xxspltw UIM=1 after mtvsrd on the
++ // POWER8 splatX4 path — mtvsrd puts the GPR's low 32 bits in BE
++ // word 1 (= LE word 2 on HW), so xxspltw UIM=1 picks up exactly
++ // that word and splats it to every lane.
++ case 164: { // xxspltw
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint32_t uim = (instr->instructionBits() >> 16) & 0x3;
++ uint32_t leIdx = 3 - uim; // BE word UIM → LE word (3-UIM)
++ uint8_t bb[16], result[16];
++ getVSR128(xb, bb);
++ uint32_t word = (uint32_t)bb[leIdx * 4] |
++ ((uint32_t)bb[leIdx * 4 + 1] << 8) |
++ ((uint32_t)bb[leIdx * 4 + 2] << 16) |
++ ((uint32_t)bb[leIdx * 4 + 3] << 24);
++ for (int i = 0; i < 4; i++) {
++ result[i * 4] = (uint8_t)(word & 0xFF);
++ result[i * 4 + 1] = (uint8_t)((word >> 8) & 0xFF);
++ result[i * 4 + 2] = (uint8_t)((word >> 16) & 0xFF);
++ result[i * 4 + 3] = (uint8_t)((word >> 24) & 0xFF);
++ }
++ setVSR128(xt, result);
++ break;
++ }
++
++ // === XX2-form: xxextractuw (extract word at BE byte UIM, place at BE word 1) ===
++ //
++ // xxextractuw XT, XB, UIM:
++ // Bytes [4:7] of XT receive bytes [UIM:UIM+3] of XB. Bytes [0:3]
++ // and [8:15] of XT are set to zero.
++ // UIM ∈ {0, 4, 8, 12} (caller responsible for alignment).
++ // BE byte i ↔ LE byte (15-i), so the word at XB BE bytes UIM..UIM+3
++ // sits at XB LE bytes (12-UIM)..(15-UIM), and lands at XT LE bytes
++ // 8..11 (= XT BE word 1).
++ case 165: { // xxextractuw
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint32_t uim = (instr->instructionBits() >> 16) & 0xF;
++ uint8_t bb[16], result[16];
++ getVSR128(xb, bb);
++ memset(result, 0, sizeof(result));
++ // result.LE[8..11] = XB.LE[(12-UIM)..(15-UIM)] (preserves byte order).
++ memcpy(result + 8, bb + (12 - uim), 4);
++ setVSR128(xt, result);
++ break;
++ }
++
++ case 180: {
++ // xxspltib XT, IMM8 (POWER9, ISA 3.0): splat 8-bit immediate to
++ // all 16 bytes of XT. The encoder writes `imm8 << 11`, so IMM8
++ // occupies LE bits 11..18; TX bit at LE bit 0 selects upper VSR.
++ uint32_t imm8 = instr->bits(18, 11);
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ uint8_t xt_bytes[16];
++ memset(xt_bytes, (uint8_t)imm8, 16);
++ setVSR128(xt, xt_bytes);
++ break;
++ }
++ case 181: {
++ // xxinsertw XT, XB, UIM (POWER9, ISA 3.0): copy XB[32..63] (the
++ // low 32 bits of XB's BE doubleword 0, which lives at LE bytes
++ // 8-11 of XB) into XT at BE byte position UIM. UIM ∈ {0,4,8,12};
++ // dest occupies XT LE bytes (12-UIM)..(15-UIM). Other bytes of
++ // XT are preserved. UIM at PPC bits 11-15 = LE bits 16-20; TX/BX
++ // at LE bits 0/1.
++ uint32_t uim = instr->bits(20, 16);
++ int xt = int(instr->rtValue() | (instr->bit(0) << 5));
++ int xb = int(instr->rbValue() | (instr->bit(1) << 5));
++ uint8_t xb_bytes[16], xt_bytes[16];
++ getVSR128(xb, xb_bytes);
++ getVSR128(xt, xt_bytes);
++ memcpy(xt_bytes + (12 - uim), xb_bytes + 8, 4);
++ setVSR128(xt, xt_bytes);
++ break;
++ }
++
++ // === XX2-form: xvabssp / xvabsdp (vector absolute value) ===
++ case 408: case 409: case 410: case 411: { // xvabssp + AX/BX bits
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], result[16];
++ getVSR128(xb, bb);
++ for (int i = 0; i < 4; i++) {
++ uint32_t bits = (uint32_t)bb[i * 4] |
++ ((uint32_t)bb[i * 4 + 1] << 8) |
++ ((uint32_t)bb[i * 4 + 2] << 16) |
++ ((uint32_t)bb[i * 4 + 3] << 24);
++ bits &= 0x7FFFFFFFu; // clear sign bit
++ result[i * 4] = (uint8_t)(bits & 0xFF);
++ result[i * 4 + 1] = (uint8_t)((bits >> 8) & 0xFF);
++ result[i * 4 + 2] = (uint8_t)((bits >> 16) & 0xFF);
++ result[i * 4 + 3] = (uint8_t)((bits >> 24) & 0xFF);
++ }
++ setVSR128(xt, result);
++ break;
++ }
++ case 472: case 473: case 474: { // xvabsdp (475 used by xxbrd)
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], result[16];
++ getVSR128(xb, bb);
++ for (int i = 0; i < 2; i++) {
++ uint64_t bits = 0;
++ for (int k = 0; k < 8; k++) bits |= ((uint64_t)bb[i * 8 + k]) << (k * 8);
++ bits &= 0x7FFFFFFFFFFFFFFFULL;
++ for (int k = 0; k < 8; k++) result[i * 8 + k] = (uint8_t)((bits >> (k * 8)) & 0xFF);
++ }
++ setVSR128(xt, result);
++ break;
++ }
++
++ // === XX2-form unary vector float ops (single XB operand, no AX) ===
++ //
++ // Encoding: opcode 60, bits 6-10=XT, 11-15 reserved, 16-20=XB,
++ // 21-29 = 9-bit XO (full field), 30=BX, 31=TX. Extraction gives us
++ // xo = XO9 directly (no AX bit). Every op below has a unique XO9.
++ //
++ // xvsqrtsp XO9=139 PPC_xvsqrtsp=0xF000022C
++ // xvsqrtdp XO9=203 PPC_xvsqrtdp=0xF000032C
++ // xvnegsp XO9=441 PPC_xvnegsp=0xF00006E4
++ // xvnegdp XO9=505 PPC_xvnegdp=0xF00007E4
++ // xvrspip XO9=169 PPC_xvrspip=0xF00002A4 (round +inf = ceil)
++ // xvrspiz XO9=153 PPC_xvrspiz=0xF0000264 (round toward 0 = trunc)
++ // xvrspim XO9=185 PPC_xvrspim=0xF00002E4 (round -inf = floor)
++ // xvrspic XO9=171 PPC_xvrspic=0xF00002AC (round per FPSCR)
++ // xvrdpip XO9=233 PPC_xvrdpip=0xF00003A4
++ // xvrdpiz XO9=217 PPC_xvrdpiz=0xF0000364
++ // xvrdpim XO9=249 PPC_xvrdpim=0xF00003E4
++ // xvrdpic XO9=235 PPC_xvrdpic=0xF00003AC
++ // xvcvspsxws XO9=152 PPC_xvcvspsxws=0xF0000260 (f32 → s32, sat)
++ // xvcvspuxws XO9=136 PPC_xvcvspuxws=0xF0000220 (f32 → u32, sat)
++ // xvcvsxwsp XO9=184 PPC_xvcvsxwsp=0xF00002E0 (s32 → f32)
++ // xvcvuxwsp XO9=168 PPC_xvcvuxwsp=0xF00002A0 (u32 → f32)
++ case 139: case 203: // xvsqrtsp / xvsqrtdp
++ case 441: case 505: // xvnegsp / xvnegdp
++ case 169: case 233: // xvrspip / xvrdpip (ceil)
++ case 153: case 217: // xvrspiz / xvrdpiz (trunc)
++ case 185: case 249: // xvrspim / xvrdpim (floor)
++ case 171: case 235: // xvrspic / xvrdpic (round-to-nearest)
++ case 136: case 152: // xvcvspuxws / xvcvspsxws
++ case 168: case 184: { // xvcvuxwsp / xvcvsxwsp
++ int xt = int(rt | (instr->bit(0) << 5));
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t bb[16], result[16];
++ getVSR128(xb, bb);
++ bool isSp = (xo == 139 || xo == 441 || xo == 169 || xo == 153 ||
++ xo == 185 || xo == 171 || xo == 136 || xo == 152 ||
++ xo == 168 || xo == 184);
++ auto getF32 = [](uint8_t* buf, int i) -> float {
++ uint32_t b = (uint32_t)buf[i * 4] |
++ ((uint32_t)buf[i * 4 + 1] << 8) |
++ ((uint32_t)buf[i * 4 + 2] << 16) |
++ ((uint32_t)buf[i * 4 + 3] << 24);
++ float f; memcpy(&f, &b, sizeof(f)); return f;
++ };
++ auto setF32 = [](uint8_t* buf, int i, float f) {
++ uint32_t b; memcpy(&b, &f, sizeof(b));
++ buf[i*4]=(uint8_t)b; buf[i*4+1]=(uint8_t)(b>>8);
++ buf[i*4+2]=(uint8_t)(b>>16); buf[i*4+3]=(uint8_t)(b>>24);
++ };
++ auto getF64 = [](uint8_t* buf, int i) -> double {
++ uint64_t b = 0;
++ for (int k=0;k<8;k++) b |= ((uint64_t)buf[i*8+k])<<(k*8);
++ double d; memcpy(&d, &b, sizeof(d)); return d;
++ };
++ auto setF64 = [](uint8_t* buf, int i, double d) {
++ uint64_t b; memcpy(&b, &d, sizeof(b));
++ for (int k=0;k<8;k++) buf[i*8+k]=(uint8_t)(b>>(k*8));
++ };
++ // Integer lane read/write (used by conversion ops).
++ auto setU32 = [](uint8_t* buf, int i, uint32_t v) {
++ buf[i*4]=(uint8_t)v; buf[i*4+1]=(uint8_t)(v>>8);
++ buf[i*4+2]=(uint8_t)(v>>16); buf[i*4+3]=(uint8_t)(v>>24);
++ };
++ // Saturated float→int conversion per Power ISA v3.0B: input NaN maps
++ // to 0; out-of-range saturates to the extreme of the destination type.
++ auto fp2sxw = [](double f) -> uint32_t {
++ if (std::isnan(f)) return 0;
++ if (f >= (double)INT32_MAX) return (uint32_t)INT32_MAX;
++ if (f <= (double)INT32_MIN) return (uint32_t)INT32_MIN;
++ return (uint32_t)(int32_t)std::trunc(f);
++ };
++ auto fp2uxw = [](double f) -> uint32_t {
++ if (std::isnan(f)) return 0;
++ if (f >= (double)UINT32_MAX) return UINT32_MAX;
++ if (f <= 0.0) return 0;
++ return (uint32_t)std::trunc(f);
++ };
++
++ if (isSp) {
++ for (int i = 0; i < 4; i++) {
++ float v = getF32(bb, i);
++ float out = 0.0f;
++ uint32_t iout = 0;
++ bool isInt = false;
++ switch (xo) {
++ case 139: out = std::sqrt(v); break; // xvsqrtsp
++ case 441: out = -v; break; // xvnegsp
++ case 169: out = std::ceil(v); break; // xvrspip
++ case 153: out = std::trunc(v); break; // xvrspiz
++ case 185: out = std::floor(v); break; // xvrspim
++ case 171: out = std::nearbyint(v); break; // xvrspic
++ case 152: iout = fp2sxw(v); isInt = true; break; // xvcvspsxws
++ case 136: iout = fp2uxw(v); isInt = true; break; // xvcvspuxws
++ case 184: { // xvcvsxwsp
++ uint32_t bits = (uint32_t)bb[i*4] |
++ ((uint32_t)bb[i*4+1]<<8) |
++ ((uint32_t)bb[i*4+2]<<16) |
++ ((uint32_t)bb[i*4+3]<<24);
++ out = (float)(int32_t)bits;
++ break;
++ }
++ case 168: { // xvcvuxwsp
++ uint32_t bits = (uint32_t)bb[i*4] |
++ ((uint32_t)bb[i*4+1]<<8) |
++ ((uint32_t)bb[i*4+2]<<16) |
++ ((uint32_t)bb[i*4+3]<<24);
++ out = (float)(uint32_t)bits;
++ break;
++ }
++ }
++ if (isInt) setU32(result, i, iout);
++ else setF32(result, i, out);
++ }
++ } else {
++ for (int i = 0; i < 2; i++) {
++ double v = getF64(bb, i);
++ double out = 0.0;
++ switch (xo) {
++ case 203: out = std::sqrt(v); break; // xvsqrtdp
++ case 505: out = -v; break; // xvnegdp
++ case 233: out = std::ceil(v); break; // xvrdpip
++ case 217: out = std::trunc(v); break; // xvrdpiz
++ case 249: out = std::floor(v); break; // xvrdpim
++ case 235: out = std::nearbyint(v); break; // xvrdpic
++ }
++ setF64(result, i, out);
++ }
++ }
++ setVSR128(xt, result);
++ break;
++ }
++
++ // === XX3-form vector float compare (eq, gt, ge) ===
++ // The wasm SIMD compares emit these and use the result as a bitmask.
++ // Per Power ISA: result is all-1s for true lanes, all-0s for false
++ // (for the non-recording form; bit 0 of XO selects record form which
++ // we don't model — wasm doesn't read CR6 here).
++ // Encodings:
++ // 0xF0000218 xvcmpeqsp (XO8=67) → XO9 = 134/135 (+AX).
++ // 0xF0000258 xvcmpgtsp (XO8=75) → XO9 = 150/151.
++ // 0xF0000298 xvcmpgesp (XO8=83) → XO9 = 166/167.
++ // 0xF0000318 xvcmpeqdp (XO8=99) → XO9 = 198/199.
++ // 0xF0000358 xvcmpgtdp (XO8=107) → XO9 = 214/215.
++ // 0xF0000398 xvcmpgedp (XO8=115) → XO9 = 230/231.
++ // Rc=1 record form flips ISA bit 21 (sim bit 10), yielding XO9+256
++ // (not adjacent to the Rc=0 slot). wasm never emits the record form.
++ case 134: case 135: // xvcmpeqsp (XO8=67)
++ case 198: case 199: // xvcmpeqdp (XO8=99)
++ case 150: case 151: // xvcmpgtsp (XO8=75)
++ case 214: case 215: // xvcmpgtdp (XO8=107)
++ case 166: case 167: // xvcmpgesp (XO8=83)
++ case 230: case 231: { // xvcmpgedp (XO8=115)
++ int xt = int(rt | (instr->bit(0) << 5));
++ uint32_t ra = instr->raValue();
++ int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t ab[16], bb[16], result[16];
++ getVSR128(xa, ab);
++ getVSR128(xb, bb);
++ uint32_t op8 = xo >> 1; // canonical 8-bit XO
++ bool isF32 = (op8 == 67 || op8 == 75 || op8 == 83);
++ bool isEq = (op8 == 67 || op8 == 99);
++ bool isGt = (op8 == 75 || op8 == 107);
++ bool isGe = (op8 == 83 || op8 == 115);
++ (void)isGe;
++ auto cmpF32 = [&](int i) -> bool {
++ uint32_t aBits = (uint32_t)ab[i * 4] |
++ ((uint32_t)ab[i * 4 + 1] << 8) |
++ ((uint32_t)ab[i * 4 + 2] << 16) |
++ ((uint32_t)ab[i * 4 + 3] << 24);
++ uint32_t bBits = (uint32_t)bb[i * 4] |
++ ((uint32_t)bb[i * 4 + 1] << 8) |
++ ((uint32_t)bb[i * 4 + 2] << 16) |
++ ((uint32_t)bb[i * 4 + 3] << 24);
++ float fa, fb;
++ memcpy(&fa, &aBits, sizeof(fa));
++ memcpy(&fb, &bBits, sizeof(fb));
++ if (isEq) return fa == fb;
++ if (isGt) return fa > fb;
++ return fa >= fb;
++ };
++ auto cmpF64 = [&](int i) -> bool {
++ uint64_t aBits = 0, bBits = 0;
++ for (int k = 0; k < 8; k++) aBits |= ((uint64_t)ab[i * 8 + k]) << (k * 8);
++ for (int k = 0; k < 8; k++) bBits |= ((uint64_t)bb[i * 8 + k]) << (k * 8);
++ double fa, fb;
++ memcpy(&fa, &aBits, sizeof(fa));
++ memcpy(&fb, &bBits, sizeof(fb));
++ if (isEq) return fa == fb;
++ if (isGt) return fa > fb;
++ return fa >= fb;
++ };
++ if (isF32) {
++ for (int i = 0; i < 4; i++) {
++ uint32_t mask = cmpF32(i) ? 0xFFFFFFFFu : 0;
++ for (int k = 0; k < 4; k++) {
++ result[i * 4 + k] = (uint8_t)((mask >> (k * 8)) & 0xFF);
++ }
++ }
++ } else {
++ for (int i = 0; i < 2; i++) {
++ uint64_t mask = cmpF64(i) ? UINT64_MAX : 0;
++ for (int k = 0; k < 8; k++) {
++ result[i * 8 + k] = (uint8_t)((mask >> (k * 8)) & 0xFF);
++ }
++ }
++ }
++ setVSR128(xt, result);
++ break;
++ }
++
++ // === XX3-form vector float arithmetic ===
++ // Encoding: bits 6-10=XT, 11-15=XA, 16-20=XB, 21-28=XO (8 bits), 29=AX,
++ // 30=BX, 31=TX. We dispatched above using `bits(10, 2)` which is bits
++ // 21-29 (9 bits) — that includes the AX register-extension bit, which
++ // changes for every XA in {0..31} vs {32..63}. To match all 4
++ // (AX,BX) combinations of an XX3 op we use `case xo3 | 0|1|2|3` where
++ // xo3 = (8-bit XO) << 1 (because XO occupies bits 1..8 of our 9-bit
++ // extraction). Helper macro: each case covers four labels.
++ #define XX3_CASE_BASE(name) \
++ case ((name) | 0): case ((name) | 1):
++ case 128: case 129: // xvaddsp: 4 × f32 add (XO=64 → bits 1..8 = 128)
++ case 192: case 193: // xvadddp
++ case 144: case 145: // xvsubsp
++ case 208: case 209: // xvsubdp
++ case 160: case 161: // xvmulsp
++ case 224: case 225: // xvmuldp
++ case 176: case 177: // xvdivsp
++ case 240: case 241: // xvdivdp
++ case 384: case 385: // xvmaxsp
++ case 448: case 449: // xvmaxdp
++ case 400: case 401: // xvminsp
++ case 464: case 465: // xvmindp
++ {
++ // Re-extract the canonical 8-bit XX3 XO.
++ uint32_t xo3 = (xo >> 1);
++ (void)xo3;
++ int xt = int(rt | (instr->bit(0) << 5));
++ uint32_t ra = instr->raValue();
++ int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t ab[16], bb[16], rb_bytes[16];
++ getVSR128(xa, ab);
++ getVSR128(xb, bb);
++
++ auto getF32 = [](uint8_t* buf, int i) -> float {
++ uint32_t bits = (uint32_t)buf[i * 4] |
++ ((uint32_t)buf[i * 4 + 1] << 8) |
++ ((uint32_t)buf[i * 4 + 2] << 16) |
++ ((uint32_t)buf[i * 4 + 3] << 24);
++ float f;
++ memcpy(&f, &bits, sizeof(f));
++ return f;
++ };
++ auto setF32 = [](uint8_t* buf, int i, float f) {
++ uint32_t bits;
++ memcpy(&bits, &f, sizeof(bits));
++ buf[i * 4] = (uint8_t)(bits & 0xFF);
++ buf[i * 4 + 1] = (uint8_t)((bits >> 8) & 0xFF);
++ buf[i * 4 + 2] = (uint8_t)((bits >> 16) & 0xFF);
++ buf[i * 4 + 3] = (uint8_t)((bits >> 24) & 0xFF);
++ };
++ auto getF64 = [](uint8_t* buf, int i) -> double {
++ uint64_t bits = 0;
++ for (int k = 0; k < 8; k++) bits |= ((uint64_t)buf[i * 8 + k]) << (k * 8);
++ double d;
++ memcpy(&d, &bits, sizeof(d));
++ return d;
++ };
++ auto setF64 = [](uint8_t* buf, int i, double d) {
++ uint64_t bits;
++ memcpy(&bits, &d, sizeof(bits));
++ for (int k = 0; k < 8; k++) buf[i * 8 + k] = (uint8_t)((bits >> (k * 8)) & 0xFF);
++ };
++
++ // Dispatch on the canonical 8-bit XX3 XO (bits 21..28 PPC = xo>>1).
++ switch (xo3) {
++ case 64: for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) + getF32(bb, i)); break; // xvaddsp
++ case 96: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) + getF64(bb, i)); break; // xvadddp
++ case 72: for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) - getF32(bb, i)); break; // xvsubsp
++ case 104: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) - getF64(bb, i)); break; // xvsubdp
++ case 80: for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) * getF32(bb, i)); break; // xvmulsp
++ case 112: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) * getF64(bb, i)); break; // xvmuldp
++ case 88: for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) / getF32(bb, i)); break; // xvdivsp
++ case 120: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) / getF64(bb, i)); break; // xvdivdp
++ // xvmin{sp,dp} / xvmax{sp,dp}:
++ // If both operands are NaN, result is the NaN from XA.
++ // If exactly one operand is NaN, result is the NON-NaN operand.
++ // For 0 / -0, treat -0 < +0 (signed-zero ordering): xvminsp(+0,-0)
++ // = -0, xvmaxsp(+0,-0) = +0, in either operand order.
++ // Otherwise, result is IEEE min/max(a, b).
++ // This differs from IEEE 754 (which propagates NaN) and is
++ // relied upon by wasm relaxed_min/max (bug1946618.js) and by
++ // wasm f32x4.min(0,-0) returning -0 (simd_f32x4.wast.js).
++ #define XV_MAX(T, a, b) [](T a_, T b_) -> T { \
++ bool an = std::isnan(a_), bn = std::isnan(b_); \
++ if (an && bn) return a_; \
++ if (an) return b_; \
++ if (bn) return a_; \
++ if (a_ == 0.0 && b_ == 0.0) { \
++ /* -0 is smaller than +0; max picks +0. */ \
++ return std::signbit(a_) ? b_ : a_; \
++ } \
++ return std::max(a_, b_); \
++ }(a, b)
++ #define XV_MIN(T, a, b) [](T a_, T b_) -> T { \
++ bool an = std::isnan(a_), bn = std::isnan(b_); \
++ if (an && bn) return a_; \
++ if (an) return b_; \
++ if (bn) return a_; \
++ if (a_ == 0.0 && b_ == 0.0) { \
++ /* -0 is smaller than +0; min picks -0. */ \
++ return std::signbit(a_) ? a_ : b_; \
++ } \
++ return std::min(a_, b_); \
++ }(a, b)
++ case 192: for (int i = 0; i < 4; i++) { // xvmaxsp
++ float a = getF32(ab, i), b = getF32(bb, i);
++ setF32(rb_bytes, i, XV_MAX(float, a, b));
++ } break;
++ case 224: for (int i = 0; i < 2; i++) { // xvmaxdp
++ double a = getF64(ab, i), b = getF64(bb, i);
++ setF64(rb_bytes, i, XV_MAX(double, a, b));
++ } break;
++ case 200: for (int i = 0; i < 4; i++) { // xvminsp
++ float a = getF32(ab, i), b = getF32(bb, i);
++ setF32(rb_bytes, i, XV_MIN(float, a, b));
++ } break;
++ case 232: for (int i = 0; i < 2; i++) { // xvmindp
++ double a = getF64(ab, i), b = getF64(bb, i);
++ setF64(rb_bytes, i, XV_MIN(double, a, b));
++ } break;
++ #undef XV_MAX
++ #undef XV_MIN
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "xv float dispatch missing 8-bit XO=%u (instr 0x%08x)",
++ xo3, instr->instructionBits());
++ }
++ setVSR128(xt, rb_bytes);
++ break;
++ }
++
++ // === XX3-form fused multiply-add (3-source: XT is also input) ===
++ //
++ // xvmaddasp XT,XA,XB: XT = (XA * XB) + XT (fused madd)
++ // xvmaddadp XT,XA,XB: same for f64
++ // xvnmsubasp XT,XA,XB: XT = -((XA * XB) - XT) = XT - (XA * XB)
++ // xvnmsubadp XT,XA,XB: same for f64
++ //
++ // Encodings (each +AX): XO8 → XO9 pairs
++ // xvmaddasp PPC_xvmaddasp=0xF0000208 XO8=65 → XO9 130/131
++ // xvmaddadp PPC_xvmaddadp=0xF0000308 XO8=97 → XO9 194/195
++ // xvnmsubasp PPC_xvnmsubasp=0xF0000688 XO8=209 → XO9 418/419
++ // xvnmsubadp PPC_xvnmsubadp=0xF0000788 XO8=241 → XO9 482/483
++ // std::fma gives IEEE-correct single-rounding behaviour matching the
++ // Power ISA definition of these fused forms.
++ case 130: case 131: // xvmaddasp
++ case 194: case 195: // xvmaddadp
++ case 418: case 419: // xvnmsubasp
++ case 482: case 483: { // xvnmsubadp
++ int xt = int(rt | (instr->bit(0) << 5));
++ uint32_t ra = instr->raValue();
++ int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
++ int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
++ uint8_t ab[16], bb[16], tb[16];
++ getVSR128(xa, ab);
++ getVSR128(xb, bb);
++ getVSR128(xt, tb); // XT is also an input (accumulator).
++ bool isSp = (xo == 130 || xo == 131 || xo == 418 || xo == 419);
++ bool isNmsub = (xo == 418 || xo == 419 || xo == 482 || xo == 483);
++ auto rdF32 = [](uint8_t* buf, int i) -> float {
++ uint32_t b = (uint32_t)buf[i * 4] |
++ ((uint32_t)buf[i * 4 + 1] << 8) |
++ ((uint32_t)buf[i * 4 + 2] << 16) |
++ ((uint32_t)buf[i * 4 + 3] << 24);
++ float f; memcpy(&f, &b, sizeof(f)); return f;
++ };
++ auto wrF32 = [](uint8_t* buf, int i, float f) {
++ uint32_t b; memcpy(&b, &f, sizeof(b));
++ buf[i*4]=(uint8_t)b; buf[i*4+1]=(uint8_t)(b>>8);
++ buf[i*4+2]=(uint8_t)(b>>16); buf[i*4+3]=(uint8_t)(b>>24);
++ };
++ auto rdF64 = [](uint8_t* buf, int i) -> double {
++ uint64_t b = 0;
++ for (int k=0;k<8;k++) b |= ((uint64_t)buf[i*8+k])<<(k*8);
++ double d; memcpy(&d, &b, sizeof(d)); return d;
++ };
++ auto wrF64 = [](uint8_t* buf, int i, double d) {
++ uint64_t b; memcpy(&b, &d, sizeof(b));
++ for (int k=0;k<8;k++) buf[i*8+k]=(uint8_t)(b>>(k*8));
++ };
++ uint8_t result[16];
++ if (isSp) {
++ for (int i = 0; i < 4; i++) {
++ float a = rdF32(ab, i), b = rdF32(bb, i), t = rdF32(tb, i);
++ // madd: t + a*b ; nmsub: -(a*b - t) = t - a*b = std::fma(a,b,-t) negated.
++ float out = isNmsub ? -std::fma(a, b, -t)
++ : std::fma(a, b, t);
++ wrF32(result, i, out);
++ }
++ } else {
++ for (int i = 0; i < 2; i++) {
++ double a = rdF64(ab, i), b = rdF64(bb, i), t = rdF64(tb, i);
++ double out = isNmsub ? -std::fma(a, b, -t)
++ : std::fma(a, b, t);
++ wrF64(result, i, out);
++ }
++ }
++ setVSR128(xt, result);
++ break;
++ }
++
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodeVSX: unimplemented XO=%u (instruction 0x%08x)", xo,
++ instr->instructionBits());
++ }
++}
++
++// =============================================================================
++// Power ISA v3.1 prefixed instructions (POWER10).
++// =============================================================================
++//
++// A prefixed instruction is 8 bytes: a 4-byte prefix word (primary opcode 1)
++// followed by a 4-byte suffix word. Prefix and suffix must lie in the same
++// 64-byte aligned block — the JIT must guarantee this when emitting; the sim
++// asserts.
++//
++// Prefix word layout (BE bit numbering):
++// [0..5] primary opcode = 1
++// [6..7] Type (00 = 8LS, 10 = MLS — only forms we implement)
++// [8..10] reserved (must be 0)
++// [11] R (1 = PC-relative; RA must be 0)
++// [12..13] reserved (must be 0)
++// [14..31] d0 (high 18 bits of the 34-bit signed immediate)
++//
++// Suffix word (MLS/8LS form, GPR-target instructions like paddi/pld):
++// [0..5] suffix primary opcode (selects the actual instruction)
++// [6..10] RT (or RS for stores)
++// [11..15] RA
++// [16..31] d1 (low 16 bits of immediate)
++//
++// Suffix word (8LS plxv quirk): the suffix opcode field is only 5 bits
++// wide and bit [5] holds TX, the high bit of the 6-bit XT VSR number:
++// [0..4] plxv suffix opcode = 11001 (= 25)
++// [5] TX
++// [6..10] T
++// [11..15] RA
++// [16..31] d1
++// Combined: XT = (TX << 5) | T. (Equivalent: full 6-bit field at [0..5]
++// is 0b11001(TX) — values 50 or 51 in our LE bits 31..26.)
++//
++// Combined immediate: SI = sign_extend((d0 << 16) | d1, 34).
++// EA when R=1: address-of-prefix + SI. (RA must be 0.)
++// EA when R=0: (RA == 0 ? 0 : GPR[RA]) + SI.
++//
++// Suffix opcodes implemented here:
++// MLS (Type 2) / suffix=14 paddi
++// MLS (Type 2) / suffix=48 plfs (load FP single, widens to double)
++// MLS (Type 2) / suffix=50 plfd (load FP double)
++// 8LS (Type 0) / suffix=57 pld
++// 8LS (Type 0) / 5-bit suffix=25, bit 26 = TX plxv
++//
++// Verification recipe when adding more: assemble with `gcc -mcpu=power10
++// -c` (or clang) and compare the emitted bytes against the encoder; encode
++// in a small inline-asm program and step through under this sim.
++
++void Simulator::decodePrefixed(SimInstruction* prefix) {
++ // Prefix and suffix must reside in the same 64-byte block.
++ uint64_t prefixAddr = reinterpret_cast<uint64_t>(prefix);
++ MOZ_ASSERT((prefixAddr & 63) <= 56,
++ "POWER10 prefixed instruction crosses 64-byte boundary");
++
++ SimInstruction* suffix = reinterpret_cast<SimInstruction*>(
++ reinterpret_cast<uint8_t*>(prefix) + SimInstruction::kInstrSize);
++
++ uint32_t type = prefix->bits(25, 24);
++ uint32_t R = prefix->bit(20);
++ uint32_t d0 = prefix->bits(17, 0); // 18 bits
++ uint32_t suffixOp6 = suffix->bits(31, 26); // 6-bit form (paddi, pld)
++ uint32_t suffixOp5 = suffix->bits(31, 27); // 5-bit form (plxv)
++ uint32_t plxvTX = suffix->bit(26);
++ uint32_t rt = suffix->rtValue();
++ uint32_t ra = suffix->raValue();
++ uint32_t d1 = suffix->uimm16Value();
++
++ // Reassemble 34-bit signed displacement.
++ int64_t imm34 = (static_cast<int64_t>(d0) << 16) | d1;
++ imm34 = (imm34 << 30) >> 30; // sign-extend from bit 33
++
++ // R=1 forms require RA=0 per the ISA.
++ MOZ_ASSERT(!R || ra == 0,
++ "POWER10 prefixed R=1 form requires RA=0");
++
++ // Type 2 = MLS, Type 0 = 8LS. Other types are reserved here.
++ if (type == 2 && suffixOp6 == 14) {
++ // paddi RT, RA, SI, R (MLS)
++ int64_t base = R ? static_cast<int64_t>(prefixAddr)
++ : (ra == 0 ? 0 : getRegister(ra));
++ setRegister(rt, base + imm34);
++ } else if (type == 0 && suffixOp6 == 57) {
++ // pld RT, D(RA), R (8LS)
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 8)) {
++ setRegister(rt, readDW(ea, prefix));
++ }
++ } else if (type == 2 && suffixOp6 == 50) {
++ // plfd FRT, D(RA), R (MLS) — load 8-byte double into FPR.
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 8)) {
++ setFpuRegisterDouble(rt, readD(ea, prefix));
++ }
++ } else if (type == 2 && suffixOp6 == 48) {
++ // plfs FRT, D(RA), R (MLS) — load 4-byte single, widen NaN-preserving.
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 4)) {
++ float val = *reinterpret_cast<float*>(ea);
++ setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
++ }
++ } else if (type == 0 && suffixOp5 == 25) {
++ // plxv XT, D(RA), R (8LS) — XT = (TX << 5) | T, TX at suffix bit 26.
++ int xt = static_cast<int>(rt | (plxvTX << 5));
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 16)) {
++ uint8_t buf[16];
++ memcpy(buf, reinterpret_cast<const void*>(ea), 16);
++ setVSR128(xt, buf);
++ }
++ } else if (type == 0 && suffixOp6 == 61) {
++ // pstd RS, D(RA), R (8LS) — store doubleword.
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 8)) {
++ writeDW(ea, getRegister(rt), prefix);
++ }
++ } else if (type == 2 && suffixOp6 == 54) {
++ // pstfd FRS, D(RA), R (MLS) — store double.
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 8)) {
++ writeD(ea, getFpuRegisterDouble(rt), prefix);
++ }
++ } else if (type == 2 && suffixOp6 == 52) {
++ // pstfs FRS, D(RA), R (MLS) — store single (narrow from double in FPR).
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 4)) {
++ double dval = getFpuRegisterDouble(rt);
++ *reinterpret_cast<float*>(ea) = demoteDoublePreservingNaN(dval);
++ }
++ } else if (type == 0 && suffixOp5 == 27) {
++ // pstxv XS, D(RA), R (8LS) — XS = (SX << 5) | S, SX at suffix bit 26.
++ int xs = static_cast<int>(rt | (plxvTX << 5));
++ uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
++ : (ra == 0 ? 0 : getRegister(ra)) +
++ static_cast<uint64_t>(imm34);
++ if (!handleWasmSegFault(ea, 16)) {
++ uint8_t buf[16];
++ getVSR128(xs, buf);
++ memcpy(reinterpret_cast<void*>(ea), buf, 16);
++ }
++ } else {
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "decodePrefixed: unimplemented type=%u "
++ "(prefix 0x%08x, suffix 0x%08x)",
++ type, prefix->instructionBits(), suffix->instructionBits());
++ }
++
++ // Advance past the full 8-byte prefixed instruction unless a handler
++ // already redirected the PC. The caller (instructionDecode) returns
++ // immediately after us, so its 4-byte trailing advance is skipped.
++ if (!pc_modified_) {
++ set_pc(static_cast<int64_t>(prefixAddr) + 2 * SimInstruction::kInstrSize);
++ }
++}
++
++// =============================================================================
++// Top-level instruction decoder.
++// =============================================================================
++
++void Simulator::instructionDecode(SimInstruction* instr) {
++ if (!SimulatorProcess::ICacheCheckingDisableCount) {
++ AutoLockSimulatorCache als;
++ SimulatorProcess::checkICacheLocked(instr);
++ }
++ pc_modified_ = false;
++
++ uint32_t instrBits = instr->instructionBits();
++
++ // Check for kCallRedirInstr first (PPC_stop = 0x4C0002E4).
++ if (instrBits == kCallRedirInstr) {
++ softwareInterrupt(instr);
++ if (!pc_modified_) {
++ set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
++ }
++ return;
++ }
++
++ // Check for PPC_trap (0x7FE00008).
++ if (instrBits == 0x7FE00008) {
++ softwareInterrupt(instr);
++ if (!pc_modified_) {
++ set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
++ }
++ return;
++ }
++
++ uint32_t opcode = instr->opcode();
++
++ // Power ISA v3.1 prefixed instructions: primary opcode 1 marks a
++ // 4-byte prefix word followed by a 4-byte suffix word. decodePrefixed
++ // advances the PC by the full 8 bytes (or leaves it modified for
++ // PC-relative side-effects).
++ if (opcode == 1) {
++ decodePrefixed(instr);
++ return;
++ }
++
++ switch (opcode) {
++ // D-form ALU
++ case 3: // twi
++ case 7: // mulli
++ case 8: // subfic
++ case 10: // cmpli
++ case 11: // cmpi
++ case 12: // addic
++ case 13: // addic.
++ case 14: // addi
++ case 15: // addis
++ case 24: // ori
++ case 25: // oris
++ case 26: // xori
++ case 27: // xoris
++ case 28: // andi.
++ case 29: // andis.
++ decodeDFormALU(instr);
++ break;
++
++ // D-form loads
++ case 32: // lwz
++ case 33: // lwzu
++ case 34: // lbz
++ case 35: // lbzu
++ case 40: // lhz
++ case 41: // lhzu
++ case 42: // lha
++ case 43: // lhau
++ case 48: // lfs
++ case 49: // lfsu
++ case 50: // lfd
++ case 51: // lfdu
++ decodeDFormLoad(instr);
++ break;
++
++ // D-form stores
++ case 36: // stw
++ case 38: // stb
++ case 39: // stbu
++ case 44: // sth
++ case 45: // sthu
++ case 52: // stfs
++ case 53: // stfsu
++ case 54: // stfd
++ case 55: // stfdu
++ decodeDFormStore(instr);
++ break;
++
++ // DS-form
++ case 58: // ld, ldu, lwa
++ case 62: // std, stdu
++ decodeDSForm(instr);
++ break;
++
++ // B-form conditional branch
++ case 16:
++ decodeBranch(instr);
++ break;
++
++ // SC (system call) - unused in JIT
++ case 17:
++ MOZ_CRASH("Simulator: sc instruction not supported");
++ break;
++
++ // I-form unconditional branch
++ case 18:
++ decodeBranch(instr);
++ break;
++
++ // XL-form (branch to LR/CTR, CR operations)
++ case 19:
++ decodeBranch(instr);
++ break;
++
++ // M-form / MD-form rotate/mask
++ case 20: // rlwimi
++ case 21: // rlwinm
++ case 23: // rlwnm
++ case 30: // rldicl, rldicr, rldic, rldimi, rldcl, rldcr
++ decodeRotateMask(instr);
++ break;
++
++ // VMX (AltiVec) — primary opcode 4. Vector arithmetic / compare / shift /
++ // splat / merge / pack / unpack on VR0-VR31. The wasm SIMD lowering
++ // emits these directly (Simd128 lives in the VR namespace).
++ case 4:
++ decodeVMX(instr);
++ break;
++
++ // X-form / XO-form
++ case 31:
++ decodeXForm(instr);
++ break;
++
++ // FP single (A-form)
++ case 59:
++ decodeFP(instr);
++ break;
++
++ // VSX (XX1-form)
++ case 60:
++ decodeVSX(instr);
++ break;
++
++ // FP double (X-form / A-form)
++ case 63:
++ decodeFP(instr);
++ break;
++
++ default:
++ MOZ_CRASH_UNSAFE_PRINTF(
++ "instructionDecode: unsupported opcode %u (instruction 0x%08x)",
++ opcode, instrBits);
++ }
++
++ if (!pc_modified_) {
++ set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
++ }
++}
++
++// =============================================================================
++// Single-stepping / execute loop.
++// =============================================================================
++
++void Simulator::enable_single_stepping(SingleStepCallback cb, void* arg) {
++ single_stepping_ = true;
++ single_step_callback_ = cb;
++ single_step_callback_arg_ = arg;
++ single_step_callback_(single_step_callback_arg_, this, (void*)get_pc());
++}
++
++void Simulator::disable_single_stepping() {
++ if (!single_stepping_) {
++ return;
++ }
++ single_step_callback_(single_step_callback_arg_, this, (void*)get_pc());
++ single_stepping_ = false;
++ single_step_callback_ = nullptr;
++ single_step_callback_arg_ = nullptr;
++}
++
++template <bool enableStopSimAt>
++void Simulator::execute() {
++ if (single_stepping_ && getenv("PPC64_TRACE_SIM")) {
++ fprintf(stderr, "[sim] enter execute pc=0x%lx lr=0x%lx fp=0x%lx sp=0x%lx\n",
++ (long)get_pc(), (long)getLR(), (long)getRegister(fp),
++ (long)getRegister(sp));
++ }
++ if (single_stepping_) {
++ single_step_callback_(single_step_callback_arg_, this, nullptr);
++ }
++
++ int64_t program_counter = get_pc();
++
++ while (program_counter != end_sim_pc) {
++ if (enableStopSimAt && (icount_ == Simulator::StopSimAt)) {
++ ppc64Debugger dbg(this);
++ dbg.debug();
++ } else {
++ if (single_stepping_) {
++ if (getenv("PPC64_TRACE_SIM")) {
++ fprintf(stderr,
++ "[sim] step icount=%llu pc=0x%lx instr=0x%08x lr=0x%lx fp=0x%lx sp=0x%lx\n",
++ (unsigned long long)icount_, (long)program_counter,
++ *(uint32_t*)program_counter, (long)getLR(),
++ (long)getRegister(fp), (long)getRegister(sp));
++ }
++ single_step_callback_(single_step_callback_arg_, this,
++ (void*)program_counter);
++ }
++ SimInstruction* instr =
++ reinterpret_cast<SimInstruction*>(program_counter);
++ instructionDecode(instr);
++ icount_++;
++ }
++ program_counter = get_pc();
++ }
++
++ if (single_stepping_) {
++ single_step_callback_(single_step_callback_arg_, this, nullptr);
++ }
++}
++
++// =============================================================================
++// callInternal / call.
++// =============================================================================
++
++void Simulator::callInternal(uint8_t* entry) {
++ // Prepare to execute the code at entry.
++ setRegister(pc, reinterpret_cast<int64_t>(entry));
++ // The simulation stops when returning to this call point (LR == end_sim_pc).
++ setLR(end_sim_pc);
++
++ // Remember the values of callee-saved registers (r14-r31 in ELFv2).
++ int64_t r14_val = getRegister(r14);
++ int64_t r15_val = getRegister(r15);
++ int64_t r16_val = getRegister(r16);
++ int64_t r17_val = getRegister(r17);
++ int64_t r18_val = getRegister(r18);
++ int64_t r19_val = getRegister(r19);
++ int64_t r20_val = getRegister(r20);
++ int64_t r21_val = getRegister(r21);
++ int64_t r22_val = getRegister(r22);
++ int64_t r23_val = getRegister(r23);
++ int64_t r24_val = getRegister(r24);
++ int64_t r25_val = getRegister(r25);
++ int64_t r26_val = getRegister(r26);
++ int64_t r27_val = getRegister(r27);
++ int64_t r28_val = getRegister(r28);
++ int64_t r29_val = getRegister(r29);
++ int64_t r30_val = getRegister(r30);
++ int64_t r31_val = getRegister(r31);
++ int64_t sp_val = getRegister(sp);
++
++#ifdef DEBUG
++ // Set up callee-saved registers with a known value to detect clobbers.
++ // DEBUG-only: in release this would silently corrupt every JS-jit-entry
++ // stub frame, since the stub saves r14-r31 to its stack early on. Any
++ // single-step-profiling sample taken later (or any unwind through the
++ // stub's saved CSR area) then dereferences `icount_` as a frame
++ // pointer and crashes — see e.g. wasm/profiling.js, ion-error-*.js,
++ // ion-lazy-tables.js, ion-callerfp-tag.js, return-call-profiling.js,
++ // externref-global-postbarrier.js, builtin-modules/i8vecmul.js,
++ // asm.js/testBug1357053.js (all single-step-profiling tests). In
++ // debug builds the value collides with the same callsites but the
++ // MOZ_ASSERTs below catch any actual ABI violation, which is the
++ // entire point.
++ int64_t callee_saved_value = icount_;
++ setRegister(r14, callee_saved_value);
++ setRegister(r15, callee_saved_value);
++ setRegister(r16, callee_saved_value);
++ setRegister(r17, callee_saved_value);
++ setRegister(r18, callee_saved_value);
++ setRegister(r19, callee_saved_value);
++ setRegister(r20, callee_saved_value);
++ setRegister(r21, callee_saved_value);
++ setRegister(r22, callee_saved_value);
++ setRegister(r23, callee_saved_value);
++ setRegister(r24, callee_saved_value);
++ setRegister(r25, callee_saved_value);
++ setRegister(r26, callee_saved_value);
++ setRegister(r27, callee_saved_value);
++ setRegister(r28, callee_saved_value);
++ setRegister(r29, callee_saved_value);
++ setRegister(r30, callee_saved_value);
++ setRegister(r31, callee_saved_value);
++#endif
++
++ // Start the simulation.
++ if (Simulator::StopSimAt != -1) {
++ execute<true>();
++ } else {
++ execute<false>();
++ }
++
++#ifdef DEBUG
++ // Check that the callee-saved registers have been preserved.
++ MOZ_ASSERT(callee_saved_value == getRegister(r14));
++ MOZ_ASSERT(callee_saved_value == getRegister(r15));
++ MOZ_ASSERT(callee_saved_value == getRegister(r16));
++ MOZ_ASSERT(callee_saved_value == getRegister(r17));
++ MOZ_ASSERT(callee_saved_value == getRegister(r18));
++ MOZ_ASSERT(callee_saved_value == getRegister(r19));
++ MOZ_ASSERT(callee_saved_value == getRegister(r20));
++ MOZ_ASSERT(callee_saved_value == getRegister(r21));
++ MOZ_ASSERT(callee_saved_value == getRegister(r22));
++ MOZ_ASSERT(callee_saved_value == getRegister(r23));
++ MOZ_ASSERT(callee_saved_value == getRegister(r24));
++ MOZ_ASSERT(callee_saved_value == getRegister(r25));
++ MOZ_ASSERT(callee_saved_value == getRegister(r26));
++ MOZ_ASSERT(callee_saved_value == getRegister(r27));
++ MOZ_ASSERT(callee_saved_value == getRegister(r28));
++ MOZ_ASSERT(callee_saved_value == getRegister(r29));
++ MOZ_ASSERT(callee_saved_value == getRegister(r30));
++ MOZ_ASSERT(callee_saved_value == getRegister(r31));
++#endif
++
++ // Restore callee-saved registers.
++ setRegister(r14, r14_val);
++ setRegister(r15, r15_val);
++ setRegister(r16, r16_val);
++ setRegister(r17, r17_val);
++ setRegister(r18, r18_val);
++ setRegister(r19, r19_val);
++ setRegister(r20, r20_val);
++ setRegister(r21, r21_val);
++ setRegister(r22, r22_val);
++ setRegister(r23, r23_val);
++ setRegister(r24, r24_val);
++ setRegister(r25, r25_val);
++ setRegister(r26, r26_val);
++ setRegister(r27, r27_val);
++ setRegister(r28, r28_val);
++ setRegister(r29, r29_val);
++ setRegister(r30, r30_val);
++ setRegister(r31, r31_val);
++ setRegister(sp, sp_val);
++}
++
++int64_t Simulator::call(uint8_t* entry, int argument_count, ...) {
++ va_list parameters;
++ va_start(parameters, argument_count);
++
++ int64_t original_stack = getRegister(sp);
++ // Compute position of stack on entry to generated code.
++ int64_t entry_stack = original_stack;
++ if (argument_count > kCArgSlotCount) {
++ entry_stack = entry_stack - argument_count * sizeof(int64_t);
++ } else {
++ entry_stack = entry_stack - kCArgsSlotsSize;
++ }
++
++ entry_stack &= ~U64(ABIStackAlignment - 1);
++
++ intptr_t* stack_argument = reinterpret_cast<intptr_t*>(entry_stack);
++
++ // PPC64 ELFv2: first 8 integer args go in r3-r10.
++ for (int i = 0; i < argument_count; i++) {
++ js::jit::Register argReg;
++ if (GetIntArgReg(i, &argReg)) {
++ setRegister(argReg.code(), va_arg(parameters, int64_t));
++ } else {
++ stack_argument[i] = va_arg(parameters, int64_t);
++ }
++ }
++
++ va_end(parameters);
++ setRegister(sp, entry_stack);
++
++ callInternal(entry);
++
++ MOZ_ASSERT(entry_stack == getRegister(sp));
++ setRegister(sp, original_stack);
++
++ int64_t result = getRegister(r3);
++ return result;
++}
++
++uintptr_t Simulator::pushAddress(uintptr_t address) {
++ int64_t new_sp = getRegister(sp) - sizeof(uintptr_t);
++ uintptr_t* stack_slot = reinterpret_cast<uintptr_t*>(new_sp);
++ *stack_slot = address;
++ setRegister(sp, new_sp);
++ return new_sp;
++}
++
++uintptr_t Simulator::popAddress() {
++ int64_t current_sp = getRegister(sp);
++ uintptr_t* stack_slot = reinterpret_cast<uintptr_t*>(current_sp);
++ uintptr_t address = *stack_slot;
++ setRegister(sp, current_sp + sizeof(uintptr_t));
++ return address;
++}
++
++} // namespace jit
++} // namespace js
++
++js::jit::Simulator* JSContext::simulator() const { return simulator_; }
+diff --git a/js/src/jit/ppc64/Simulator-ppc64.h b/js/src/jit/ppc64/Simulator-ppc64.h
+new file mode 100644
+index 000000000000..c7a3f3767d61
+--- /dev/null
++++ b/js/src/jit/ppc64/Simulator-ppc64.h
+@@ -0,0 +1,556 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifndef jit_ppc64_Simulator_ppc64_h
++#define jit_ppc64_Simulator_ppc64_h
++
++#ifdef JS_SIMULATOR_PPC64
++
++# include "mozilla/Atomics.h"
++
++# include "jit/IonTypes.h"
++# include "js/ProfilingFrameIterator.h"
++# include "threading/Thread.h"
++# include "vm/MutexIDs.h"
++# include "wasm/WasmSignalHandlers.h"
++
++namespace js {
++namespace jit {
++
++class JitActivation;
++class Simulator;
++class Redirection;
++class CachePage;
++class AutoLockSimulator;
++
++typedef void (*SingleStepCallback)(void* arg, Simulator* sim, void* pc);
++
++const intptr_t kPointerAlignment = 8;
++const intptr_t kPointerAlignmentMask = kPointerAlignment - 1;
++const intptr_t kDoubleAlignment = 8;
++const intptr_t kDoubleAlignmentMask = kDoubleAlignment - 1;
++
++const int kNumGPRegisters = 32;
++const int kPCRegister = 32;
++const int kNumFPURegisters = 32;
++const int kNumVRRegisters = 32; // VR0-VR31 (Altivec/VMX; = VSR32-63 in VSX)
++
++// PPC64 Condition Register: 8 fields of 4 bits each.
++// Each field: bit3=LT, bit2=GT, bit1=EQ, bit0=SO (in PPC big-endian numbering
++// within a field, but stored in little-endian nibble order in our uint32_t).
++const int kNumCRFields = 8;
++
++// CR field bit positions (within a 4-bit field).
++const uint8_t kCRFieldLT = 0x8;
++const uint8_t kCRFieldGT = 0x4;
++const uint8_t kCRFieldEQ = 0x2;
++const uint8_t kCRFieldSO = 0x1;
++
++// XER register bit positions.
++const int kXERSOBit = 31;
++const int kXEROVBit = 30;
++const int kXERCABit = 29;
++const int kXEROV32Bit = 19;
++const int kXERCA32Bit = 18;
++
++// FPSCR rounding mode bits (bits 62:63, stored in low bits of our uint64_t).
++const uint64_t kFPSCRRNMask = 0x3;
++
++// FPU rounding modes matching PPC64 FPSCR RN field.
++enum FPURoundingMode {
++ RN = 0, // Round to Nearest (ties to even)
++ RZ = 1, // Round toward Zero
++ RP = 2, // Round toward +Infinity
++ RM = 3, // Round toward -Infinity
++};
++
++// FPU invalid result constants.
++const uint32_t kFPUInvalidResult = static_cast<uint32_t>(1 << 31) - 1;
++const int32_t kFPUInvalidResultNegative = static_cast<int32_t>(1u << 31);
++const uint64_t kFPU64InvalidResult =
++ static_cast<uint64_t>(static_cast<uint64_t>(1) << 63) - 1;
++const int64_t kFPU64InvalidResultNegative =
++ static_cast<int64_t>(static_cast<uint64_t>(1) << 63);
++
++// Breakpoint/stop code ranges.
++const uint32_t kMaxWatchpointCode = 31;
++const uint32_t kMaxStopCode = 127;
++const uint32_t kWasmTrapCode = 6;
++
++// Redirection instruction: PPC_stop (0x4C0002E4).
++// Distinct from PPC_trap (0x7FE00008) used for wasm traps.
++const uint32_t kCallRedirInstr = 0x4C0002E4;
++
++typedef uint32_t Instr;
++class SimInstruction;
++
++class Simulator {
++ friend class ppc64Debugger;
++
++ public:
++ enum Register {
++ no_reg = -1,
++ r0 = 0,
++ r1,
++ r2,
++ r3,
++ r4,
++ r5,
++ r6,
++ r7,
++ r8,
++ r9,
++ r10,
++ r11,
++ r12,
++ r13,
++ r14,
++ r15,
++ r16,
++ r17,
++ r18,
++ r19,
++ r20,
++ r21,
++ r22,
++ r23,
++ r24,
++ r25,
++ r26,
++ r27,
++ r28,
++ r29,
++ r30,
++ r31,
++ pc,
++ kNumSimuRegisters,
++ // Aliases
++ sp = r1,
++ fp = r31,
++ };
++
++ enum FPURegister {
++ f0 = 0,
++ f1,
++ f2,
++ f3,
++ f4,
++ f5,
++ f6,
++ f7,
++ f8,
++ f9,
++ f10,
++ f11,
++ f12,
++ f13,
++ f14,
++ f15,
++ f16,
++ f17,
++ f18,
++ f19,
++ f20,
++ f21,
++ f22,
++ f23,
++ f24,
++ f25,
++ f26,
++ f27,
++ f28,
++ f29,
++ f30,
++ f31,
++ kNumFPURegisters
++ };
++
++ static Simulator* Create();
++ static void Destroy(Simulator* simulator);
++
++ Simulator();
++ ~Simulator();
++
++ static Simulator* Current();
++
++ static inline uintptr_t StackLimit() {
++ return Simulator::Current()->stackLimit();
++ }
++
++ uintptr_t* addressOfStackLimit();
++
++ // GPR accessors.
++ void setRegister(int reg, int64_t value);
++ int64_t getRegister(int reg) const;
++
++ // FPR accessors.
++ void setFpuRegister(int fpureg, int64_t value);
++ void setFpuRegisterWord(int fpureg, int32_t value);
++ void setFpuRegisterFloat(int fpureg, float value);
++ void setFpuRegisterDouble(int fpureg, double value);
++ int64_t getFpuRegister(int fpureg) const;
++ int32_t getFpuRegisterWord(int fpureg) const;
++ int32_t getFpuRegisterSignedWord(int fpureg) const;
++ float getFpuRegisterFloat(int fpureg) const;
++ double getFpuRegisterDouble(int fpureg) const;
++
++ // VR accessors (Altivec/VMX registers VR0-VR31). The bytes array is the
++ // ground truth: bytes[0] is the most-significant-byte on PPC64 big-endian
++ // numbering, i.e., VSR[MSB..LSB] mapped as bytes[0..15]. Callers that want
++ // typed views (lane 0 etc.) should extract from the bytes array according
++ // to the ISA's lane numbering for that instruction.
++ void setVRBytes(int vreg, const uint8_t bytes[16]);
++ void getVRBytes(int vreg, uint8_t bytes[16]) const;
++
++ // VSR (Vector-Scalar Register) accessors: unified 64-register namespace
++ // where VSR 0-31 aliases FPR 0-31 (DW0 is the FPR value, DW1 is
++ // architecturally undefined — we model it as zero on read, ignored on
++ // write) and VSR 32-63 aliases VR 0-31. Used by VSX instructions
++ // (xxpermdi, xxlor, xxlxor, mtvsrd, mfvsrd, ...).
++ void getVSR128(int vsr, uint8_t bytes[16]) const;
++ void setVSR128(int vsr, const uint8_t bytes[16]);
++
++ // SPR accessors.
++ int64_t getLR() const { return LR_; }
++ void setLR(int64_t value) { LR_ = value; }
++ int64_t getCTR() const { return CTR_; }
++ void setCTR(int64_t value) { CTR_ = value; }
++ uint32_t getCR() const { return CR_; }
++ void setCR(uint32_t value) { CR_ = value; }
++ uint64_t getXER() const { return XER_; }
++ void setXER(uint64_t value) { XER_ = value; }
++ uint64_t getFPSCR() const { return FPSCR_; }
++ void setFPSCR(uint64_t value) { FPSCR_ = value; }
++
++ // CR field accessors: field 0 is the most significant nibble (bits 31:28).
++ uint8_t getCRField(int field) const {
++ return (CR_ >> (4 * (7 - field))) & 0xF;
++ }
++ void setCRField(int field, uint8_t val) {
++ uint32_t shift = 4 * (7 - field);
++ CR_ = (CR_ & ~(0xFu << shift)) | ((val & 0xFu) << shift);
++ }
++
++ // XER bit accessors.
++ bool getXERSO() const { return (XER_ >> kXERSOBit) & 1; }
++ void setXERSO(bool v) {
++ XER_ = (XER_ & ~(1ull << kXERSOBit)) | ((uint64_t)v << kXERSOBit);
++ }
++ bool getXEROV() const { return (XER_ >> kXEROVBit) & 1; }
++ void setXEROV(bool v) {
++ XER_ = (XER_ & ~(1ull << kXEROVBit)) | ((uint64_t)v << kXEROVBit);
++ // Mirror to OV32. Real POWER9 silicon sets OV32 == OV for both 32-bit
++ // and 64-bit overflow ops: mulldo(2, 2^62) produces OV=OV32=1;
++ // mulldo(2^30, 4) produces OV=OV32=0. The JIT's
++ // POWER9 Overflow path is `mulldo + mcrxrx + bc Overflow`, where
++ // mcrxrx places OV32 in the GT slot and the Overflow condition tests
++ // GT — so OV32 must be live or no-overflow is reported even when
++ // OV=1. Without this mirror, BigInt fast-path mul silently wraps.
++ XER_ = (XER_ & ~(1ull << kXEROV32Bit)) | ((uint64_t)v << kXEROV32Bit);
++ if (v) setXERSO(true);
++ }
++ bool getXERCA() const { return (XER_ >> kXERCABit) & 1; }
++ void setXERCA(bool v) {
++ XER_ = (XER_ & ~(1ull << kXERCABit)) | ((uint64_t)v << kXERCABit);
++ }
++
++ // PC accessors.
++ void set_pc(int64_t value);
++ int64_t get_pc() const;
++
++ template <typename T>
++ T get_pc_as() const {
++ return reinterpret_cast<T>(get_pc());
++ }
++
++ void enable_single_stepping(SingleStepCallback cb, void* arg);
++ void disable_single_stepping();
++
++ uintptr_t stackLimit() const;
++ bool overRecursed(uintptr_t newsp = 0) const;
++ bool overRecursedWithExtra(uint32_t extra) const;
++
++ template <bool enableStopSimAt>
++ void execute();
++
++ int64_t call(uint8_t* entry, int argument_count, ...);
++
++ uintptr_t pushAddress(uintptr_t address);
++ uintptr_t popAddress();
++
++ void setLastDebuggerInput(char* input);
++ char* lastDebuggerInput() { return lastDebuggerInput_; }
++
++ bool has_bad_pc() const;
++
++ // Update CR field 0 from a 64-bit result.
++ void updateCR0(int64_t result) {
++ uint8_t field = kCRFieldSO * getXERSO();
++ if (result < 0)
++ field |= kCRFieldLT;
++ else if (result > 0)
++ field |= kCRFieldGT;
++ else
++ field |= kCRFieldEQ;
++ setCRField(0, field);
++ }
++
++ // Update CR field 0 from a 32-bit result (sign-extended comparison).
++ void updateCR0_32(int32_t result) {
++ uint8_t field = kCRFieldSO * getXERSO();
++ if (result < 0)
++ field |= kCRFieldLT;
++ else if (result > 0)
++ field |= kCRFieldGT;
++ else
++ field |= kCRFieldEQ;
++ setCRField(0, field);
++ }
++
++ // Compare and set an arbitrary CR field.
++ void setCRFieldCmp(int field, int64_t lhs, int64_t rhs) {
++ uint8_t val = kCRFieldSO * getXERSO();
++ if (lhs < rhs)
++ val |= kCRFieldLT;
++ else if (lhs > rhs)
++ val |= kCRFieldGT;
++ else
++ val |= kCRFieldEQ;
++ setCRField(field, val);
++ }
++
++ void setCRFieldCmpU(int field, uint64_t lhs, uint64_t rhs) {
++ uint8_t val = kCRFieldSO * getXERSO();
++ if (lhs < rhs)
++ val |= kCRFieldLT;
++ else if (lhs > rhs)
++ val |= kCRFieldGT;
++ else
++ val |= kCRFieldEQ;
++ setCRField(field, val);
++ }
++
++ private:
++ enum SpecialValues {
++ // PPC64 masks the low 2 bits of branch targets, so these must be
++ // 4-byte aligned to survive the & ~3 mask in blr/bcctr.
++ bad_ra = -4,
++ end_sim_pc = -8,
++ Unpredictable = 0xbadbeaf
++ };
++
++ bool init();
++
++ void format(SimInstruction* instr, const char* format);
++
++ // Memory access.
++ inline uint8_t readBU(uint64_t addr);
++ inline int8_t readB(uint64_t addr);
++ inline void writeB(uint64_t addr, uint8_t value);
++ inline void writeB(uint64_t addr, int8_t value);
++
++ inline uint16_t readHU(uint64_t addr, SimInstruction* instr);
++ inline int16_t readH(uint64_t addr, SimInstruction* instr);
++ inline void writeH(uint64_t addr, uint16_t value, SimInstruction* instr);
++ inline void writeH(uint64_t addr, int16_t value, SimInstruction* instr);
++
++ inline uint32_t readWU(uint64_t addr, SimInstruction* instr);
++ inline int32_t readW(uint64_t addr, SimInstruction* instr);
++ inline void writeW(uint64_t addr, uint32_t value, SimInstruction* instr);
++ inline void writeW(uint64_t addr, int32_t value, SimInstruction* instr);
++
++ inline int64_t readDW(uint64_t addr, SimInstruction* instr);
++ inline void writeDW(uint64_t addr, int64_t value, SimInstruction* instr);
++
++ inline double readD(uint64_t addr, SimInstruction* instr);
++ inline void writeD(uint64_t addr, double value, SimInstruction* instr);
++
++ inline uint8_t loadLinkedB(uint64_t addr, SimInstruction* instr);
++ inline int storeConditionalB(uint64_t addr, uint8_t value,
++ SimInstruction* instr);
++ inline uint16_t loadLinkedH(uint64_t addr, SimInstruction* instr);
++ inline int storeConditionalH(uint64_t addr, uint16_t value,
++ SimInstruction* instr);
++ inline int32_t loadLinkedW(uint64_t addr, SimInstruction* instr);
++ inline int storeConditionalW(uint64_t addr, int32_t value,
++ SimInstruction* instr);
++ inline int64_t loadLinkedD(uint64_t addr, SimInstruction* instr);
++ inline int storeConditionalD(uint64_t addr, int64_t value,
++ SimInstruction* instr);
++
++ // Instruction decoders.
++ void decodeDFormALU(SimInstruction* instr);
++ void decodeDFormLoad(SimInstruction* instr);
++ void decodeDFormStore(SimInstruction* instr);
++ void decodeDSForm(SimInstruction* instr);
++ void decodeXForm(SimInstruction* instr);
++ void decodeRotateMask(SimInstruction* instr);
++ void decodeBranch(SimInstruction* instr);
++ void decodeFP(SimInstruction* instr);
++ void decodeVSX(SimInstruction* instr);
++ void decodeVMX(SimInstruction* instr);
++ // Power ISA v3.1 prefixed instructions. `prefix` points at the
++ // 4-byte prefix word; the suffix is read from `prefix + 4`.
++ void decodePrefixed(SimInstruction* prefix);
++
++ void softwareInterrupt(SimInstruction* instr);
++
++ // Stop/breakpoint helpers.
++ bool isWatchpoint(uint32_t code);
++ void printWatchpoint(uint32_t code);
++ void handleStop(uint32_t code, SimInstruction* instr);
++ bool isStopInstruction(SimInstruction* instr);
++ bool isEnabledStop(uint32_t code);
++ void enableStop(uint32_t code);
++ void disableStop(uint32_t code);
++ void increaseStopCounter(uint32_t code);
++ void printStopInfo(uint32_t code);
++
++ JS::ProfilingFrameIterator::RegisterState registerState();
++
++ bool MOZ_ALWAYS_INLINE handleWasmSegFault(uint64_t addr, unsigned numBytes) {
++ if (MOZ_LIKELY(!js::wasm::CodeExists)) {
++ return false;
++ }
++ uint8_t* newPC;
++ if (!js::wasm::MemoryAccessTraps(registerState(), (uint8_t*)addr, numBytes,
++ &newPC)) {
++ return false;
++ }
++ LLBit_ = false;
++ set_pc(int64_t(newPC));
++ return true;
++ }
++
++ void instructionDecode(SimInstruction* instr);
++
++ public:
++ static int64_t StopSimAt;
++
++ static void* RedirectNativeFunction(void* nativeFunction,
++ ABIFunctionType type);
++
++ private:
++ void setCallResultDouble(double result);
++ void setCallResultFloat(float result);
++ void setCallResult(int64_t res);
++# ifdef XP_DARWIN
++ void setCallResult(intptr_t res);
++# endif
++ void setCallResult(__int128 res);
++
++ void callInternal(uint8_t* entry);
++
++ // Architecture state.
++ int64_t registers_[kNumSimuRegisters];
++ int64_t FPUregisters_[kNumFPURegisters];
++ // VR namespace (Altivec/VMX registers VR0-VR31 == VSR32-63). Stored as
++ // 16 raw bytes per register to preserve exact architectural byte order
++ // independent of host endianness. Accessors defined below; the bytes
++ // array is the ground truth.
++ uint8_t VRregisters_[kNumVRRegisters][16];
++
++ // PPC64 Special Purpose Registers.
++ int64_t LR_;
++ int64_t CTR_;
++ uint32_t CR_;
++ uint64_t XER_;
++ uint64_t FPSCR_;
++
++ // Atomics.
++ bool LLBit_;
++ uintptr_t LLAddr_;
++ int64_t lastLLValue_;
++
++ // Simulator support.
++ char* stack_;
++ uintptr_t stackLimit_;
++ bool pc_modified_;
++ int64_t icount_;
++ int64_t break_count_;
++
++ char* lastDebuggerInput_;
++
++ SimInstruction* break_pc_;
++ Instr break_instr_;
++
++ bool single_stepping_;
++ SingleStepCallback single_step_callback_;
++ void* single_step_callback_arg_;
++
++ static const uint32_t kNumOfWatchedStops = 256;
++ static const uint32_t kStopDisabledBit = 1U << 31;
++
++ struct StopCountAndDesc {
++ uint32_t count_;
++ char* desc_;
++ };
++ StopCountAndDesc watchedStops_[kNumOfWatchedStops];
++};
++
++// Process-wide simulator state.
++class SimulatorProcess {
++ friend class Redirection;
++ friend class AutoLockSimulatorCache;
++
++ private:
++ struct ICacheHasher {
++ typedef void* Key;
++ typedef void* Lookup;
++ static HashNumber hash(const Lookup& l);
++ static bool match(const Key& k, const Lookup& l);
++ };
++
++ public:
++ typedef HashMap<void*, CachePage*, ICacheHasher, SystemAllocPolicy> ICacheMap;
++
++ static mozilla::Atomic<size_t, mozilla::ReleaseAcquire>
++ ICacheCheckingDisableCount;
++ static void FlushICache(void* start, size_t size);
++ static void checkICacheLocked(SimInstruction* instr);
++
++ static bool initialize() {
++ singleton_ = js_new<SimulatorProcess>();
++ return singleton_;
++ }
++ static void destroy() {
++ js_delete(singleton_);
++ singleton_ = nullptr;
++ }
++
++ SimulatorProcess();
++ ~SimulatorProcess();
++
++ private:
++ static SimulatorProcess* singleton_;
++
++ Mutex cacheLock_;
++ Redirection* redirection_;
++ ICacheMap icache_;
++
++ public:
++ static ICacheMap& icache() {
++ singleton_->cacheLock_.assertOwnedByCurrentThread();
++ return singleton_->icache_;
++ }
++
++ static Redirection* redirection() {
++ singleton_->cacheLock_.assertOwnedByCurrentThread();
++ return singleton_->redirection_;
++ }
++
++ static void setRedirection(js::jit::Redirection* redirection) {
++ singleton_->cacheLock_.assertOwnedByCurrentThread();
++ singleton_->redirection_ = redirection;
++ }
++};
++
++} // namespace jit
++} // namespace js
++
++#endif /* JS_SIMULATOR_PPC64 */
++
++#endif /* jit_ppc64_Simulator_ppc64_h */
+diff --git a/js/src/jit/ppc64/Trampoline-ppc64.cpp b/js/src/jit/ppc64/Trampoline-ppc64.cpp
+new file mode 100644
+index 000000000000..515a931c86b0
+--- /dev/null
++++ b/js/src/jit/ppc64/Trampoline-ppc64.cpp
+@@ -0,0 +1,648 @@
++/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
++ * vim: set ts=8 sts=2 et sw=2 tw=80:
++ * This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#include "jit/Bailouts.h"
++#include "jit/BaselineFrame.h"
++#include "jit/CalleeToken.h"
++#include "jit/JitFrames.h"
++#include "jit/JitRuntime.h"
++#include "jit/PerfSpewer.h"
++#include "jit/ppc64/SharedICHelpers-ppc64.h"
++#include "jit/VMFunctions.h"
++#include "vm/JitActivation.h"
++#include "vm/JSContext.h"
++
++#include "jit/MacroAssembler-inl.h"
++
++using namespace js;
++using namespace js::jit;
++
++// Float (Single+Double) and all GPRs. Simd128 excluded — Ion compiles JS
++// (no v128 type), so SIMD regs are never live at bailout / invalidator /
++// preBarrier entry. Including them would force the bailout frame's
++// FPUArray to hold v128 slots that Ion never writes.
++static const LiveRegisterSet AllRegs = LiveRegisterSet(
++ GeneralRegisterSet(Registers::AllMask),
++ FloatRegisterSet(FloatRegisters::AllSingleMask |
++ FloatRegisters::AllDoubleMask));
++
++static_assert(sizeof(uintptr_t) == sizeof(uint64_t), "Not 64-bit clean.");
++
++// PPC64 ELFv2 callee-saved: GPRs r14-r31, FPRs f14-f31, VRs VR20-VR31, LR.
++// We also save reg_vp (r10 / IntArgReg7) so we can use it after the JIT call.
++//
++// Layout is alignas(16) so that after `reserveStack(sizeof(EnterJITRegs))`
++// the SP-relative offset of every VR slot is 16-byte aligned, satisfying
++// the 16-byte alignment requirement of stxvd2x / stvx (stvx is technically
++// alignment-tolerant, but we'd rather align by construction). Padding at
++// the end keeps sizeof a multiple of 16 so SP stays quadword-aligned per
++// the ELFv2 stack-pointer rule.
++struct alignas(16) EnterJITRegs {
++ // VR20-VR31 first so their SP-relative offsets are 0, 16, 32, ... — all
++ // 16-byte aligned regardless of what follows.
++ uint8_t vr20[16];
++ uint8_t vr21[16];
++ uint8_t vr22[16];
++ uint8_t vr23[16];
++ uint8_t vr24[16];
++ uint8_t vr25[16];
++ uint8_t vr26[16];
++ uint8_t vr27[16];
++ uint8_t vr28[16];
++ uint8_t vr29[16];
++ uint8_t vr30[16];
++ uint8_t vr31[16];
++
++ double f31;
++ double f30;
++ double f29;
++ double f28;
++ double f27;
++ double f26;
++ double f25;
++ double f24;
++ double f23;
++ double f22;
++ double f21;
++ double f20;
++ double f19;
++ double f18;
++ double f17;
++ double f16;
++ double f15;
++ double f14;
++
++ uint64_t r31; // FramePointer
++ uint64_t r30;
++ uint64_t r29;
++ uint64_t r28;
++ uint64_t r27;
++ uint64_t r26;
++ uint64_t r25;
++ uint64_t r24;
++ uint64_t r23;
++ uint64_t r22;
++ uint64_t r21;
++ uint64_t r20;
++ uint64_t r19;
++ uint64_t r18;
++ uint64_t r17;
++ uint64_t r16;
++ uint64_t r15;
++ uint64_t r14;
++ uint64_t r2; // TOC pointer
++ uint64_t lr;
++ // Save reg_vp (r10) on stack so we can use it after the JIT call returns.
++ uint64_t r10;
++};
++// alignas(16) on the struct ensures sizeof is a multiple of 16, which keeps
++// SP quadword-aligned after `reserveStack(sizeof(EnterJITRegs))`. The
++// existing fields total 312 bytes; with the 192 bytes of VR slots we are
++// at 504, which alignas(16) bumps to 512.
++static_assert((sizeof(EnterJITRegs) % 16) == 0,
++ "EnterJITRegs must be 16-byte aligned to keep SP aligned");
++
++static void GenerateReturn(MacroAssembler& masm) {
++ MOZ_ASSERT(masm.framePushed() == sizeof(EnterJITRegs));
++
++ // Restore non-volatile GPRs.
++ masm.as_ld(r14, StackPointer, offsetof(EnterJITRegs, r14));
++ masm.as_ld(r15, StackPointer, offsetof(EnterJITRegs, r15));
++ masm.as_ld(r16, StackPointer, offsetof(EnterJITRegs, r16));
++ masm.as_ld(r17, StackPointer, offsetof(EnterJITRegs, r17));
++ masm.as_ld(r18, StackPointer, offsetof(EnterJITRegs, r18));
++ masm.as_ld(r19, StackPointer, offsetof(EnterJITRegs, r19));
++ masm.as_ld(r20, StackPointer, offsetof(EnterJITRegs, r20));
++ masm.as_ld(r21, StackPointer, offsetof(EnterJITRegs, r21));
++ masm.as_ld(r22, StackPointer, offsetof(EnterJITRegs, r22));
++ masm.as_ld(r23, StackPointer, offsetof(EnterJITRegs, r23));
++ masm.as_ld(r24, StackPointer, offsetof(EnterJITRegs, r24));
++ masm.as_ld(r25, StackPointer, offsetof(EnterJITRegs, r25));
++ masm.as_ld(r26, StackPointer, offsetof(EnterJITRegs, r26));
++ masm.as_ld(r27, StackPointer, offsetof(EnterJITRegs, r27));
++ masm.as_ld(r28, StackPointer, offsetof(EnterJITRegs, r28));
++ masm.as_ld(r29, StackPointer, offsetof(EnterJITRegs, r29));
++ masm.as_ld(r30, StackPointer, offsetof(EnterJITRegs, r30));
++ masm.as_ld(r31, StackPointer, offsetof(EnterJITRegs, r31));
++ masm.as_ld(r2, StackPointer, offsetof(EnterJITRegs, r2));
++
++ // Restore LR.
++ masm.as_ld(r0, StackPointer, offsetof(EnterJITRegs, lr));
++ masm.xs_mtlr(r0);
++
++ // Restore non-volatile FPRs.
++ masm.as_lfd(f14, StackPointer, offsetof(EnterJITRegs, f14));
++ masm.as_lfd(f15, StackPointer, offsetof(EnterJITRegs, f15));
++ masm.as_lfd(f16, StackPointer, offsetof(EnterJITRegs, f16));
++ masm.as_lfd(f17, StackPointer, offsetof(EnterJITRegs, f17));
++ masm.as_lfd(f18, StackPointer, offsetof(EnterJITRegs, f18));
++ masm.as_lfd(f19, StackPointer, offsetof(EnterJITRegs, f19));
++ masm.as_lfd(f20, StackPointer, offsetof(EnterJITRegs, f20));
++ masm.as_lfd(f21, StackPointer, offsetof(EnterJITRegs, f21));
++ masm.as_lfd(f22, StackPointer, offsetof(EnterJITRegs, f22));
++ masm.as_lfd(f23, StackPointer, offsetof(EnterJITRegs, f23));
++ masm.as_lfd(f24, StackPointer, offsetof(EnterJITRegs, f24));
++ masm.as_lfd(f25, StackPointer, offsetof(EnterJITRegs, f25));
++ masm.as_lfd(f26, StackPointer, offsetof(EnterJITRegs, f26));
++ masm.as_lfd(f27, StackPointer, offsetof(EnterJITRegs, f27));
++ masm.as_lfd(f28, StackPointer, offsetof(EnterJITRegs, f28));
++ masm.as_lfd(f29, StackPointer, offsetof(EnterJITRegs, f29));
++ masm.as_lfd(f30, StackPointer, offsetof(EnterJITRegs, f30));
++ masm.as_lfd(f31, StackPointer, offsetof(EnterJITRegs, f31));
++
++ // Restore callee-saved VR20-VR31 (ELFv2). lvx uses indexed addressing
++ // (RA + RB), and r0's value is used here as RB (RA = StackPointer is
++ // non-zero, so its value is added). r0 is non-allocatable.
++#define RESTORE_VR(N) \
++ masm.xs_li(r0, offsetof(EnterJITRegs, vr##N)); \
++ masm.as_lvx(N, StackPointer, r0)
++ RESTORE_VR(20); RESTORE_VR(21); RESTORE_VR(22); RESTORE_VR(23);
++ RESTORE_VR(24); RESTORE_VR(25); RESTORE_VR(26); RESTORE_VR(27);
++ RESTORE_VR(28); RESTORE_VR(29); RESTORE_VR(30); RESTORE_VR(31);
++#undef RESTORE_VR
++
++ masm.freeStack(sizeof(EnterJITRegs));
++
++ masm.as_blr();
++}
++
++static void GeneratePrologue(MacroAssembler& masm) {
++ // Save LR first (PPC64 LR is SPR, not GPR).
++ masm.xs_mflr(r0);
++
++ // ELFv2 prologue convention: save LR at caller's frame [SP+16] BEFORE
++ // decrementing SP. External unwinders (gdb, perf, libunwind) walk the
++ // stack by reading LR-save slots at [SP+16] of every frame; without
++ // this write they'd find junk at our caller's slot. Costs 1 extra
++ // instruction; we still keep the in-frame save below for clean
++ // restore symmetry.
++ masm.as_std(r0, StackPointer, 16);
++
++ masm.reserveStack(sizeof(EnterJITRegs));
++
++ // Save LR (also kept in our own frame for the clean restore in
++ // GenerateReturn — see comment there).
++ masm.as_std(r0, StackPointer, offsetof(EnterJITRegs, lr));
++
++ // Save non-volatile GPRs.
++ masm.as_std(r2, StackPointer, offsetof(EnterJITRegs, r2));
++ masm.as_std(r14, StackPointer, offsetof(EnterJITRegs, r14));
++ masm.as_std(r15, StackPointer, offsetof(EnterJITRegs, r15));
++ masm.as_std(r16, StackPointer, offsetof(EnterJITRegs, r16));
++ masm.as_std(r17, StackPointer, offsetof(EnterJITRegs, r17));
++ masm.as_std(r18, StackPointer, offsetof(EnterJITRegs, r18));
++ masm.as_std(r19, StackPointer, offsetof(EnterJITRegs, r19));
++ masm.as_std(r20, StackPointer, offsetof(EnterJITRegs, r20));
++ masm.as_std(r21, StackPointer, offsetof(EnterJITRegs, r21));
++ masm.as_std(r22, StackPointer, offsetof(EnterJITRegs, r22));
++ masm.as_std(r23, StackPointer, offsetof(EnterJITRegs, r23));
++ masm.as_std(r24, StackPointer, offsetof(EnterJITRegs, r24));
++ masm.as_std(r25, StackPointer, offsetof(EnterJITRegs, r25));
++ masm.as_std(r26, StackPointer, offsetof(EnterJITRegs, r26));
++ masm.as_std(r27, StackPointer, offsetof(EnterJITRegs, r27));
++ masm.as_std(r28, StackPointer, offsetof(EnterJITRegs, r28));
++ masm.as_std(r29, StackPointer, offsetof(EnterJITRegs, r29));
++ masm.as_std(r30, StackPointer, offsetof(EnterJITRegs, r30));
++ masm.as_std(r31, StackPointer, offsetof(EnterJITRegs, r31));
++
++ // Save reg_vp (r10) so we can retrieve it after the JIT call.
++ masm.as_std(r10, StackPointer, offsetof(EnterJITRegs, r10));
++
++ // Save non-volatile FPRs.
++ masm.as_stfd(f14, StackPointer, offsetof(EnterJITRegs, f14));
++ masm.as_stfd(f15, StackPointer, offsetof(EnterJITRegs, f15));
++ masm.as_stfd(f16, StackPointer, offsetof(EnterJITRegs, f16));
++ masm.as_stfd(f17, StackPointer, offsetof(EnterJITRegs, f17));
++ masm.as_stfd(f18, StackPointer, offsetof(EnterJITRegs, f18));
++ masm.as_stfd(f19, StackPointer, offsetof(EnterJITRegs, f19));
++ masm.as_stfd(f20, StackPointer, offsetof(EnterJITRegs, f20));
++ masm.as_stfd(f21, StackPointer, offsetof(EnterJITRegs, f21));
++ masm.as_stfd(f22, StackPointer, offsetof(EnterJITRegs, f22));
++ masm.as_stfd(f23, StackPointer, offsetof(EnterJITRegs, f23));
++ masm.as_stfd(f24, StackPointer, offsetof(EnterJITRegs, f24));
++ masm.as_stfd(f25, StackPointer, offsetof(EnterJITRegs, f25));
++ masm.as_stfd(f26, StackPointer, offsetof(EnterJITRegs, f26));
++ masm.as_stfd(f27, StackPointer, offsetof(EnterJITRegs, f27));
++ masm.as_stfd(f28, StackPointer, offsetof(EnterJITRegs, f28));
++ masm.as_stfd(f29, StackPointer, offsetof(EnterJITRegs, f29));
++ masm.as_stfd(f30, StackPointer, offsetof(EnterJITRegs, f30));
++ masm.as_stfd(f31, StackPointer, offsetof(EnterJITRegs, f31));
++
++ // Save callee-saved VR20-VR31 (ELFv2). The JIT freely uses VMX registers
++ // via EmitVmxBinary etc.; without this save the C caller's VR20-VR31
++ // contents would be trashed on return. stvx uses indexed addressing —
++ // r0 holds the offset (non-allocatable in JIT regalloc; safe to use as
++ // a free temp here).
++#define SAVE_VR(N) \
++ masm.xs_li(r0, offsetof(EnterJITRegs, vr##N)); \
++ masm.as_stvx(N, StackPointer, r0)
++ SAVE_VR(20); SAVE_VR(21); SAVE_VR(22); SAVE_VR(23);
++ SAVE_VR(24); SAVE_VR(25); SAVE_VR(26); SAVE_VR(27);
++ SAVE_VR(28); SAVE_VR(29); SAVE_VR(30); SAVE_VR(31);
++#undef SAVE_VR
++}
++
++void JitRuntime::generateEnterJIT(JSContext* cx, MacroAssembler& masm) {
++ AutoCreatedBy acb(masm, "JitRuntime::generateEnterJIT");
++
++ enterJITOffset_ = startTrampolineCode(masm);
++
++ // EnterJitCode signature: (void* code, unsigned argc, Value* argv,
++ // InterpreterFrame* fp, CalleeToken calleeToken,
++ // JSObject* envChain, size_t numStackValues,
++ // Value* vp)
++ const Register reg_code = IntArgReg0; // r3
++ const Register reg_argc = IntArgReg1; // r4
++ const Register reg_argv = IntArgReg2; // r5
++ const mozilla::DebugOnly<Register> reg_frame = IntArgReg3; // r6
++ const Register reg_token = IntArgReg4; // r7
++ const Register reg_chain = IntArgReg5; // r8
++ const Register reg_values = IntArgReg6; // r9
++ const Register reg_vp = IntArgReg7; // r10
++
++ MOZ_ASSERT(OsrFrameReg == reg_frame);
++
++ GeneratePrologue(masm);
++
++ // Save stack pointer as baseline frame.
++ masm.movePtr(StackPointer, FramePointer);
++
++ // Use non-volatile scratch registers for generateEnterJitShared.
++ // r14, r15, r17 are non-volatile and not special-purpose in JIT.
++ generateEnterJitShared(masm, reg_argc, reg_argv, reg_token, r14, r15, r17);
++
++ // Push the descriptor.
++ masm.unboxInt32(Address(reg_vp, 0), r14);
++ masm.pushFrameDescriptorForJitCall(FrameType::CppToJSJit, r14, r14);
++
++ CodeLabel returnLabel;
++ Label oomReturnLabel;
++ {
++ // Handle Interpreter -> Baseline OSR.
++ AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All());
++ MOZ_ASSERT(!regs.has(FramePointer));
++ regs.take(OsrFrameReg);
++ regs.take(reg_code);
++ MOZ_ASSERT(!regs.has(ReturnReg), "ReturnReg matches reg_code");
++
++ Label notOsr;
++ masm.branchTestPtr(Assembler::Zero, OsrFrameReg, OsrFrameReg, ¬Osr);
++
++ Register numStackValues = reg_values;
++ regs.take(numStackValues);
++ Register scratch = regs.takeAny();
++
++ // Push return address.
++ masm.subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
++ masm.mov(&returnLabel, scratch);
++ masm.storePtr(scratch, Address(StackPointer, 0));
++
++ // Push previous frame pointer.
++ masm.subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
++ masm.storePtr(FramePointer, Address(StackPointer, 0));
++
++ // Reserve frame.
++ Register framePtr = FramePointer;
++ masm.movePtr(StackPointer, framePtr);
++ masm.subPtr(Imm32(BaselineFrame::Size()), StackPointer);
++
++ Register framePtrScratch = regs.takeAny();
++ masm.movePtr(StackPointer, framePtrScratch);
++
++ // Reserve space for locals and stack values.
++ masm.x_sldi(scratch, numStackValues, 3);
++ masm.subPtr(scratch, StackPointer);
++
++ // Enter exit frame.
++ masm.reserveStack(3 * sizeof(uintptr_t));
++ masm.storePtr(ImmWord(MakeFrameDescriptor(FrameType::BaselineJS)),
++ Address(StackPointer, 2 * sizeof(uintptr_t)));
++ masm.storePtr(ImmPtr(nullptr), Address(StackPointer, sizeof(uintptr_t)));
++ masm.storePtr(FramePointer, Address(StackPointer, 0));
++
++ // No GC things to mark, push a bare token.
++ masm.loadJSContext(scratch);
++ masm.enterFakeExitFrame(scratch, scratch, ExitFrameType::Bare);
++
++ masm.reserveStack(2 * sizeof(uintptr_t));
++ masm.storePtr(framePtr, Address(StackPointer, sizeof(uintptr_t)));
++ masm.storePtr(reg_code, Address(StackPointer, 0));
++
++ using Fn = void (*)(BaselineFrame* frame, InterpreterFrame* interpFrame,
++ uint32_t numStackValues);
++ masm.setupUnalignedABICall(scratch);
++ masm.passABIArg(framePtrScratch);
++ masm.passABIArg(OsrFrameReg);
++ masm.passABIArg(numStackValues);
++ masm.callWithABI<Fn, jit::InitBaselineFrameForOsr>(
++ ABIType::General, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
++
++ regs.add(OsrFrameReg);
++ Register jitcode = regs.takeAny();
++ masm.loadPtr(Address(StackPointer, 0), jitcode);
++ masm.loadPtr(Address(StackPointer, sizeof(uintptr_t)), framePtr);
++ masm.freeStack(2 * sizeof(uintptr_t));
++
++ masm.freeStack(ExitFrameLayout::SizeWithFooter());
++
++ // If OSR-ing, then emit instrumentation for setting lastProfilerFrame
++ // if profiler instrumentation is enabled.
++ {
++ Label skipProfilingInstrumentation;
++ AbsoluteAddress addressOfEnabled(
++ cx->runtime()->geckoProfiler().addressOfEnabled());
++ masm.branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
++ &skipProfilingInstrumentation);
++ masm.profilerEnterFrame(framePtr, scratch);
++ masm.bind(&skipProfilingInstrumentation);
++ }
++
++ masm.jump(jitcode);
++
++ masm.bind(¬Osr);
++ // Load the scope chain in R1.
++ MOZ_ASSERT(R1.scratchReg() != reg_code);
++ masm.movePtr(reg_chain, R1.scratchReg());
++ }
++
++ // The call will push the return address and frame pointer on the stack, thus
++ // we check that the stack would be aligned once the call is complete.
++ masm.assertStackAlignment(JitStackAlignment, 2 * sizeof(uintptr_t));
++
++ // Call the function with pushing return address to stack.
++ masm.callJitNoProfiler(reg_code);
++
++ {
++ // Interpreter -> Baseline OSR will return here.
++ masm.bind(&returnLabel);
++ masm.addCodeLabel(returnLabel);
++ masm.bind(&oomReturnLabel);
++ }
++
++ // Discard arguments and padding. Set sp to the address of the EnterJITRegs
++ // on the stack.
++ masm.movePtr(FramePointer, StackPointer);
++
++ // Store the returned value into the vp.
++ masm.as_ld(reg_vp, StackPointer, offsetof(EnterJITRegs, r10));
++ masm.storeValue(JSReturnOperand, Address(reg_vp, 0));
++
++ // Restore non-volatile registers and return.
++ GenerateReturn(masm);
++}
++
++// static
++mozilla::Maybe<::JS::ProfilingFrameIterator::RegisterState>
++JitRuntime::getCppEntryRegisters(JitFrameLayout* frameStackAddress) {
++ return mozilla::Nothing{};
++}
++
++void JitRuntime::generateInvalidator(MacroAssembler& masm, Label* bailoutTail) {
++ AutoCreatedBy acb(masm, "JitRuntime::generateInvalidator");
++
++ invalidatorOffset_ = startTrampolineCode(masm);
++
++ masm.checkStackAlignment();
++
++ // Push all registers so we can access them from [base + code].
++ masm.PushRegsInMask(AllRegs);
++
++ // Pass pointer to InvalidationBailoutStack structure.
++ masm.movePtr(StackPointer, IntArgReg0);
++
++ // Reserve place for BailoutInfo pointer. Two words to ensure alignment for
++ // setupAlignedABICall.
++ masm.subPtr(Imm32(2 * sizeof(uintptr_t)), StackPointer);
++ masm.movePtr(StackPointer, IntArgReg1);
++
++ using Fn = bool (*)(InvalidationBailoutStack* sp, BaselineBailoutInfo** info);
++ masm.setupAlignedABICall();
++ masm.passABIArg(IntArgReg0);
++ masm.passABIArg(IntArgReg1);
++ masm.callWithABI<Fn, InvalidationBailout>(
++ ABIType::General, CheckUnsafeCallWithABI::DontCheckOther);
++
++ masm.pop(IntArgReg2);
++
++ // Pop the machine state and the dead frame.
++ masm.moveToStackPtr(FramePointer);
++
++ // Jump to shared bailout tail. The BailoutInfo pointer has to be in
++ // IntArgReg2 (r5).
++ masm.jump(bailoutTail);
++}
++
++// When bailout is done via out of line code (lazy bailout).
++// Frame size is stored in LR (look at
++// CodeGeneratorPPC64::generateOutOfLineCode()) and thunk code should save it
++// on stack.
++static void PushBailoutFrame(MacroAssembler& masm, Register spArg) {
++ // Push the frameSize_ stored in LR.
++ // See: CodeGeneratorPPC64::generateOutOfLineCode()
++ masm.pushReturnAddress();
++
++ // Push registers such that we can access them from [base + code].
++ masm.PushRegsInMask(AllRegs);
++
++ // Put pointer to BailoutStack as first argument to the Bailout().
++ masm.movePtr(StackPointer, spArg);
++}
++
++static void GenerateBailoutThunk(MacroAssembler& masm, Label* bailoutTail) {
++ PushBailoutFrame(masm, IntArgReg0);
++
++ // Make space for Bailout's bailoutInfo outparam.
++ masm.reserveStack(sizeof(void*));
++ masm.movePtr(StackPointer, IntArgReg1);
++
++ // Call the bailout function.
++ using Fn = bool (*)(BailoutStack* sp, BaselineBailoutInfo** info);
++ masm.setupUnalignedABICall(IntArgReg2);
++ masm.passABIArg(IntArgReg0);
++ masm.passABIArg(IntArgReg1);
++ masm.callWithABI<Fn, Bailout>(ABIType::General,
++ CheckUnsafeCallWithABI::DontCheckOther);
++
++ // Get the bailoutInfo outparam.
++ masm.pop(IntArgReg2);
++
++ // Remove both the bailout frame and the topmost Ion frame's stack.
++ masm.moveToStackPtr(FramePointer);
++
++ // Jump to shared bailout tail. The BailoutInfo pointer has to be in
++ // IntArgReg2 (r5).
++ masm.jump(bailoutTail);
++}
++
++void JitRuntime::generateBailoutHandler(MacroAssembler& masm,
++ Label* bailoutTail) {
++ AutoCreatedBy acb(masm, "JitRuntime::generateBailoutHandler");
++
++ bailoutHandlerOffset_ = startTrampolineCode(masm);
++
++ GenerateBailoutThunk(masm, bailoutTail);
++}
++
++bool JitRuntime::generateVMWrapper(JSContext* cx, MacroAssembler& masm,
++ VMFunctionId id, const VMFunctionData& f,
++ DynFn nativeFun, uint32_t* wrapperOffset) {
++ AutoCreatedBy acb(masm, "JitRuntime::generateVMWrapper");
++
++ *wrapperOffset = startTrampolineCode(masm);
++
++ // Avoid conflicts with argument registers while discarding the result after
++ // the function call.
++ AllocatableGeneralRegisterSet regs(Register::Codes::WrapperMask);
++
++ static_assert(
++ (Register::Codes::VolatileMask & ~Register::Codes::WrapperMask) == 0,
++ "Wrapper register set should be a superset of Volatile register set.");
++
++ // The context is the first argument; r3 is the first argument register.
++ Register cxreg = IntArgReg0;
++ regs.take(cxreg);
++
++ // On link-register platforms, it is the responsibility of the VM *callee* to
++ // push the return address, while the caller must ensure that the address
++ // is stored in LR on entry. This allows the VM wrapper to work with both
++ // direct calls and tail calls.
++ masm.pushReturnAddress();
++
++ // Push the frame pointer to finish the exit frame, then link it up.
++ masm.Push(FramePointer);
++ masm.moveStackPtrTo(FramePointer);
++ masm.loadJSContext(cxreg);
++ masm.enterExitFrame(cxreg, regs.getAny(), id);
++
++ // Reserve space for the outparameter.
++ masm.reserveVMFunctionOutParamSpace(f);
++
++ masm.setupUnalignedABICallDontSaveRestoreSP();
++ masm.passABIArg(cxreg);
++
++ size_t argDisp = ExitFrameLayout::Size();
++
++ // Copy any arguments.
++ for (uint32_t explicitArg = 0; explicitArg < f.explicitArgs; explicitArg++) {
++ switch (f.argProperties(explicitArg)) {
++ case VMFunctionData::WordByValue:
++ if (f.argPassedInFloatReg(explicitArg)) {
++ masm.passABIArg(MoveOperand(FramePointer, argDisp), ABIType::Float64);
++ } else {
++ masm.passABIArg(MoveOperand(FramePointer, argDisp), ABIType::General);
++ }
++ argDisp += sizeof(void*);
++ break;
++ case VMFunctionData::WordByRef:
++ masm.passABIArg(MoveOperand(FramePointer, argDisp,
++ MoveOperand::Kind::EffectiveAddress),
++ ABIType::General);
++ argDisp += sizeof(void*);
++ break;
++ case VMFunctionData::DoubleByValue:
++ case VMFunctionData::DoubleByRef:
++ MOZ_CRASH("NYI: PPC64 callVM should not be used with 128bits values.");
++ break;
++ }
++ }
++
++ // Copy the implicit outparam, if any.
++ const int32_t outParamOffset =
++ -int32_t(ExitFooterFrame::Size()) - f.sizeOfOutParamStackSlot();
++ if (f.outParam != Type_Void) {
++ masm.passABIArg(MoveOperand(FramePointer, outParamOffset,
++ MoveOperand::Kind::EffectiveAddress),
++ ABIType::General);
++ }
++
++ masm.callWithABI(nativeFun, ABIType::General,
++ CheckUnsafeCallWithABI::DontCheckHasExitFrame);
++
++ // Test for failure.
++ switch (f.failType()) {
++ case Type_Cell:
++ masm.branchTestPtr(Assembler::Zero, IntArgReg0, IntArgReg0,
++ masm.failureLabel());
++ break;
++ case Type_Bool:
++ masm.branchIfFalseBool(IntArgReg0, masm.failureLabel());
++ break;
++ case Type_Void:
++ break;
++ default:
++ MOZ_CRASH("unknown failure kind");
++ }
++
++ // Load the outparam.
++ masm.loadVMFunctionOutParam(f, Address(FramePointer, outParamOffset));
++
++ // Pop frame and restore frame pointer.
++ masm.moveToStackPtr(FramePointer);
++ masm.pop(FramePointer);
++
++ // Return. Subtract sizeof(void*) for the frame pointer.
++ masm.retn(Imm32(sizeof(ExitFrameLayout) - sizeof(void*) +
++ f.explicitStackSlots() * sizeof(void*) +
++ f.extraValuesToPop * sizeof(Value)));
++
++ return true;
++}
++
++uint32_t JitRuntime::generatePreBarrier(JSContext* cx, MacroAssembler& masm,
++ MIRType type) {
++ AutoCreatedBy acb(masm, "JitRuntime::generatePreBarrier");
++
++ uint32_t offset = startTrampolineCode(masm);
++
++ MOZ_ASSERT(PreBarrierReg == IntArgReg1); // r4
++ Register temp1 = IntArgReg0; // r3
++ Register temp2 = IntArgReg2; // r5
++ Register temp3 = IntArgReg3; // r6
++ masm.push(temp1);
++ masm.push(temp2);
++ masm.push(temp3);
++
++ Label noBarrier;
++ masm.emitPreBarrierFastPath(type, temp1, temp2, temp3, &noBarrier);
++
++ // Call into C++ to mark this GC thing.
++ masm.pop(temp3);
++ masm.pop(temp2);
++ masm.pop(temp1);
++
++ LiveRegisterSet save;
++ save.set() = RegisterSet(GeneralRegisterSet(Registers::VolatileMask),
++ FloatRegisterSet(FloatRegisters::VolatileMask));
++ // On PPC64, save LR since we'll be making a call.
++ masm.pushReturnAddress();
++ masm.PushRegsInMask(save);
++
++ masm.movePtr(ImmPtr(cx->runtime()), IntArgReg0);
++
++ masm.setupUnalignedABICall(IntArgReg2);
++ masm.passABIArg(IntArgReg0);
++ masm.passABIArg(IntArgReg1);
++ masm.callWithABI(JitPreWriteBarrier(type));
++
++ masm.PopRegsInMask(save);
++ masm.ret();
++
++ masm.bind(&noBarrier);
++ masm.pop(temp3);
++ masm.pop(temp2);
++ masm.pop(temp1);
++ masm.abiret();
++
++ return offset;
++}
++
++void JitRuntime::generateBailoutTailStub(MacroAssembler& masm,
++ Label* bailoutTail) {
++ AutoCreatedBy acb(masm, "JitRuntime::generateBailoutTailStub");
++
++ masm.bind(bailoutTail);
++ masm.generateBailoutTail(IntArgReg1, IntArgReg2);
++}
+diff --git a/js/src/jit/shared/Assembler-shared.h b/js/src/jit/shared/Assembler-shared.h
+index d5fed2fabe31..490a9f5391e0 100644
+--- a/js/src/jit/shared/Assembler-shared.h
++++ b/js/src/jit/shared/Assembler-shared.h
+@@ -30,14 +30,15 @@
+
+ #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
+ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_RISCV64) || \
++ defined(JS_CODEGEN_PPC64)
+ // Push return addresses callee-side.
+ # define JS_USE_LINK_REGISTER
+ #endif
+
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_ARM64) || \
+ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
+- defined(JS_CODEGEN_ARM)
++ defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_PPC64)
+ // JS_CODELABEL_LINKMODE gives labels additional metadata
+ // describing how Bind() should patch them.
+ # define JS_CODELABEL_LINKMODE
+diff --git a/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h b/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
+index a6909e560bef..d886cba2c7e6 100644
+--- a/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
++++ b/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
+@@ -46,7 +46,8 @@
+ // code in this file.
+
+ #if defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_ARM) || \
+- defined(JS_SIMULATOR_MIPS64) || defined(JS_SIMULATOR_LOONG64)
++ defined(JS_SIMULATOR_MIPS64) || defined(JS_SIMULATOR_LOONG64) || \
++ defined(JS_SIMULATOR_PPC64)
+ // On some x86 (32-bit) systems this will not work because the compiler does not
+ // open-code 64-bit atomics. If so, try linking with -latomic. If that doesn't
+ // work, you're mostly on your own.
+diff --git a/js/src/jit/shared/CodeGenerator-shared.cpp b/js/src/jit/shared/CodeGenerator-shared.cpp
+index ada87f1f11a2..14468356cf31 100644
+--- a/js/src/jit/shared/CodeGenerator-shared.cpp
++++ b/js/src/jit/shared/CodeGenerator-shared.cpp
+@@ -86,8 +86,8 @@ CodeGeneratorShared::CodeGeneratorShared(MIRGenerator* gen, LIRGraph* graph,
+
+ #ifdef ENABLE_WASM_SIMD
+ # if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
+- defined(JS_CODEGEN_ARM64)
+- // On X64/x86 and ARM64, we don't need alignment for Wasm SIMD at this time.
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
++ // On X64/x86, ARM64, and PPC64, we don't need alignment for Wasm SIMD at this time.
+ # else
+ # error \
+ "we may need padding so that local slots are SIMD-aligned and the stack must be kept SIMD-aligned too."
+@@ -1075,7 +1075,7 @@ Label* CodeGeneratorShared::getJumpLabelForBranch(MBasicBlock* block) {
+ // This function is not used for MIPS64/LOONG64/RISCV64. They have
+ // branchToBlock.
+ #if !defined(JS_CODEGEN_MIPS64) && !defined(JS_CODEGEN_LOONG64) && \
+- !defined(JS_CODEGEN_RISCV64)
++ !defined(JS_CODEGEN_RISCV64) && !defined(JS_CODEGEN_PPC64)
+ void CodeGeneratorShared::jumpToBlock(MBasicBlock* mir,
+ Assembler::Condition cond) {
+ // Skip past trivial blocks.
+diff --git a/js/src/jit/shared/Lowering-shared-inl.h b/js/src/jit/shared/Lowering-shared-inl.h
+index bdcc1da7d41a..b62f8f681df1 100644
+--- a/js/src/jit/shared/Lowering-shared-inl.h
++++ b/js/src/jit/shared/Lowering-shared-inl.h
+@@ -527,7 +527,7 @@ LAllocation LIRGeneratorShared::useRegisterOrNonDoubleConstant(
+
+ #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
+ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ LAllocation LIRGeneratorShared::useAnyOrConstant(MDefinition* mir) {
+ return useRegisterOrConstant(mir);
+ }
+diff --git a/js/src/js-config.mozbuild b/js/src/js-config.mozbuild
+index 22becaf4ecfb..ff5294825e9d 100644
+--- a/js/src/js-config.mozbuild
++++ b/js/src/js-config.mozbuild
+@@ -8,6 +8,7 @@ if (
+ CONFIG["JS_CODEGEN_X64"]
+ or CONFIG["JS_CODEGEN_ARM64"]
+ or CONFIG["JS_CODEGEN_RISCV64"]
++ or CONFIG["JS_CODEGEN_PPC64"]
+ ):
+ DEFINES["WASM_SUPPORTS_HUGE_MEMORY"] = True
+
+diff --git a/js/src/jsapi-tests/testJitABIcalls.cpp b/js/src/jsapi-tests/testJitABIcalls.cpp
+index b5c03a47dd83..887ad9e3d959 100644
+--- a/js/src/jsapi-tests/testJitABIcalls.cpp
++++ b/js/src/jsapi-tests/testJitABIcalls.cpp
+@@ -718,6 +718,9 @@ class JitABICall final : public jsapitest::RuntimeTest,
+ #elif defined(JS_CODEGEN_RISCV64)
+ Register base = t0;
+ regs.take(base);
++#elif defined(JS_CODEGEN_PPC64)
++ Register base = r11;
++ regs.take(base);
+ #else
+ # error "Unknown architecture!"
+ #endif
+diff --git a/js/src/jsapi-tests/testWasmReturnCalls.cpp b/js/src/jsapi-tests/testWasmReturnCalls.cpp
+index 4728f2404ae8..a07ddb2f214e 100644
+--- a/js/src/jsapi-tests/testWasmReturnCalls.cpp
++++ b/js/src/jsapi-tests/testWasmReturnCalls.cpp
+@@ -32,7 +32,10 @@ BEGIN_TEST(testWasmCheckSlowCallMarkerHit) {
+
+ masm.bind(&check);
+ # ifdef JS_USE_LINK_REGISTER
+-# if !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
++# if defined(JS_CODEGEN_PPC64)
++ static constexpr Register ra = ABINonArgReg3;
++ masm.xs_mflr(ra);
++# elif !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
+ !defined(JS_CODEGEN_RISCV64)
+ static constexpr Register ra = lr;
+ # endif
+@@ -70,7 +73,10 @@ BEGIN_TEST(testWasmCheckSlowCallMarkerMiss) {
+
+ masm.bind(&check);
+ # ifdef JS_USE_LINK_REGISTER
+-# if !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
++# if defined(JS_CODEGEN_PPC64)
++ static constexpr Register ra = ABINonArgReg3;
++ masm.xs_mflr(ra);
++# elif !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
+ !defined(JS_CODEGEN_RISCV64)
+ static constexpr Register ra = lr;
+ # endif
+diff --git a/js/src/jsapi-tests/testsJit.cpp b/js/src/jsapi-tests/testsJit.cpp
+index a2dfe5d0196c..7f3dcca895d2 100644
+--- a/js/src/jsapi-tests/testsJit.cpp
++++ b/js/src/jsapi-tests/testsJit.cpp
+@@ -25,6 +25,14 @@ void PrepareJit(js::jit::MacroAssembler& masm) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+ defined(JS_CODEGEN_RISCV64)
+ save.add(js::jit::ra);
++#elif defined(JS_CODEGEN_PPC64)
++ // LR on PPC64 isn't a GPR; save it to the stack manually.
++ {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.xs_mflr(scratch);
++ masm.as_stdu(scratch, StackPointer, -8);
++ }
+ #elif defined(JS_USE_LINK_REGISTER)
+ save.add(js::jit::lr);
+ #endif
+@@ -44,6 +52,8 @@ bool ExecuteJit(JSContext* cx, js::jit::MacroAssembler& masm) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+ defined(JS_CODEGEN_RISCV64)
+ restore.add(js::jit::ra);
++#elif defined(JS_CODEGEN_PPC64)
++ // LR will be restored manually after PopRegsInMask.
+ #elif defined(JS_USE_LINK_REGISTER)
+ restore.add(js::jit::lr);
+ #endif
+@@ -55,6 +65,16 @@ bool ExecuteJit(JSContext* cx, js::jit::MacroAssembler& masm) {
+
+ // Reset stack pointer.
+ masm.SetStackPointer64(PseudoStackPointer64);
++#elif defined(JS_CODEGEN_PPC64)
++ // Restore LR from the stack and return.
++ {
++ UseScratchRegisterScope temps(masm);
++ Register scratch = temps.Acquire();
++ masm.as_ld(scratch, StackPointer, 0);
++ masm.xs_mtlr(scratch);
++ masm.as_addi(StackPointer, StackPointer, 8);
++ }
++ masm.as_blr();
+ #else
+ // Exit the JIT-ed code using the ABI return style.
+ masm.abiret();
+diff --git a/js/src/shell/js.cpp b/js/src/shell/js.cpp
+index 45bc0796b964..20eb1231bb7f 100644
+--- a/js/src/shell/js.cpp
++++ b/js/src/shell/js.cpp
+@@ -7895,6 +7895,13 @@ static void SingleStepCallback(void* arg, jit::Simulator* sim, void* pc) {
+ state.fp = (void*)sim->getRegister(jit::Simulator::fp);
+ // see WasmTailCallFPScratchReg and CollapseWasmFrameFast
+ state.tempFP = (void*)sim->getRegister(jit::Simulator::t3);
++# elif defined(JS_SIMULATOR_PPC64)
++ state.sp = (void*)sim->getRegister(jit::Simulator::sp);
++ state.lr = (void*)sim->getLR();
++ state.fp = (void*)sim->getRegister(jit::Simulator::fp);
++ // WasmTailCallFPScratchReg = ABINonArgReg3 = r22 holds the unwind FP
++ // during the wasm tail-call collapse window (RestoreFpRa unwind info).
++ state.tempFP = (void*)sim->getRegister(jit::Simulator::r22);
+ # else
+ # error "NYI: Single-step profiling support"
+ # endif
+@@ -13144,6 +13151,15 @@ bool InitOptionParser(OptionParser& op) {
+ "Stop the RISC-V simulator after the given "
+ "NUMBER of instructions.",
+ -1) ||
++#endif
++#ifdef JS_SIMULATOR_PPC64
++ !op.addBoolOption('\0', "ppc64-sim-icache-checks",
++ "Enable icache flush checks in the PPC64 "
++ "simulator.") ||
++ !op.addIntOption('\0', "ppc64-sim-stop-at", "NUMBER",
++ "Stop the PPC64 simulator after the given "
++ "NUMBER of instructions.",
++ -1) ||
+ #endif
+ !op.addIntOption('\0', "nursery-size", "SIZE-MB",
+ "Set the maximum nursery size in MB",
+@@ -14235,6 +14251,15 @@ bool SetContextJITOptions(JSContext* cx, const OptionParser& op) {
+ if (stopAt >= 0) {
+ jit::Simulator::StopSimAt = stopAt;
+ }
++#elif defined(JS_SIMULATOR_PPC64)
++ if (op.getBoolOption("ppc64-sim-icache-checks")) {
++ jit::SimulatorProcess::ICacheCheckingDisableCount = 0;
++ }
++
++ int32_t stopAt = op.getIntOption("ppc64-sim-stop-at");
++ if (stopAt >= 0) {
++ jit::Simulator::StopSimAt = stopAt;
++ }
+ #endif
+
+ #ifdef DEBUG
+diff --git a/js/src/shell/jsshell.h b/js/src/shell/jsshell.h
+index e8d47ba6888c..57e2b15f3cdd 100644
+--- a/js/src/shell/jsshell.h
++++ b/js/src/shell/jsshell.h
+@@ -22,7 +22,8 @@
+
+ // Some platform hooks must be implemented for single-step profiling.
+ #if defined(JS_SIMULATOR_ARM) || defined(JS_SIMULATOR_MIPS64) || \
+- defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_LOONG64)
++ defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_LOONG64) || \
++ defined(JS_SIMULATOR_RISCV64) || defined(JS_SIMULATOR_PPC64)
+ # define SINGLESTEP_PROFILING
+ #endif
+
+diff --git a/js/src/tests/shell/os.js b/js/src/tests/shell/os.js
+index 929982756548..f3d2396b17eb 100644
+--- a/js/src/tests/shell/os.js
++++ b/js/src/tests/shell/os.js
+@@ -20,7 +20,13 @@ var info = os.waitpid(kidpid, true);
+ assertEq(info.hasOwnProperty("pid"), false);
+ assertEq(info.hasOwnProperty("exitStatus"), false);
+
+-os.kill(kidpid);
++// Use SIGKILL (9) instead of the default SIGINT: under heavy parallel test
++// load, SIGINT delivery can race with the child's signal-handler setup and
++// the kernel's reaping path, leading to waitpid below blocking until the
++// `sleep 60` exits normally. SIGKILL is uncatchable and forces immediate
++// termination, so the assertion below ("killed process should not have
++// exitStatus") is reliable.
++os.kill(kidpid, 9);
+
+ info = os.waitpid(kidpid);
+ assertEq(info.hasOwnProperty("pid"), true, "waiting on dead process should return pid");
+diff --git a/js/src/util/Poison.h b/js/src/util/Poison.h
+index 721ecff6149d..de7981aa6f60 100644
+--- a/js/src/util/Poison.h
++++ b/js/src/util/Poison.h
+@@ -92,6 +92,8 @@ const uint8_t JS_SCOPE_DATA_TRAILING_NAMES_PATTERN = 0xCC;
+ #elif defined(JS_CODEGEN_RISCV64)
+ # define JS_SWEPT_CODE_PATTERN \
+ 0x29 // illegal sb instruction, crashes in user mode.
++#elif defined(JS_CODEGEN_PPC64)
++# define JS_SWEPT_CODE_PATTERN 0x00 // illegal instruction (all zeros)
+ #else
+ # error "JS_SWEPT_CODE_PATTERN not defined for this platform"
+ #endif
+diff --git a/js/src/wasm/WasmAnyRef.h b/js/src/wasm/WasmAnyRef.h
+index f81d4c6171b6..7200e9ab0e23 100644
+--- a/js/src/wasm/WasmAnyRef.h
++++ b/js/src/wasm/WasmAnyRef.h
+@@ -209,7 +209,7 @@ class AnyRef {
+ // Truncate the value to the 31-bit value size.
+ uintptr_t wideValue = uintptr_t(value & 0x7FFFFFFF);
+ #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ // Sign extend the value to the native pointer size.
+ uintptr_t wideValue = uintptr_t(int64_t((uint64_t(value) << 33)) >> 33);
+ #elif !defined(JS_64BIT)
+@@ -234,6 +234,11 @@ class AnyRef {
+ # ifdef DEBUG
+ # if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
+ MOZ_ASSERT(value <= UINT32_MAX);
++# elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
++ // On sign-extending platforms, a canonical i32 must be the sign
++ // extension of its low 32 bits.
++ MOZ_ASSERT(value == uintptr_t(int64_t(int32_t(value))));
+ # endif
+ # endif
+ }
+diff --git a/js/src/wasm/WasmBCDefs.h b/js/src/wasm/WasmBCDefs.h
+index b44e91e28693..66a8c9afe8c6 100644
+--- a/js/src/wasm/WasmBCDefs.h
++++ b/js/src/wasm/WasmBCDefs.h
+@@ -44,6 +44,9 @@
+ #if defined(JS_CODEGEN_RISCV64)
+ # include "jit/riscv64/Assembler-riscv64.h"
+ #endif
++#if defined(JS_CODEGEN_PPC64)
++# include "jit/ppc64/Assembler-ppc64.h"
++#endif
+ #include "js/ScalarType.h"
+ #include "util/Memory.h"
+ #include "wasm/WasmCodegenTypes.h"
+@@ -151,6 +154,10 @@ enum class RhsDestOp { True = true };
+ # define RABALDR_PIN_INSTANCE
+ #endif
+
++#ifdef JS_CODEGEN_PPC64
++# define RABALDR_PIN_INSTANCE
++#endif
++
+ // Max number of pushes onto the value stack for any opcode or emitter that
+ // does not push a variable, unbounded amount (anything with multiple
+ // results). This includes also intermediate pushes such as values pushed as
+diff --git a/js/src/wasm/WasmBCMemory.cpp b/js/src/wasm/WasmBCMemory.cpp
+index 835512b09b8c..9137b09f4684 100644
+--- a/js/src/wasm/WasmBCMemory.cpp
++++ b/js/src/wasm/WasmBCMemory.cpp
+@@ -372,7 +372,7 @@ void BaseCompiler::boundsCheckBelow4GBAccess(uint32_t memoryIndex,
+ // Make sure the ptr could be used as an index register.
+ static inline void ToValidIndex(MacroAssembler& masm, RegI32 ptr) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ // When ptr is used as an index, it will be added to a 64-bit register.
+ // So we should explicitly promote ptr to 64-bit. Since now ptr holds a
+ // unsigned 32-bit value, we zero-extend it to 64-bit here.
+@@ -645,6 +645,13 @@ void BaseCompiler::executeLoad(MemoryAccessDesc* access, AccessCheck* check,
+ } else {
+ masm.wasmLoad(*access, memoryBase, ptr, ptr, dest.any());
+ }
++#elif defined(JS_CODEGEN_PPC64)
++ MOZ_ASSERT(temp.isInvalid());
++ if (dest.tag == AnyReg::I64) {
++ masm.wasmLoadI64(*access, memoryBase, ptr, ptr, dest.i64());
++ } else {
++ masm.wasmLoad(*access, memoryBase, ptr, ptr, dest.any());
++ }
+ #else
+ MOZ_CRASH("BaseCompiler platform hook: load");
+ #endif
+@@ -675,10 +682,11 @@ void BaseCompiler::load(MemoryAccessDesc* access, AccessCheck* check,
+ // generated is the same for the 64-bit and the 32-bit case.
+ return executeLoad(access, check, instance, memoryBase, RegI32(ptr.reg), dest,
+ maybeFromI64(temp));
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
+- // On mips64 and loongarch64, the 'prepareMemoryAccess' function will make
+- // sure that ptr holds a valid 64-bit index value. Thus the code generated in
+- // 'executeLoad' is the same for the 64-bit and the 32-bit case.
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++ defined(JS_CODEGEN_PPC64)
++ // On mips64, loongarch64, and ppc64, the 'prepareMemoryAccess' function will
++ // make sure that ptr holds a valid 64-bit index value. Thus the code
++ // generated in 'executeLoad' is the same for the 64-bit and the 32-bit case.
+ return executeLoad(access, check, instance, memoryBase, RegI32(ptr.reg), dest,
+ maybeFromI64(temp));
+ #elif defined(JS_CODEGEN_RISCV64)
+@@ -788,6 +796,13 @@ void BaseCompiler::executeStore(MemoryAccessDesc* access, AccessCheck* check,
+ } else {
+ masm.wasmStore(*access, src.any(), memoryBase, ptr, ptr);
+ }
++#elif defined(JS_CODEGEN_PPC64)
++ MOZ_ASSERT(temp.isInvalid());
++ if (access->type() == Scalar::Int64) {
++ masm.wasmStoreI64(*access, src.i64(), memoryBase, ptr, ptr);
++ } else {
++ masm.wasmStore(*access, src.any(), memoryBase, ptr, ptr);
++ }
+ #else
+ MOZ_CRASH("BaseCompiler platform hook: store");
+ #endif
+@@ -812,7 +827,7 @@ void BaseCompiler::store(MemoryAccessDesc* access, AccessCheck* check,
+ maybeFromI64(temp));
+ #elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
+ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ return executeStore(access, check, instance, memoryBase, RegI32(ptr.reg), src,
+ maybeFromI64(temp));
+ #else
+@@ -1295,7 +1310,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rv, const Temps& temps) {
+ bc->freeI32(temps.t0);
+ }
+
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++ defined(JS_CODEGEN_PPC64)
+
+ struct Temps {
+ RegI32 t0, t1, t2;
+@@ -1504,7 +1520,7 @@ static void Deallocate(BaseCompiler* bc, AtomicOp op, RegI64 rv, RegI64 temp) {
+ }
+
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64)
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
+
+ static void PopAndAllocate(BaseCompiler* bc, AtomicOp op, RegI64* rd,
+ RegI64* rv, RegI64* temp) {
+@@ -1678,7 +1694,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rv, const Temps&) {
+ bc->freeI32(rv);
+ }
+
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++ defined(JS_CODEGEN_PPC64)
+
+ struct Temps {
+ RegI32 t0, t1, t2;
+@@ -1844,7 +1861,7 @@ static void Deallocate(BaseCompiler* bc, RegI64 rd, RegI64 rv) {
+ }
+
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64)
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
+
+ static void PopAndAllocate(BaseCompiler* bc, RegI64* rd, RegI64* rv) {
+ *rv = bc->popI64();
+@@ -2017,7 +2034,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rexpect, RegI32 rnew,
+ bc->freeI32(rexpect);
+ }
+
+-#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
++#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
++ defined(JS_CODEGEN_PPC64)
+
+ struct Temps {
+ RegI32 t0, t1, t2;
+@@ -2287,7 +2305,7 @@ static void Deallocate(BaseCompiler* bc, RegI64 rexpect, RegI64 rnew) {
+ }
+
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64)
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
+
+ template <typename RegAddressType>
+ static void PopAndAllocate(BaseCompiler* bc, RegI64* rexpect, RegI64* rnew,
+@@ -2885,6 +2903,11 @@ void BaseCompiler::loadExtend(MemoryAccessDesc* access, Scalar::Type viewType) {
+ RegI64 rs = popI64();
+ RegV128 rd = needV128();
+ masm.moveGPR64ToDouble(rs, rd);
++#ifdef JS_CODEGEN_PPC64
++ // mtvsrd places value in BE dw0 (= LE dw1). widenLow* operates on LE dw0.
++ // Swap dwords to move loaded data to the correct half.
++ masm.as_xxpermdi(rd, rd, rd, 2);
++#endif
+ switch (viewType) {
+ case Scalar::Int8:
+ masm.widenLowInt8x16(rd, rd);
+diff --git a/js/src/wasm/WasmBCRegDefs.h b/js/src/wasm/WasmBCRegDefs.h
+index bb84f0863de2..fd37bd464f39 100644
+--- a/js/src/wasm/WasmBCRegDefs.h
++++ b/js/src/wasm/WasmBCRegDefs.h
+@@ -118,6 +118,13 @@ static constexpr Register RabaldrScratchI32 = CallTempReg2;
+ static constexpr Register RabaldrScratchI32 = CallTempReg2;
+ #endif
+
++#ifdef JS_CODEGEN_PPC64
++# define RABALDR_SCRATCH_I32
++// Use r25 (callee-saved, non-arg, not used by any wasm infrastructure)
++// instead of CallTempReg2 (r10) which is IntArgReg7.
++static constexpr Register RabaldrScratchI32 = r25;
++#endif
++
+ #ifdef RABALDR_SCRATCH_F32_ALIASES_F64
+ # if !defined(RABALDR_SCRATCH_F32) || !defined(RABALDR_SCRATCH_F64)
+ # error "Bad configuration"
+@@ -386,8 +393,9 @@ struct SpecificRegs {
+
+ SpecificRegs() : abiReturnRegI64(ReturnReg64) {}
+ };
+-#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++ defined(JS_CODEGEN_PPC64)
+ struct SpecificRegs {
+ // Required by gcc.
+ SpecificRegs() {}
+diff --git a/js/src/wasm/WasmBaselineCompile.cpp b/js/src/wasm/WasmBaselineCompile.cpp
+index 2af7ad7f583b..c57180dd362b 100644
+--- a/js/src/wasm/WasmBaselineCompile.cpp
++++ b/js/src/wasm/WasmBaselineCompile.cpp
+@@ -376,11 +376,15 @@ void BaseCompiler::tableSwitch(Label* theTable, RegI32 switchValue,
+ masm.ma_ldr(DTRAddr(scratch, DtrRegImmShift(switchValue, LSL, 2)), pc, Offset,
+ Assembler::Always);
+ #elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ ScratchI32 scratch(*this);
+ CodeLabel tableCl;
+
++# if defined(JS_CODEGEN_PPC64)
++ masm.mov(&tableCl, scratch);
++# else
+ masm.ma_li(scratch, &tableCl);
++# endif
+
+ tableCl.target()->bind(theTable->offset());
+ masm.addCodeLabel(tableCl);
+@@ -898,7 +902,7 @@ void BaseCompiler::insertBreakablePoint(CallSiteKind kind) {
+ masm.append(CallSiteDesc(iter_.lastOpcodeOffset(), kind),
+ CodeOffset(masm.currentOffset()));
+ #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ ScratchPtr scratch(*this);
+ Label L;
+ masm.loadPtr(Address(InstanceReg, Instance::offsetOfDebugStub()), scratch);
+@@ -972,7 +976,7 @@ void BaseCompiler::insertPerFunctionDebugStub() {
+ masm.ma_bx(lr, Assembler::Zero);
+ }
+ #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ {
+ ScratchPtr scratch(*this);
+
+@@ -1403,7 +1407,7 @@ void BaseCompiler::popStackResults(ABIResultIter& iter, StackHeight stackBase) {
+ switch (v.kind()) {
+ case Stk::ConstI32:
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ fr.storeImmediatePtrToStack(v.i32val_, resultHeight, temp);
+ #else
+ fr.storeImmediatePtrToStack(uint32_t(v.i32val_), resultHeight, temp);
+@@ -1723,6 +1727,13 @@ void BaseCompiler::passArg(ValType type, const Stk& arg, FunctionCall* call) {
+ argLoc.offsetFromArgBase()));
+ } else {
+ loadI32(arg, RegI32(argLoc.gpr()));
++#ifdef JS_CODEGEN_PPC64
++ // addi can sign-extend, which yields wrong values when the C++
++ // callee expects a uint32_t. Clear the upper 32 bits.
++ if (call->abiKind == ABIKind::System) {
++ masm.as_rldicl(argLoc.gpr(), argLoc.gpr(), 0, 32);
++ }
++#endif
+ }
+ break;
+ }
+@@ -2372,9 +2383,10 @@ void BaseCompiler::finishTryNote(size_t tryNoteIndex) {
+ RegI32 BaseCompiler::needRotate64Temp() {
+ #if defined(JS_CODEGEN_X86)
+ return needI32();
+-#elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) || \
+- defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) || \
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++ defined(JS_CODEGEN_PPC64)
+ return RegI32::Invalid();
+ #else
+ MOZ_CRASH("BaseCompiler platform hook: needRotate64Temp");
+@@ -2433,6 +2445,8 @@ void BaseCompiler::popAndAllocateForMulI64(RegI64* r0, RegI64* r1,
+ pop2xI64(r0, r1);
+ #elif defined(JS_CODEGEN_RISCV64)
+ pop2xI64(r0, r1);
++#elif defined(JS_CODEGEN_PPC64)
++ pop2xI64(r0, r1);
+ #else
+ MOZ_CRASH("BaseCompiler porting interface: popAndAllocateForMulI64");
+ #endif
+@@ -2866,6 +2880,9 @@ static RegI32 PopcntTemp(BaseCompiler& bc) {
+ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+ defined(JS_CODEGEN_RISCV64)
+ return bc.needI32();
++#elif defined(JS_CODEGEN_PPC64)
++ // PPC64 has native popcntd/popcntw; no temp register needed.
++ return RegI32::Invalid();
+ #else
+ MOZ_CRASH("BaseCompiler platform hook: PopcntTemp");
+ #endif
+@@ -9362,6 +9379,11 @@ static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+ RegV128 temp1, RegV128 temp2) {
+ masm.mulInt64x2(rsd, rs, rsd, temp1, temp2);
+ }
++# elif defined(JS_CODEGEN_PPC64)
++static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
++ RegV128 temp1, RegV128 temp2) {
++ masm.mulInt64x2(rsd, rs, rsd, temp1, temp2);
++}
+ # endif
+
+ static void MulF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+@@ -9376,7 +9398,8 @@ static void DivF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+ masm.divFloat64x2(rsd, rs, rsd);
+ }
+
+-# if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
++# if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
++ defined(JS_CODEGEN_PPC64)
+ static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+ RegV128 temp1, RegV128 temp2) {
+ masm.minFloat32x4(rsd, rs, rsd, temp1, temp2);
+@@ -9397,6 +9420,22 @@ static void MaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+ masm.maxFloat64x2(rsd, rs, rsd, temp1, temp2);
+ }
+
++# if defined(JS_CODEGEN_PPC64)
++// PPC64: use non-RhsDestOp convention (first=rhs, second=lhsDest),
++// matching the pseudoMin/Max function signature.
++static void PMinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++ masm.pseudoMinFloat32x4(rs, rsd);
++}
++static void PMinF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++ masm.pseudoMinFloat64x2(rs, rsd);
++}
++static void PMaxF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++ masm.pseudoMaxFloat32x4(rs, rsd);
++}
++static void PMaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
++ masm.pseudoMaxFloat64x2(rs, rsd);
++}
++# else
+ static void PMinF32x4(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
+ RhsDestOp) {
+ masm.pseudoMinFloat32x4(rsd, rs);
+@@ -9416,6 +9455,7 @@ static void PMaxF64x2(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
+ RhsDestOp) {
+ masm.pseudoMaxFloat64x2(rsd, rs);
+ }
++# endif
+ # elif defined(JS_CODEGEN_ARM64)
+ static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+ masm.minFloat32x4(rs, rsd);
+@@ -9806,6 +9846,68 @@ static void ShiftRightI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
+ masm.rightShiftInt64x2(rsd, temp, rsd);
+ }
+
++static void ShiftRightUI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I64x2ShrU, rs, temp);
++ masm.unsignedRightShiftInt64x2(rsd, temp, rsd);
++}
++# elif defined(JS_CODEGEN_PPC64)
++// PPC64: same as ARM64 pattern (shift amount in GPR, result in vector reg)
++static void ShiftLeftI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I8x16Shl, rs, temp);
++ masm.leftShiftInt8x16(rsd, temp, rsd);
++}
++static void ShiftLeftI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I16x8Shl, rs, temp);
++ masm.leftShiftInt16x8(rsd, temp, rsd);
++}
++static void ShiftLeftI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I32x4Shl, rs, temp);
++ masm.leftShiftInt32x4(rsd, temp, rsd);
++}
++static void ShiftLeftI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I64x2Shl, rs, temp);
++ masm.leftShiftInt64x2(rsd, temp, rsd);
++}
++static void ShiftRightI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I8x16ShrS, rs, temp);
++ masm.rightShiftInt8x16(rsd, temp, rsd);
++}
++static void ShiftRightUI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I8x16ShrU, rs, temp);
++ masm.unsignedRightShiftInt8x16(rsd, temp, rsd);
++}
++static void ShiftRightI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I16x8ShrS, rs, temp);
++ masm.rightShiftInt16x8(rsd, temp, rsd);
++}
++static void ShiftRightUI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I16x8ShrU, rs, temp);
++ masm.unsignedRightShiftInt16x8(rsd, temp, rsd);
++}
++static void ShiftRightI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I32x4ShrS, rs, temp);
++ masm.rightShiftInt32x4(rsd, temp, rsd);
++}
++static void ShiftRightUI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I32x4ShrU, rs, temp);
++ masm.unsignedRightShiftInt32x4(rsd, temp, rsd);
++}
++static void ShiftRightI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
++ RegI32 temp) {
++ ShiftOpMask(masm, SimdOp::I64x2ShrS, rs, temp);
++ masm.rightShiftInt64x2(rsd, temp, rsd);
++}
+ static void ShiftRightUI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
+ RegI32 temp) {
+ ShiftOpMask(masm, SimdOp::I64x2ShrU, rs, temp);
+@@ -10107,6 +10209,23 @@ static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd,
+ masm.bitmaskInt32x4(rs, rd, temp);
+ }
+
++static void BitmaskI64x2(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++ RegV128 temp) {
++ masm.bitmaskInt64x2(rs, rd, temp);
++}
++# elif defined(JS_CODEGEN_PPC64)
++static void BitmaskI8x16(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++ RegV128 temp) {
++ masm.bitmaskInt8x16(rs, rd, temp);
++}
++static void BitmaskI16x8(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++ RegV128 temp) {
++ masm.bitmaskInt16x8(rs, rd, temp);
++}
++static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd,
++ RegV128 temp) {
++ masm.bitmaskInt32x4(rs, rd, temp);
++}
+ static void BitmaskI64x2(MacroAssembler& masm, RegV128 rs, RegI32 rd,
+ RegV128 temp) {
+ masm.bitmaskInt64x2(rs, rd, temp);
+@@ -10182,6 +10301,13 @@ static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
+ masm.bitwiseSelectSimd128(lhsDest, rhs, temp);
+ masm.moveSimd128(temp, lhsDest);
+ }
++# elif defined(JS_CODEGEN_PPC64)
++static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
++ RegV128 lhsDest, RegV128 temp) {
++ masm.moveSimd128(control, temp);
++ masm.bitwiseSelectSimd128(lhsDest, rhs, temp);
++ masm.moveSimd128(temp, lhsDest);
++}
+ # endif
+
+ # ifdef ENABLE_WASM_RELAXED_SIMD
+@@ -10257,7 +10383,7 @@ void BaseCompiler::emitDotI8x16I7x16AddS() {
+ RegV128 rsd = popV128();
+ RegV128 rs0, rs1;
+ pop2xV128(&rs0, &rs1);
+-# if defined(JS_CODEGEN_ARM64)
++# if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+ RegV128 temp = needV128();
+ masm.dotInt8x16Int7x16ThenAdd(rs0, rs1, rsd, temp);
+ freeV128(temp);
+@@ -10463,7 +10589,7 @@ bool BaseCompiler::emitVectorLaneSelect() {
+ freeV128(lhs);
+ freeV128(mask);
+ pushV128(rhsDest);
+-# elif defined(JS_CODEGEN_ARM64)
++# elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
+ RegV128 maskDest = popV128();
+ RegV128 rhs = popV128();
+ RegV128 lhs = popV128();
+@@ -12628,7 +12754,7 @@ bool js::wasm::BaselinePlatformSupport() {
+ #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
+ defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
+ defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ return true;
+ #else
+ return false;
+diff --git a/js/src/wasm/WasmCodegenConstants.h b/js/src/wasm/WasmCodegenConstants.h
+index 9c10d307ae6f..e25332b5464e 100644
+--- a/js/src/wasm/WasmCodegenConstants.h
++++ b/js/src/wasm/WasmCodegenConstants.h
+@@ -43,7 +43,8 @@ static const unsigned InterpFailInstanceReg = 0xbad;
+ // The following thresholds were derived from a microbenchmark. If we begin to
+ // ship this optimization for more platforms, we will need to extend this list.
+
+-#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
++#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
++ defined(JS_CODEGEN_PPC64)
+ static const uint32_t MaxInlineMemoryCopyLength = 64;
+ static const uint32_t MaxInlineMemoryFillLength = 64;
+ #elif defined(JS_CODEGEN_X86)
+diff --git a/js/src/wasm/WasmCodegenTypes.cpp b/js/src/wasm/WasmCodegenTypes.cpp
+index 8b9f32639ea3..e906c4afecc3 100644
+--- a/js/src/wasm/WasmCodegenTypes.cpp
++++ b/js/src/wasm/WasmCodegenTypes.cpp
+@@ -144,14 +144,15 @@ void TrapSitesForKind::checkInvariants(const uint8_t* codeBase) const {
+ last = pcOffset;
+ }
+
+-# if (defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
+- defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM) || \
+- defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64))
++# if (defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM) || \
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_PPC64))
+ // Check that each trapsite is associated with a plausible instruction. The
+ // required instruction kind depends on the trapsite kind.
+ //
+- // NOTE: currently enabled on x86_{32,64}, arm{32,64}, loongson64 and mips64.
+- // Ideally it should be extended to riscv64 too.
++ // NOTE: currently enabled on x86_{32,64}, arm{32,64}, loongson64, mips64,
++ // and ppc64. Ideally it should be extended to riscv64 too.
+ //
+ for (uint32_t i = 0; i < length(); i++) {
+ uint32_t pcOffset = pcOffsets_[i];
+diff --git a/js/src/wasm/WasmCompile.cpp b/js/src/wasm/WasmCompile.cpp
+index 051c60ebaa55..89447aa668ff 100644
+--- a/js/src/wasm/WasmCompile.cpp
++++ b/js/src/wasm/WasmCompile.cpp
+@@ -71,8 +71,9 @@ uint32_t wasm::ObservedCPUFeatures() {
+ ARM64 = 0x6,
+ LOONG64 = 0x7,
+ RISCV64 = 0x8,
++ PPC64 = 0x9,
+
+- LAST = RISCV64,
++ LAST = PPC64,
+ ARCH_BITS = 4
+ };
+
+@@ -101,6 +102,9 @@ uint32_t wasm::ObservedCPUFeatures() {
+ #elif defined(JS_CODEGEN_RISCV64)
+ MOZ_ASSERT(jit::GetRISCV64Flags() <= (UINT32_MAX >> ARCH_BITS));
+ return RISCV64 | (jit::GetRISCV64Flags() << ARCH_BITS);
++#elif defined(JS_CODEGEN_PPC64)
++ MOZ_ASSERT(jit::GetPPC64Flags() <= (UINT32_MAX >> ARCH_BITS));
++ return PPC64 | (jit::GetPPC64Flags() << ARCH_BITS);
+ #elif defined(JS_CODEGEN_NONE) || defined(JS_CODEGEN_WASM32)
+ return 0;
+ #else
+diff --git a/js/src/wasm/WasmFrameIter.cpp b/js/src/wasm/WasmFrameIter.cpp
+index b3b264bc625a..b540acf9a05d 100644
+--- a/js/src/wasm/WasmFrameIter.cpp
++++ b/js/src/wasm/WasmFrameIter.cpp
+@@ -622,6 +622,19 @@ static const unsigned PushedFP = 16;
+ static const unsigned SetFP = 20;
+ static const unsigned PoppedFP = 4;
+ static const unsigned PoppedFPJitEntry = 8;
++#elif defined(JS_CODEGEN_PPC64)
++// pushReturnAddress = mflr(4) + stdu(4) = 8 bytes.
++// push(FP) = stdu(4) = 4 bytes (PPC64 stdu is a single DS-form instruction).
++// moveStackPtrTo = mr(4) = 4 bytes.
++static const unsigned PushedRetAddr = 8;
++static const unsigned PushedFP = 12;
++static const unsigned SetFP = 16;
++// Callable + jit-entry epilogues between poppedFP and *ret are:
++// mtlr r0; addi sp, sp, 16 (two 4-byte instructions — 8 bytes).
++// mtlr must come before addi so LR holds the caller's RA throughout the
++// post-poppedFP window (single-step profiling fires every instruction).
++static const unsigned PoppedFP = 8;
++static const unsigned PoppedFPJitEntry = 8;
+ #elif defined(JS_CODEGEN_NONE) || defined(JS_CODEGEN_WASM32)
+ // Synthetic values to satisfy asserts and avoid compiler warnings.
+ static const unsigned PushedRetAddr = 0;
+@@ -710,6 +723,17 @@ static void GenerateCallablePrologue(MacroAssembler& masm, uint32_t* entry) {
+ masm.moveStackPtrTo(FramePointer);
+ MOZ_ASSERT_IF(!masm.oom(), SetFP == masm.currentOffset() - *entry);
+ }
++#elif defined(JS_CODEGEN_PPC64)
++ {
++ *entry = masm.currentOffset();
++
++ masm.pushReturnAddress();
++ MOZ_ASSERT_IF(!masm.oom(), PushedRetAddr == masm.currentOffset() - *entry);
++ masm.push(FramePointer);
++ MOZ_ASSERT_IF(!masm.oom(), PushedFP == masm.currentOffset() - *entry);
++ masm.moveStackPtrTo(FramePointer);
++ MOZ_ASSERT_IF(!masm.oom(), SetFP == masm.currentOffset() - *entry);
++ }
+ #elif defined(JS_CODEGEN_ARM64)
+ {
+ // We do not use the PseudoStackPointer. However, we may be called in a
+@@ -803,6 +827,38 @@ static void GenerateCallableEpilogue(MacroAssembler& masm, unsigned framePushed,
+ masm.jalr(zero, ra, 0);
+ masm.nop();
+ }
++#elif defined(JS_CODEGEN_PPC64)
++ // Load RA and FP from the Frame while it's still on the stack.
++ // Using r0 (js::jit::r0) for RA is safe: it's volatile, used as
++ // RT (not base), and we're in an epilogue where it's not live.
++ masm.loadPtr(Address(StackPointer, Frame::returnAddressOffset()),
++ js::jit::r0);
++ masm.loadPtr(Address(StackPointer, Frame::callerFPOffset()), FramePointer);
++
++ // Fence the pool BEFORE capturing poppedFP. PoppedFP is a static 8
++ // (mtlr + addi); enterNoPool itself can emit insertNopFill() and a
++ // preemptive finishPool() at its top edge, so any leading insertions
++ // must land before poppedFP — not between poppedFP and *ret. A pool
++ // flush inside the 2-insn window would otherwise extend *ret - poppedFP
++ // and trip the post-condition assertion below. P9 routes FP constants
++ // through the pool so flushes are more frequent than on P8 (the
++ // assertion was historically silent on P8 but reproducible on P9 dbgopt).
++ masm.enterNoPool(2);
++ poppedFP = masm.currentOffset();
++
++ // Move RA into LR BEFORE popping the Frame. If the order were addi/mtlr,
++ // single-step profiling firing at the mtlr instruction would see: sp
++ // already moved (so saved RA at sp[8] is gone), addi already executed,
++ // and LR still holding the address right after the function's last `bl`
++ // (i.e. inside this function, not the caller's RA). With mtlr first,
++ // the entire post-poppedFP window has LR == caller's RA available
++ // either via sp[8] (pre-addi) or registers.lr (post-mtlr).
++ masm.xs_mtlr(js::jit::r0);
++ masm.addToStackPtr(Imm32(sizeof(Frame)));
++ *ret = masm.currentOffset();
++ masm.leaveNoPool();
++ masm.as_blr();
++
+ #elif defined(JS_CODEGEN_ARM64)
+
+ // See comment at equivalent place in |GenerateCallablePrologue| above.
+@@ -1483,6 +1539,9 @@ void wasm::GenerateJitEntryPrologue(MacroAssembler& masm,
+ BlockTrampolinePoolScope block_trampoline_pool(&masm, 10);
+ offsets->begin = masm.currentOffset();
+ masm.push(ra);
++#elif defined(JS_CODEGEN_PPC64)
++ offsets->begin = masm.currentOffset();
++ masm.pushReturnAddress();
+ #elif defined(JS_CODEGEN_ARM64)
+ {
+ AutoForbidPoolsAndNops afp(&masm,
+@@ -1536,6 +1595,20 @@ void wasm::GenerateJitEntryEpilogue(MacroAssembler& masm,
+ masm.Ret(ARMRegister(lr, 64));
+ masm.setFramePushed(0);
+ }
++#elif defined(JS_CODEGEN_PPC64)
++ // Load RA and FP from the frame while it's still on the stack, then
++ // restore LR, pop the frame, and return. mtlr must precede addi so LR
++ // holds the caller's RA across the whole post-poppedFP window (see
++ // GenerateCallableEpilogue for the matching rationale).
++ masm.loadPtr(Address(StackPointer, Frame::returnAddressOffset()),
++ js::jit::r0);
++ masm.loadPtr(Address(StackPointer, Frame::callerFPOffset()), FramePointer);
++ poppedFP = masm.currentOffset();
++
++ masm.xs_mtlr(js::jit::r0);
++ masm.addToStackPtr(Imm32(sizeof(Frame)));
++ offsets->ret = masm.currentOffset();
++ masm.as_blr();
+ #else
+ // Forbid pools for the same reason as described in GenerateCallablePrologue.
+ # if defined(JS_CODEGEN_ARM)
+@@ -1905,6 +1978,22 @@ bool js::wasm::StartUnwinding(const RegisterState& registers,
+ fixedFP = fp;
+ AssertMatchesCallSite(fixedPC, fixedFP);
+ } else
++#elif defined(JS_CODEGEN_PPC64)
++ if (codeRange->isThunk()) {
++ // The FarJumpIsland sequence temporary scrambles the link register.
++ fixedPC = pc;
++ fixedFP = fp;
++ *unwoundCaller = false;
++ AssertMatchesCallSite(
++ Frame::fromUntaggedWasmExitFP(fp)->returnAddress(),
++ Frame::fromUntaggedWasmExitFP(fp)->rawCaller());
++ } else if (offsetFromEntry < PushedFP) {
++ // On PPC64 the return address is in LR (registers.lr) until
++ // pushReturnAddress() saves it to the stack.
++ fixedPC = (uint8_t*)registers.lr;
++ fixedFP = fp;
++ AssertMatchesCallSite(fixedPC, fixedFP);
++ } else
+ #elif defined(JS_CODEGEN_ARM64)
+ if (offsetFromEntry < SetFP || codeRange->isThunk()) {
+ // On ARM64 we rely on register state instead of state saved on
+@@ -1956,6 +2045,35 @@ bool js::wasm::StartUnwinding(const RegisterState& registers,
+ fixedPC = Frame::fromUntaggedWasmExitFP(sp)->returnAddress();
+ fixedFP = fp;
+ AssertMatchesCallSite(fixedPC, fixedFP);
++#elif defined(JS_CODEGEN_PPC64)
++ } else if (offsetInCode >= codeRange->ret() - PoppedFP &&
++ offsetInCode < codeRange->ret()) {
++ // PPC64 epilogue (RA loaded into r0, FP restored, RA not yet
++ // moved to LR, SP not yet adjusted):
++ // ld r0, 8(sp) ; restore caller's RA into r0
++ // ld FP, 0(sp) ; restore caller's FP
++ // <-- poppedFP -->
++ // mtlr r0 ; LR := caller's RA
++ // addi sp, sp, 16 ; pop the Frame
++ // <-- ret -->
++ // blr
++ // In the [poppedFP, ret) window the addi has not run, so *sp
++ // is still the saved Frame and sp[8] is the caller's RA.
++ // (registers.lr would also be correct after mtlr executes, but
++ // sp[8] is valid throughout this window — including before mtlr —
++ // so we read it consistently.)
++ MOZ_ASSERT(*sp == fp);
++ fixedPC = Frame::fromUntaggedWasmExitFP(sp)->returnAddress();
++ fixedFP = fp;
++ AssertMatchesCallSite(fixedPC, fixedFP);
++ } else if (offsetInCode == codeRange->ret()) {
++ // PPC64 epilogue, at the blr: addi has run, so SP is the
++ // caller's and *sp is unrelated memory. mtlr ran earlier in
++ // the [poppedFP, ret) window, so LR holds the caller's RA.
++ // fp holds the restored caller's FP.
++ fixedPC = (uint8_t*)registers.lr;
++ fixedFP = fp;
++ AssertMatchesCallSite(fixedPC, fixedFP);
+ #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_LOONG64)
+ // The stack pointer does not move until all values have
+ // been restored so several cases can be coalesced here.
+diff --git a/js/src/wasm/WasmGC.cpp b/js/src/wasm/WasmGC.cpp
+index e59cd4f5aba0..21cd01fd1c5e 100644
+--- a/js/src/wasm/WasmGC.cpp
++++ b/js/src/wasm/WasmGC.cpp
+@@ -444,6 +444,14 @@ bool wasm::IsPlausibleStackMapKey(const uint8_t* nextPC) {
+ insn[-1] == 0x00000013 /* addi zero, zero, 0 */) || // jal; nop
+ (insn[-1] == 0x00100073 &&
+ (insn[-2] & kITypeMask) == RO_CSRRWI))); // wasm trap
++# elif defined(JS_CODEGEN_PPC64)
++ const uint32_t* insn = reinterpret_cast<const uint32_t*>(nextPC);
++ MOZ_ASSERT((uintptr_t(insn) & 3) == 0);
++ // xs_trap() = tw 31,r0,r0 (PPC_trap); bctrl = PPC_bctr|1; bl = I-form
++ // opcode 18 (PPC_b) with LK=1, AA=0, checked via 0xFC000003 mask.
++ return insn[-1] == uint32_t(PPC_trap) ||
++ insn[-1] == (uint32_t(PPC_bctr) | 1u) ||
++ (insn[-1] & 0xFC000003u) == (uint32_t(PPC_b) | 1u);
+ # else
+ MOZ_CRASH("IsValidStackMapKey: requires implementation on this platform");
+ # endif
+diff --git a/js/src/wasm/WasmGenerator.cpp b/js/src/wasm/WasmGenerator.cpp
+index 2dafac73e96a..07ffe150fc57 100644
+--- a/js/src/wasm/WasmGenerator.cpp
++++ b/js/src/wasm/WasmGenerator.cpp
+@@ -930,7 +930,23 @@ bool ModuleGenerator::finishCodeBlock(CodeBlockResult* result) {
+ callSiteTargets_.clear();
+ callFarJumps_.clear();
+
+- // None of the linking or far-jump operations should emit masm metadata.
++ // None of the linking or far-jump operations should emit masm metadata,
++ // except on PPC64 where patchFarJump uses addLongJump to create CodeLabels
++ // for absolute-address far jumps. Drain those into linkData_ here.
++#ifdef JS_CODEGEN_PPC64
++ for (const jit::CodeLabel& codeLabel : masm_->codeLabels()) {
++ LinkData::InternalLink link;
++ link.patchAtOffset = codeLabel.patchAt().offset();
++ link.targetOffset = codeLabel.target().offset();
++# ifdef JS_CODELABEL_LINKMODE
++ link.mode = codeLabel.linkMode();
++# endif
++ if (!linkData_->internalLinks.append(link)) {
++ return false;
++ }
++ }
++ masm_->codeLabels().clear();
++#endif
+
+ MOZ_ASSERT(masm_->inliningContext().empty());
+ MOZ_ASSERT(masm_->callSites().empty());
+diff --git a/js/src/wasm/WasmIonCompile.cpp b/js/src/wasm/WasmIonCompile.cpp
+index 9c79b9cf0704..0d0e661770af 100644
+--- a/js/src/wasm/WasmIonCompile.cpp
++++ b/js/src/wasm/WasmIonCompile.cpp
+@@ -11602,7 +11602,7 @@ bool js::wasm::IonPlatformSupport() {
+ #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
+ defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || \
+ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ return true;
+ #else
+ return false;
+diff --git a/js/src/wasm/WasmMemory.cpp b/js/src/wasm/WasmMemory.cpp
+index 0e3e6d3509ad..feee9f6ea1c9 100644
+--- a/js/src/wasm/WasmMemory.cpp
++++ b/js/src/wasm/WasmMemory.cpp
+@@ -288,9 +288,9 @@ static_assert(MaxMemoryAccessSize <= HugeUnalignedGuardPage,
+ static_assert(HugeOffsetGuardLimit < UINT32_MAX,
+ "checking for overflow against OffsetGuardLimit is enough.");
+
+-// We have only tested huge memory on x64, arm64 and riscv64.
++// We have only tested huge memory on x64, arm64, riscv64 and ppc64.
+ # if !(defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
+- defined(JS_CODEGEN_RISCV64))
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64))
+ # error "Not an expected configuration"
+ # endif
+
+diff --git a/js/src/wasm/WasmSignalHandlers.cpp b/js/src/wasm/WasmSignalHandlers.cpp
+index cc8bc2755745..84d3c4ec164d 100644
+--- a/js/src/wasm/WasmSignalHandlers.cpp
++++ b/js/src/wasm/WasmSignalHandlers.cpp
+@@ -111,7 +111,9 @@ using namespace js::wasm;
+ # if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \
+ defined(__PPC64LE__)
+ # define R01_sig(p) ((p)->sc_frame.fixreg[1])
++# define R31_sig(p) ((p)->sc_frame.fixreg[31])
+ # define R32_sig(p) ((p)->sc_frame.srr0)
++# define R36_sig(p) ((p)->sc_frame.lr)
+ # endif
+ # elif defined(__linux__) || defined(__sun)
+ # if defined(__linux__)
+@@ -157,7 +159,9 @@ using namespace js::wasm;
+ # if defined(__linux__) && (defined(__ppc64__) || defined(__PPC64__) || \
+ defined(__ppc64le__) || defined(__PPC64LE__))
+ # define R01_sig(p) ((p)->uc_mcontext.gp_regs[1])
++# define R31_sig(p) ((p)->uc_mcontext.gp_regs[31])
+ # define R32_sig(p) ((p)->uc_mcontext.gp_regs[32])
++# define R36_sig(p) ((p)->uc_mcontext.gp_regs[36])
+ # endif
+ # if defined(__linux__) && defined(__loongarch__)
+ # define EPC_sig(p) ((p)->uc_mcontext.__pc)
+@@ -200,7 +204,9 @@ using namespace js::wasm;
+ # if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \
+ defined(__PPC64LE__)
+ # define R01_sig(p) ((p)->uc_mcontext.__gregs[_REG_R1])
++# define R31_sig(p) ((p)->uc_mcontext.__gregs[_REG_R31])
+ # define R32_sig(p) ((p)->uc_mcontext.__gregs[_REG_PC])
++# define R36_sig(p) ((p)->uc_mcontext.__gregs[_REG_LR])
+ # endif
+ # elif defined(__DragonFly__) || defined(__FreeBSD__) || \
+ defined(__FreeBSD_kernel__)
+@@ -234,7 +240,9 @@ using namespace js::wasm;
+ # if defined(__FreeBSD__) && (defined(__ppc64__) || defined(__PPC64__) || \
+ defined(__ppc64le__) || defined(__PPC64LE__))
+ # define R01_sig(p) ((p)->uc_mcontext.mc_gpr[1])
++# define R31_sig(p) ((p)->uc_mcontext.mc_gpr[31])
+ # define R32_sig(p) ((p)->uc_mcontext.mc_srr0)
++# define R36_sig(p) ((p)->uc_mcontext.mc_lr)
+ # endif
+ # elif defined(XP_DARWIN)
+ # define EIP_sig(p) ((p)->thread.uts.ts32.__eip)
+@@ -412,7 +420,8 @@ struct macos_aarch64_context {
+ defined(__PPC64LE__)
+ # define PC_sig(p) R32_sig(p)
+ # define SP_sig(p) R01_sig(p)
+-# define FP_sig(p) R01_sig(p)
++# define FP_sig(p) R31_sig(p)
++# define LR_sig(p) R36_sig(p)
+ # elif defined(__loongarch__)
+ # define PC_sig(p) EPC_sig(p)
+ # define FP_sig(p) RFP_sig(p)
+@@ -458,7 +467,8 @@ static uint8_t* ContextToSP(CONTEXT* context) {
+ }
+
+ # if defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
+- defined(__loongarch__) || defined(__riscv)
++ defined(__loongarch__) || defined(__riscv) || \
++ defined(__ppc64__) || defined(__PPC64__)
+ static uint8_t* ContextToLR(CONTEXT* context) {
+ # ifdef LR_sig
+ return mozilla::BitwiseCast<uint8_t*>(LR_sig(context));
+@@ -475,7 +485,8 @@ static JS::ProfilingFrameIterator::RegisterState ToRegisterState(
+ state.pc = ContextToPC(context);
+ state.sp = ContextToSP(context);
+ # if defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
+- defined(__loongarch__) || defined(__riscv)
++ defined(__loongarch__) || defined(__riscv) || \
++ defined(__ppc64__) || defined(__PPC64__)
+ state.lr = ContextToLR(context);
+ # else
+ state.lr = (void*)UINTPTR_MAX;
+@@ -776,6 +787,9 @@ static void MachExceptionHandlerThread() {
+
+ # if defined(__mips__) || defined(__loongarch__)
+ static const uint32_t kWasmTrapSignal = SIGFPE;
++# elif defined(__ppc64__) || defined(__PPC64__) || \
++ defined(__ppc64le__) || defined(__PPC64LE__)
++static const uint32_t kWasmTrapSignal = SIGTRAP;
+ # else
+ static const uint32_t kWasmTrapSignal = SIGILL;
+ # endif
+diff --git a/js/src/wasm/WasmStacks.cpp b/js/src/wasm/WasmStacks.cpp
+index 71497353c5c1..6514d8b0e2e4 100644
+--- a/js/src/wasm/WasmStacks.cpp
++++ b/js/src/wasm/WasmStacks.cpp
+@@ -426,6 +426,30 @@ static constexpr size_t ContStackMaxJitStackSize = 10 * 1024 * 1024;
+ // or stack snapshots utilities.
+ static constexpr size_t ContStackRedZoneSize = 0x8000;
+
++// Effective red-zone size used when laying out a continuation stack.
++//
++// The jit stack (and therefore the bottom guard page) must start on a page
++// boundary; otherwise gc::ProtectPages trips MOZ_RELEASE_ASSERT(length %
++// pageSize == 0). The red zone sits between the top guard page and the jit
++// stack, so its size has to be a page multiple to keep that start aligned.
++//
++// Rounding the red zone up to a page is correct on every platform and would
++// also cover any configuration whose page size exceeds ContStackRedZoneSize
++// (32K) -- e.g. a 64K-page AArch64 kernel -- but ContStackRedZoneSize is
++// already a multiple of the 4K/16K pages used on the tier-1 platforms, so the
++// round-up is a no-op there today. We deliberately gate it to PPC64 (64K
++// pages, where the round-up is load-bearing) so this patch cannot alter
++// continuation stack layout on any tier-1 platform. Drop the gate if the
++// general case is ever wanted.
++static inline size_t ContStackEffectiveRedZoneSize(
++ [[maybe_unused]] size_t pageSize) {
++#ifdef JS_CODEGEN_PPC64
++ return RoundUp(ContStackRedZoneSize, pageSize);
++#else
++ return ContStackRedZoneSize;
++#endif
++}
++
+ // Number of guard pages at the top and bottom of each continuation stack slot.
+ static constexpr size_t ContStackTopGuardPages = 1;
+ static constexpr size_t ContStackBottomGuardPages = 1;
+@@ -444,8 +468,8 @@ void ContStackSize::compute() {
+ ContStackMinJitStackSize, ContStackMaxJitStackSize),
+ pageSize);
+ headerSize = RoundUp(sizeof(ContStack), pageSize);
+- totalSize = topGuardSize + ContStackRedZoneSize + jitStackSize +
+- bottomGuardSize + headerSize;
++ totalSize = topGuardSize + ContStackEffectiveRedZoneSize(pageSize) +
++ jitStackSize + bottomGuardSize + headerSize;
+
+ // Assert we can't overflow when multiplying our size by capacity. Assume
+ // 32-bit integers to be conservative.
+@@ -467,7 +491,8 @@ void ContStack::init(ContStackArena* arena, uintptr_t allocationBase,
+ uintptr_t topGuardPagePhysicalStart = allocationBase;
+ uintptr_t topGuardPagePhysicalEnd = allocationBase + topGuardPageSize;
+ uintptr_t redZonePhysicalStart = topGuardPagePhysicalEnd;
+- uintptr_t jitStackPhysicalStart = redZonePhysicalStart + ContStackRedZoneSize;
++ uintptr_t jitStackPhysicalStart =
++ redZonePhysicalStart + ContStackEffectiveRedZoneSize(pageSize);
+ uintptr_t jitStackPhysicalEnd = jitStackPhysicalStart + jitStackSize;
+ uintptr_t bottomGuardPagePhysicalStart = jitStackPhysicalEnd;
+ uintptr_t headerPhysicalStart =
+diff --git a/js/src/wasm/WasmStubs.cpp b/js/src/wasm/WasmStubs.cpp
+index 8a98e201a452..8497814fcd37 100644
+--- a/js/src/wasm/WasmStubs.cpp
++++ b/js/src/wasm/WasmStubs.cpp
+@@ -646,8 +646,9 @@ static bool GenerateInterpEntry(MacroAssembler& masm, const FuncExport& fe,
+
+ // Save the return address if it wasn't already saved by the call insn.
+ #ifdef JS_USE_LINK_REGISTER
+-# if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++# if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++ defined(JS_CODEGEN_PPC64)
+ masm.pushReturnAddress();
+ # elif defined(JS_CODEGEN_ARM64)
+ // WasmPush updates framePushed() unlike pushReturnAddress(), but that's
+@@ -2123,9 +2124,10 @@ static bool GenerateImportInterpExit(MacroAssembler& masm, const FuncImport& fi,
+ // The native ABI preserves the instance, heap and global registers since they
+ // are non-volatile.
+ MOZ_ASSERT(NonVolatileRegs.has(InstanceReg));
+-#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) || \
+- defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
+- defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
++#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) || \
++ defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
++ defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
++ defined(JS_CODEGEN_PPC64)
+ MOZ_ASSERT(NonVolatileRegs.has(HeapReg));
+ #endif
+
+@@ -2571,6 +2573,15 @@ bool wasm::GenerateBuiltinThunk(MacroAssembler& masm, ABIFunctionType abiType,
+ Register::FromCode(regId + 1));
+ }
+ }
++#endif
++#ifdef JS_CODEGEN_PPC64
++ // PPC64 32-bit operations do not zero-extend to 64 bits (unlike
++ // x86-64/ARM64/LA64). The ELFv2 ABI requires callers to zero/sign-extend
++ // narrow args. Wasm i32 values may have garbage upper bits in 64-bit
++ // registers, so zero-extend them before calling C++ builtins.
++ if (selfArgs.mirType() == MIRType::Int32) {
++ masm.move32ZeroExtendToPtr(selfArgs->gpr(), selfArgs->gpr());
++ }
+ #endif
+ continue;
+ }
+@@ -2659,6 +2670,28 @@ static const LiveRegisterSet RegsToPreserve(
+ # ifdef ENABLE_WASM_SIMD
+ # error "high lanes of SIMD registers need to be saved too."
+ # endif
++#elif defined(JS_CODEGEN_PPC64)
++// Exclude r0 (ScratchRegister, not allocatable, special addressing semantics),
++// r1 (SP), r2 (TOC pointer, reserved), and r13 (TLS pointer, reserved).
++static const LiveRegisterSet RegsToPreserve(
++ GeneralRegisterSet(Registers::AllMask & ~((uint32_t(1) << Registers::r0) |
++ (uint32_t(1) << Registers::r1) |
++ (uint32_t(1) << Registers::r2) |
++ (uint32_t(1) << Registers::r13))),
++# ifdef ENABLE_WASM_SIMD
++ // Unlike ARM64, where the vector registers alias the doubles, PPC64
++ // doubles live in the FPRs (VSR0-31) while wasm v128 values live in the
++ // VRs (VSR32-63) -- two disjoint physical pools, so both must be
++ // preserved. Saving only the doubles loses the entire live v128 state: a
++ // trap firing while a v128 is live (notably the interrupt-check trap,
++ // which fires constantly in hot loops) resumes with whatever the C++
++ // handler's libc left in the VRs (e.g. glibc's vector memcpy leaves lvsl
++ // alignment-control patterns in low VRs).
++ FloatRegisterSet(FloatRegisters::AllDoubleMask |
++ FloatRegisters::AllSimd128Mask));
++# else
++ FloatRegisterSet(FloatRegisters::AllDoubleMask));
++# endif
+ #elif defined(JS_CODEGEN_ARM64)
+ // We assume that traps do not happen while lr is live. This both ensures that
+ // the size of RegsToPreserve is a multiple of 2 (preserving WasmStackAlignment)
+diff --git a/js/src/wasm/WasmSummarizeInsn.cpp b/js/src/wasm/WasmSummarizeInsn.cpp
+index 7bb4f4b7a725..2ae55a1b1b9e 100644
+--- a/js/src/wasm/WasmSummarizeInsn.cpp
++++ b/js/src/wasm/WasmSummarizeInsn.cpp
+@@ -1731,6 +1731,169 @@ Maybe<TrapMachineInsn> SummarizeTrapInstruction(const uint8_t* insnAddr) {
+ return Nothing();
+ }
+
++// ================================================================== ppc64 ====
++
++# elif defined(JS_CODEGEN_PPC64)
++
++Maybe<TrapMachineInsn> SummarizeTrapInstruction(const uint8_t* insnAddr) {
++ MOZ_ASSERT(0 == (uintptr_t(insnAddr) & 3));
++
++ const uint32_t insn = *(uint32_t*)insnAddr;
++ const uint32_t majorOp = insn >> 26;
++ // X-form secondary opcode: bits 10..1.
++ const uint32_t xo = (insn >> 1) & 0x3FF;
++
++ // PPC_trap = 0x7FE00008 = tw 31,0,0.
++ if (insn == 0x7FE00008) {
++ return Some(TrapMachineInsn::OfficialUD);
++ }
++
++ // D-form / DS-form loads.
++ switch (majorOp) {
++ case 34: // lbz
++ return Some(TrapMachineInsn::Load8);
++ case 40: // lhz
++ case 42: // lha
++ return Some(TrapMachineInsn::Load16);
++ case 32: // lwz
++ return Some(TrapMachineInsn::Load32);
++ case 58: // ld (DS=0) / lwa (DS=2)
++ if ((insn & 3) == 2) {
++ return Some(TrapMachineInsn::Load32); // lwa
++ }
++ return Some(TrapMachineInsn::Load64); // ld
++ case 48: // lfs
++ return Some(TrapMachineInsn::Load32);
++ case 50: // lfd
++ return Some(TrapMachineInsn::Load64);
++ default:
++ break;
++ }
++
++ // D-form / DS-form stores.
++ switch (majorOp) {
++ case 38: // stb
++ return Some(TrapMachineInsn::Store8);
++ case 44: // sth
++ return Some(TrapMachineInsn::Store16);
++ case 36: // stw
++ case 37: // stwu
++ return Some(TrapMachineInsn::Store32);
++ case 52: // stfs
++ return Some(TrapMachineInsn::Store32);
++ case 62: // std (DS=0) / stdu (DS=1)
++ return Some(TrapMachineInsn::Store64);
++ case 54: // stfd
++ case 55: // stfdu
++ return Some(TrapMachineInsn::Store64);
++ default:
++ break;
++ }
++
++ // X-form instructions (major opcode 31).
++ if (majorOp == 31) {
++ switch (xo) {
++ // Indexed loads.
++ case 87: // lbzx
++ return Some(TrapMachineInsn::Load8);
++ case 279: // lhzx
++ case 343: // lhax
++ return Some(TrapMachineInsn::Load16);
++ case 23: // lwzx
++ return Some(TrapMachineInsn::Load32);
++ case 21: // ldx
++ return Some(TrapMachineInsn::Load64);
++ case 535: // lfsx
++ case 855: // lfiwax
++ case 887: // lfiwzx
++ return Some(TrapMachineInsn::Load32);
++ case 599: // lfdx
++ return Some(TrapMachineInsn::Load64);
++ case 790: // lhbrx (byte-reverse halfword)
++ return Some(TrapMachineInsn::Load16);
++ case 534: // lwbrx (byte-reverse word)
++ return Some(TrapMachineInsn::Load32);
++
++ // Indexed stores.
++ case 215: // stbx
++ return Some(TrapMachineInsn::Store8);
++ case 407: // sthx
++ return Some(TrapMachineInsn::Store16);
++ case 151: // stwx
++ return Some(TrapMachineInsn::Store32);
++ case 149: // stdx
++ return Some(TrapMachineInsn::Store64);
++ case 663: // stfsx
++ return Some(TrapMachineInsn::Store32);
++ case 727: // stfdx
++ return Some(TrapMachineInsn::Store64);
++ case 918: // sthbrx (byte-reverse halfword store)
++ return Some(TrapMachineInsn::Store16);
++ case 662: // stwbrx (byte-reverse word store)
++ return Some(TrapMachineInsn::Store32);
++
++ // VSX SIMD indexed load/store (XX1-form, same major opcode 31).
++ case 268: // lxvx (POWER9)
++ case 844: // lxvd2x (POWER8)
++ return Some(TrapMachineInsn::Load128);
++ case 396: // stxvx (POWER9)
++ case 972: // stxvd2x (POWER8)
++ return Some(TrapMachineInsn::Store128);
++
++ // Atomic (load-reserve / store-conditional).
++ case 20: // lwarx
++ case 52: // lbarx (POWER7+)
++ case 84: // ldarx
++ case 116: // lharx (POWER7+)
++ return Some(TrapMachineInsn::Atomic);
++ default:
++ break;
++ }
++ // stwcx. (XO=150, Rc=1), stdcx. (XO=214, Rc=1), stbcx. (XO=694, Rc=1)
++ // and sthcx. (XO=726, Rc=1) have bit 0 set. Note xo above already
++ // discards bit 0, so we need a separate low-11-bit match.
++ const uint32_t xoRc = insn & 0x7FF; // bits 10..0
++ if (xoRc == ((150 << 1) | 1) || xoRc == ((214 << 1) | 1) ||
++ xoRc == ((694 << 1) | 1) || xoRc == ((726 << 1) | 1)) {
++ return Some(TrapMachineInsn::Atomic);
++ }
++ }
++
++ // POWER10 prefixed loads/stores (major opcode 1). The trap-site PC
++ // points at the prefix word; the actual load/store kind is encoded in
++ // the suffix word at insnAddr + 4. The 64-byte-boundary rule
++ // (ensurePrefixedAlignment) guarantees the suffix is in the same block.
++ if (majorOp == 1) {
++ const uint32_t suffix = *(uint32_t*)(insnAddr + 4);
++ const uint32_t suffixOp6 = suffix >> 26; // 6-bit suffix op
++ const uint32_t suffixOp5 = suffix >> 27; // 5-bit suffix op (plxv/pstxv)
++ switch (suffixOp6) {
++ case 57: // pld
++ return Some(TrapMachineInsn::Load64);
++ case 50: // plfd
++ return Some(TrapMachineInsn::Load64);
++ case 48: // plfs
++ return Some(TrapMachineInsn::Load32);
++ case 61: // pstd
++ return Some(TrapMachineInsn::Store64);
++ case 54: // pstfd
++ return Some(TrapMachineInsn::Store64);
++ case 52: // pstfs
++ return Some(TrapMachineInsn::Store32);
++ default:
++ break;
++ }
++ if (suffixOp5 == 25) { // plxv
++ return Some(TrapMachineInsn::Load128);
++ }
++ if (suffixOp5 == 27) { // pstxv
++ return Some(TrapMachineInsn::Store128);
++ }
++ }
++
++ return Nothing();
++}
++
+ // ================================================================== none ====
+
+ # elif defined(JS_CODEGEN_NONE)
+diff --git a/js/src/wasm/WasmValue.cpp b/js/src/wasm/WasmValue.cpp
+index fda0996851e1..45fff24fa582 100644
+--- a/js/src/wasm/WasmValue.cpp
++++ b/js/src/wasm/WasmValue.cpp
+@@ -430,7 +430,7 @@ bool ToWebAssemblyValue_i32(JSContext* cx, HandleValue val, int32_t* loc,
+ bool ok = ToInt32(cx, val, loc);
+ if (ok && mustWrite64) {
+ #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
+- defined(JS_CODEGEN_RISCV64)
++ defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
+ loc[1] = loc[0] >> 31;
+ #else
+ loc[1] = 0;
+diff --git a/mfbt/Assertions.h b/mfbt/Assertions.h
+index a436d019a197..4887af7e7676 100644
+--- a/mfbt/Assertions.h
++++ b/mfbt/Assertions.h
+@@ -282,6 +282,11 @@ static inline void MOZ_CrashSequence(void* aAddress, intptr_t aLine) {
+ "st.d %1,%0,0;\n" // Write the line number to the crashing address
+ : // no output registers
+ : "r"(aAddress), "r"(aLine));
++# elif defined(__powerpc64__)
++ asm volatile(
++ "std %1,0(%0);\n" // Write the line number to the crashing address
++ : // no output registers
++ : "r"(aAddress), "r"(aLine));
+ # else
+ # warning \
+ "Unsupported architecture, replace the code below with assembly suitable to crash the process"
+--
+2.52.0
+
diff --git a/firefox.spec b/firefox.spec
index 06a6900..be8abaf 100644
--- a/firefox.spec
+++ b/firefox.spec
@@ -281,6 +281,11 @@ Patch600: pgo.patch
Patch602: mozilla-1516803.patch
Patch603: firefox-gcc-always-inline.patch
+# ppc64le JIT
+Patch900: 0001-Add-VSX-instructions-for-SKIA.patch
+Patch901: 0002-Add-VSX-instructions-for-libwebp.patch
+Patch902: 0003-Add-PPC64LE-JIT-backend.patch
+
%if %{?system_nss}
BuildRequires: pkgconfig(nspr) >= %{nspr_version}
@@ -601,6 +606,11 @@ cat %{SOURCE49} | sed -e "s|LIBCLANG_RT_PLACEHOLDER|`pwd`/wasi-sdk-30/build/sysr
%endif
%patch -P603 -p1 -b .inline
+# ppc64le JIT
+%patch -P900 -p1
+%patch -P901 -p1
+%patch -P902 -p1
+
rm -f .mozconfig
cp %{SOURCE10} .mozconfig
echo "ac_add_options --enable-default-toolkit=cairo-gtk3-wayland" >> .mozconfig
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2026-06-16 13:10 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-16 13:10 [rpms/firefox] rawhide: add ppc64le JIT
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox