public inbox for git-commits@fedoraproject.org
help / color / mirror / Atom feed
* [rpms/firefox] rawhide: Revert "add ppc64le JIT"
@ 2026-06-16 13:11 
  0 siblings, 0 replies; only message in thread
From:  @ 2026-06-16 13:11 UTC (permalink / raw)
  To: git-commits

            A new commit has been pushed.

            Repo   : rpms/firefox
            Branch : rawhide
            Commit : 430380f3f64c47eba746c7e67fa2160ef8865644
            Author : Dan Horák <dan@danny.cz>
            Date   : 2026-06-16T13:11:26+00:00
            Stats  : +0/-42086 in 4 file(s)
            URL    : https://src.fedoraproject.org/rpms/firefox/c/430380f3f64c47eba746c7e67fa2160ef8865644?branch=rawhide

            Log:
            Revert "add ppc64le JIT"

This reverts commit e99f0d4925ac596ad75f2ae084620d36c44a85c2.

---
diff --git a/0001-Add-VSX-instructions-for-SKIA.patch b/0001-Add-VSX-instructions-for-SKIA.patch
deleted file mode 100644
index ac3a0d8..0000000
--- a/0001-Add-VSX-instructions-for-SKIA.patch
+++ /dev/null
@@ -1,1347 +0,0 @@
-From a47c991dbbfb709134737a54e8bbe7e0b1bce800 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
-Date: Fri, 12 Jun 2026 15:23:10 +1000
-Subject: [PATCH 1/3] Add VSX instructions for SKIA
-
-Adapted from work done by Raptor Engineering for chromium's vendored
-SKIA
-
-Co-authored-by: Timothy Pearson <tpearson@raptorengineering.com>
----
- gfx/skia/skia/src/base/SkVx.h                 |  58 +++-
- gfx/skia/skia/src/core/SkBlitRow_D32.cpp      |  98 ++++++
- gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp   | 268 ++++++++++++++++
- .../skia/src/opts/SkBitmapProcState_opts.h    | 164 ++++++++++
- gfx/skia/skia/src/opts/SkBlitRow_opts.h       |  48 +++
- .../skia/src/opts/SkRasterPipeline_opts.h     | 237 ++++++++++++++
- gfx/skia/skia/src/opts/SkSwizzler_opts.inc    | 289 ++++++++++++++++++
- 7 files changed, 1160 insertions(+), 2 deletions(-)
-
-diff --git a/gfx/skia/skia/src/base/SkVx.h b/gfx/skia/skia/src/base/SkVx.h
-index f87ca44d4af0..ed80c91fd38e 100644
---- a/gfx/skia/skia/src/base/SkVx.h
-+++ b/gfx/skia/skia/src/base/SkVx.h
-@@ -52,6 +52,8 @@
-         #include <arm_neon.h>
-     #elif defined(__wasm_simd128__)
-         #include <wasm_simd128.h>
-+    #elif defined(SK_CPU_PPC) && defined(__VSX__)
-+        #include <altivec.h>
-     #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
-         #include <lasxintrin.h>
-         #include <lsxintrin.h>
-@@ -509,6 +511,14 @@ SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec
-                                               sk_bit_cast<uint8x16_t>(e)));
-     }
- #endif
-+#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
-+    if constexpr (N*sizeof(T) == 16) {
-+        return sk_bit_cast<Vec<N,T>>(
-+                vec_sel(sk_bit_cast<__vector unsigned char>(e),
-+                        sk_bit_cast<__vector unsigned char>(t),
-+                        sk_bit_cast<__vector unsigned char>(cond)));
-+    }
-+#endif
- #if SKVX_USE_SIMD && SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
-     if constexpr (N*sizeof(T) == 32) {
-         return sk_bit_cast<Vec<N,T>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
-@@ -579,6 +589,11 @@ SINT bool any(const Vec<N,T>& x) {
-                                                           sk_bit_cast<__m128i>(x)));
-         return retv[0] != 0b0000;
-     }
-+#endif
-+#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
-+    if constexpr (N*sizeof(T) == 16) {
-+        return vec_any_ne(sk_bit_cast<__vector unsigned int>(x), vec_splats(0u));
-+    }
- #endif
-     return any(x.lo)
-         || any(x.hi);
-@@ -622,6 +637,11 @@ SINT bool all(const Vec<N,T>& x) {
-                                                           sk_bit_cast<__m128i>(x)));
-         return retv[0] == 0b1111;
-     }
-+#endif
-+#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
-+    if constexpr (N*sizeof(T) == 16) {
-+        return vec_all_ne(sk_bit_cast<__vector unsigned int>(x), vec_splats(0u));
-+    }
- #endif
-     return all(x.lo)
-         && all(x.hi);
-@@ -647,8 +667,22 @@ SIT  T max(const Vec<1,T>& x) { return x.val; }
- SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
- SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }
- 
--SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
--SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }
-+SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) {
-+#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
-+    if constexpr (N*sizeof(T) == 16) {
-+        return sk_bit_cast<Vec<N,T>>(vec_min(to_vext(x), to_vext(y)));
-+    }
-+#endif
-+    return naive_if_then_else(y < x, y, x);
-+}
-+SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) {
-+#if SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
-+    if constexpr (N*sizeof(T) == 16) {
-+        return sk_bit_cast<Vec<N,T>>(vec_max(to_vext(x), to_vext(y)));
-+    }
-+#endif
-+    return naive_if_then_else(x < y, y, x);
-+}
- 
- SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
- SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
-@@ -960,6 +994,26 @@ SIN Vec<N,uint16_t> mulhi(const Vec<N,uint16_t>& x,
-     } else { // N > 8
-         return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
-     }
-+#elif SKVX_USE_SIMD && defined(SK_CPU_PPC) && defined(__VSX__)
-+    if constexpr (N == 8) {
-+        // u16*u16 -> u32 even/odd products (vmuleuh/vmulouh), then gather the
-+        // high 16 bits of each back into sequential lanes. Same idiom as the
-+        // VSX scale() in SkSwizzler_opts.
-+        __vector unsigned short xs = sk_bit_cast<__vector unsigned short>(x);
-+        __vector unsigned short ys = sk_bit_cast<__vector unsigned short>(y);
-+        __vector unsigned int even = vec_vmuleuh(xs, ys);
-+        __vector unsigned int odd  = vec_vmulouh(xs, ys);
-+        const __vector unsigned char hi = {
-+            0x02,0x03, 0x12,0x13,  0x06,0x07, 0x16,0x17,
-+            0x0A,0x0B, 0x1A,0x1B,  0x0E,0x0F, 0x1E,0x1F
-+        };
-+        return sk_bit_cast<Vec<8,uint16_t>>(
-+            vec_perm((__vector unsigned char)even, (__vector unsigned char)odd, hi));
-+    } else if constexpr (N < 8) {
-+        return mulhi(join(x,x), join(y,y)).lo;
-+    } else { // N > 8
-+        return join(mulhi(x.lo, y.lo), mulhi(x.hi, y.hi));
-+    }
- #else
-     return skvx::cast<uint16_t>(mull(x, y) >> 16);
- #endif
-diff --git a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
-index bcbf2e66bd46..920d6a9b2366 100644
---- a/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
-+++ b/gfx/skia/skia/src/core/SkBlitRow_D32.cpp
-@@ -517,6 +517,104 @@ static void blit_row_s32_opaque(SkPMColor* dst,
-         }
-     }
- 
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+    #include <altivec.h>
-+
-+    // dst + (((src - dst) * src_scale) >> 8), splayed into 16-bit lanes; the
-+    // vec_* transcription of SkPMLerp_SSE2.
-+    static inline __vector unsigned char SkPMLerp_VSX(__vector unsigned char src,
-+                                                      __vector unsigned char dst,
-+                                                      unsigned src_scale) {
-+        const __vector unsigned int mask = vec_splats(0x00FF00FFu);
-+        const __vector unsigned short eight = vec_splats((unsigned short)8);
-+        __vector unsigned short src_rb = (__vector unsigned short)vec_and((__vector unsigned int)src, mask);
-+        __vector unsigned short src_ag = vec_sr((__vector unsigned short)src, eight);
-+        __vector unsigned short dst_rb = (__vector unsigned short)vec_and((__vector unsigned int)dst, mask);
-+        __vector unsigned short dst_ag = vec_sr((__vector unsigned short)dst, eight);
-+        __vector unsigned short s = vec_splats((unsigned short)src_scale);
-+        __vector unsigned short diff_rb = vec_mul(vec_sub(src_rb, dst_rb), s);
-+        __vector unsigned short diff_ag = vec_mul(vec_sub(src_ag, dst_ag), s);
-+        diff_rb = vec_sr(diff_rb, eight);
-+        __vector unsigned int diff = vec_or((__vector unsigned int)diff_rb,
-+                                            vec_andc((__vector unsigned int)diff_ag, mask));
-+        return vec_add(dst, (__vector unsigned char)diff);
-+    }
-+
-+    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
-+        SkASSERT(alpha <= 255);
-+        unsigned src_scale = SkAlpha255To256(alpha);
-+        while (count >= 4) {
-+            __vector unsigned char s = vec_xl(0, (const unsigned char*)src);
-+            __vector unsigned char d = vec_xl(0, (const unsigned char*)dst);
-+            vec_xst(SkPMLerp_VSX(s, d, src_scale), 0, (unsigned char*)dst);
-+            src += 4; dst += 4; count -= 4;
-+        }
-+        while (count --> 0) {
-+            *dst = SkPMLerp(*src, *dst, src_scale);
-+            src++;
-+            dst++;
-+        }
-+    }
-+
-+    // The vec_* transcription of SkBlendARGB32_SSE2: scale src by aa and dst by
-+    // SkAlphaMulInv256(srcA, aa), then add the splayed halves.
-+    static inline __vector unsigned char SkBlendARGB32_VSX(__vector unsigned char src,
-+                                                           __vector unsigned char dst,
-+                                                           unsigned aa) {
-+        unsigned alpha = SkAlpha255To256(aa);
-+        __vector unsigned short src_scale = vec_splats((unsigned short)alpha);
-+        const __vector unsigned int mask = vec_splats(0x00FF00FFu);
-+        const __vector unsigned short eight = vec_splats((unsigned short)8);
-+
-+        // dst_scale = SkAlphaMulInv256(SkGetPackedA32(src), alpha), per 32-bit lane.
-+        __vector unsigned int srcA = vec_sr((__vector unsigned int)src, vec_splats(24u));
-+        __vector unsigned int ds = (__vector unsigned int)vec_mul((__vector unsigned short)srcA, src_scale);
-+        ds = vec_sub(vec_splats((unsigned int)0xFFFF), ds);
-+        ds = vec_add(ds, vec_sr(ds, vec_splats(8u)));
-+        ds = vec_sr(ds, vec_splats(8u));
-+        // Duplicate the low 16-bit word of each 32-bit lane into both halves
-+        // (the SSE shufflelo/shufflehi _MM_SHUFFLE(2,2,0,0)).
-+        const __vector unsigned char dup = (__vector unsigned char){
-+            0,1,0,1, 4,5,4,5, 8,9,8,9, 12,13,12,13
-+        };
-+        __vector unsigned short dst_scale =
-+            (__vector unsigned short)vec_perm((__vector unsigned char)ds,
-+                                              (__vector unsigned char)ds, dup);
-+
-+        __vector unsigned short src_rb = (__vector unsigned short)vec_and((__vector unsigned int)src, mask);
-+        __vector unsigned short src_ag = vec_sr((__vector unsigned short)src, eight);
-+        __vector unsigned short dst_rb = (__vector unsigned short)vec_and((__vector unsigned int)dst, mask);
-+        __vector unsigned short dst_ag = vec_sr((__vector unsigned short)dst, eight);
-+
-+        src_rb = vec_mul(src_rb, src_scale);
-+        src_ag = vec_mul(src_ag, src_scale);
-+        dst_rb = vec_mul(dst_rb, dst_scale);
-+        dst_ag = vec_mul(dst_ag, dst_scale);
-+
-+        dst_rb = vec_add(src_rb, dst_rb);
-+        dst_ag = vec_add(src_ag, dst_ag);
-+
-+        dst_rb = vec_sr(dst_rb, eight);
-+        __vector unsigned int out = vec_or((__vector unsigned int)dst_rb,
-+                                           vec_andc((__vector unsigned int)dst_ag, mask));
-+        return (__vector unsigned char)out;
-+    }
-+
-+    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
-+        SkASSERT(alpha <= 255);
-+        while (count >= 4) {
-+            __vector unsigned char s = vec_xl(0, (const unsigned char*)src);
-+            __vector unsigned char d = vec_xl(0, (const unsigned char*)dst);
-+            vec_xst(SkBlendARGB32_VSX(s, d, alpha), 0, (unsigned char*)dst);
-+            src += 4; dst += 4; count -= 4;
-+        }
-+        while (count --> 0) {
-+            *dst = SkBlendARGB32(*src, *dst, alpha);
-+            src++;
-+            dst++;
-+        }
-+    }
-+
- #else
-     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
-         SkASSERT(alpha <= 255);
-diff --git a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
-index a7538027b85d..9669431292b6 100644
---- a/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
-+++ b/gfx/skia/skia/src/core/SkBlitter_ARGB32.cpp
-@@ -480,6 +480,274 @@ static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
-         }
-     }
- 
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+    #include <altivec.h>
-+
-+    // Native VSX/AltiVec port of the SSE2 LCD-subpixel blend block below.
-+    // Same algorithm — only the intrinsics change. Translations follow the
-+    // GCC ppc_wrappers pattern (vec_mergeh/l, vec_packsu, etc.).
-+
-+    // The following (left) shifts cause the top 5 bits of the mask components to
-+    // line up with the corresponding components in an SkPMColor.
-+    // Note that the mask's RGB16 order may differ from the SkPMColor order.
-+    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
-+    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
-+    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
-+
-+    // Each macro must always return __vector unsigned int so the surrounding
-+    // vec_and gets matching element types. The pass-through case (SHIFT == 0)
-+    // still needs an explicit reinterpret-cast since `mask` is __vector
-+    // unsigned char in our function signature.
-+    #if SK_R16x5_R32x5_SHIFT == 0
-+        #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) ((__vector unsigned int)(x))
-+    #elif SK_R16x5_R32x5_SHIFT > 0
-+        #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) \
-+            vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_R16x5_R32x5_SHIFT))
-+    #else
-+        #define SkPackedR16x5ToUnmaskedR32x5_VSX(x) \
-+            vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_R16x5_R32x5_SHIFT)))
-+    #endif
-+
-+    #if SK_G16x5_G32x5_SHIFT == 0
-+        #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) ((__vector unsigned int)(x))
-+    #elif SK_G16x5_G32x5_SHIFT > 0
-+        #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) \
-+            vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_G16x5_G32x5_SHIFT))
-+    #else
-+        #define SkPackedG16x5ToUnmaskedG32x5_VSX(x) \
-+            vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_G16x5_G32x5_SHIFT)))
-+    #endif
-+
-+    #if SK_B16x5_B32x5_SHIFT == 0
-+        #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) ((__vector unsigned int)(x))
-+    #elif SK_B16x5_B32x5_SHIFT > 0
-+        #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) \
-+            vec_sl((__vector unsigned int)(x), vec_splats((unsigned int)SK_B16x5_B32x5_SHIFT))
-+    #else
-+        #define SkPackedB16x5ToUnmaskedB32x5_VSX(x) \
-+            vec_sr((__vector unsigned int)(x), vec_splats((unsigned int)(-SK_B16x5_B32x5_SHIFT)))
-+    #endif
-+
-+    static __vector unsigned char blend_lcd16_vsx(__vector unsigned char& src,
-+                                                   __vector unsigned char& dst,
-+                                                   __vector unsigned char& mask,
-+                                                   __vector unsigned char& srcA) {
-+        // Get the R,G,B of each 16bit mask pixel, all aligned to 5-bit positions.
-+        __vector unsigned int r = vec_and(SkPackedR16x5ToUnmaskedR32x5_VSX(mask),
-+                                          vec_splats((unsigned int)(0x1F << SK_R32_SHIFT)));
-+        __vector unsigned int g = vec_and(SkPackedG16x5ToUnmaskedG32x5_VSX(mask),
-+                                          vec_splats((unsigned int)(0x1F << SK_G32_SHIFT)));
-+        __vector unsigned int b = vec_and(SkPackedB16x5ToUnmaskedB32x5_VSX(mask),
-+                                          vec_splats((unsigned int)(0x1F << SK_B32_SHIFT)));
-+
-+        // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA.
-+        __vector unsigned int rA = vec_sl(r, vec_splats((unsigned int)(SK_A32_SHIFT - SK_R32_SHIFT)));
-+        __vector unsigned int gA = vec_sl(g, vec_splats((unsigned int)(SK_A32_SHIFT - SK_G32_SHIFT)));
-+        __vector unsigned int bA = vec_sl(b, vec_splats((unsigned int)(SK_A32_SHIFT - SK_B32_SHIFT)));
-+        __vector unsigned char aMin = vec_min(vec_min((__vector unsigned char)rA,
-+                                                       (__vector unsigned char)gA),
-+                                              (__vector unsigned char)bA);
-+        __vector unsigned char aMax = vec_max(vec_max((__vector unsigned char)rA,
-+                                                       (__vector unsigned char)gA),
-+                                              (__vector unsigned char)bA);
-+        // srcA has been biased to [0-256]; compare srcA against (dstA+1).
-+        __vector unsigned int dstA = vec_and(vec_add((__vector unsigned int)dst,
-+                                                     vec_splats((unsigned int)(1 << SK_A32_SHIFT))),
-+                                             vec_splats((unsigned int)SK_A32_MASK));
-+        __vector __bool int aLT = vec_cmplt((__vector signed int)srcA, (__vector signed int)dstA);
-+        // a = (aMin & aLT) | (aMax & ~aLT)
-+        __vector unsigned char a = vec_or(vec_and(aMin, (__vector unsigned char)aLT),
-+                                          vec_andc(aMax, (__vector unsigned char)aLT));
-+
-+        // Pack the 4 16-bit mask pixels into 4 32-bit pixels (m0A, m0R, m0G, m0B, ...).
-+        mask = vec_or(vec_or(a, (__vector unsigned char)r),
-+                      vec_or((__vector unsigned char)g, (__vector unsigned char)b));
-+
-+        // Interleave into 16-bit words.
-+        const __vector unsigned char zeros = vec_splats((unsigned char)0);
-+        __vector unsigned short maskLo = (__vector unsigned short)vec_mergeh(mask, zeros);
-+        __vector unsigned short maskHi = (__vector unsigned short)vec_mergel(mask, zeros);
-+
-+        // Upscale 0..31 -> 0..32 by adding (mask >> 4).
-+        const __vector unsigned short v4 = vec_splats((unsigned short)4);
-+        const __vector unsigned short v8 = vec_splats((unsigned short)8);
-+        const __vector unsigned short v5 = vec_splats((unsigned short)5);
-+        maskLo = vec_add(maskLo, vec_sr(maskLo, v4));
-+        maskHi = vec_add(maskHi, vec_sr(maskHi, v4));
-+
-+        // Multiply by srcA per 16-bit lane.
-+        maskLo = vec_mul(maskLo, (__vector unsigned short)srcA);
-+        maskHi = vec_mul(maskHi, (__vector unsigned short)srcA);
-+        // Divide by 256 (right-shift 8).
-+        maskLo = vec_sr(maskLo, v8);
-+        maskHi = vec_sr(maskHi, v8);
-+
-+        // Unpack dst into 16-bit words.
-+        __vector signed short dstLo = (__vector signed short)vec_mergeh(dst, zeros);
-+        __vector signed short dstHi = (__vector signed short)vec_mergel(dst, zeros);
-+        // mask = (src - dst) * mask
-+        __vector signed short srcS = (__vector signed short)src;
-+        __vector signed short mLoS = vec_mul((__vector signed short)maskLo, vec_sub(srcS, dstLo));
-+        __vector signed short mHiS = vec_mul((__vector signed short)maskHi, vec_sub(srcS, dstHi));
-+        // arithmetic shift right by 5
-+        mLoS = vec_sra(mLoS, (__vector unsigned short)v5);
-+        mHiS = vec_sra(mHiS, (__vector unsigned short)v5);
-+        // result = dst + ((src - dst) * mask >> 5)
-+        __vector signed short resLo = vec_add(dstLo, mLoS);
-+        __vector signed short resHi = vec_add(dstHi, mHiS);
-+        // Pack 16-bit signed -> 8-bit unsigned with saturation.
-+        return vec_packsu(resLo, resHi);
-+    }
-+
-+    static __vector unsigned char blend_lcd16_opaque_vsx(__vector unsigned char& src,
-+                                                          __vector unsigned char& dst,
-+                                                          __vector unsigned char& mask) {
-+        __vector unsigned int r = vec_and(SkPackedR16x5ToUnmaskedR32x5_VSX(mask),
-+                                          vec_splats((unsigned int)(0x1F << SK_R32_SHIFT)));
-+        __vector unsigned int g = vec_and(SkPackedG16x5ToUnmaskedG32x5_VSX(mask),
-+                                          vec_splats((unsigned int)(0x1F << SK_G32_SHIFT)));
-+        __vector unsigned int b = vec_and(SkPackedB16x5ToUnmaskedB32x5_VSX(mask),
-+                                          vec_splats((unsigned int)(0x1F << SK_B32_SHIFT)));
-+
-+        // Opaque src: a = max(r, g, b) shifted to alpha lane.
-+        __vector unsigned int rA = vec_sl(r, vec_splats((unsigned int)(SK_A32_SHIFT - SK_R32_SHIFT)));
-+        __vector unsigned int gA = vec_sl(g, vec_splats((unsigned int)(SK_A32_SHIFT - SK_G32_SHIFT)));
-+        __vector unsigned int bA = vec_sl(b, vec_splats((unsigned int)(SK_A32_SHIFT - SK_B32_SHIFT)));
-+        __vector unsigned char a = vec_max(vec_max((__vector unsigned char)rA,
-+                                                    (__vector unsigned char)gA),
-+                                           (__vector unsigned char)bA);
-+
-+        mask = vec_or(vec_or(a, (__vector unsigned char)r),
-+                      vec_or((__vector unsigned char)g, (__vector unsigned char)b));
-+
-+        const __vector unsigned char zeros = vec_splats((unsigned char)0);
-+        __vector unsigned short maskLo = (__vector unsigned short)vec_mergeh(mask, zeros);
-+        __vector unsigned short maskHi = (__vector unsigned short)vec_mergel(mask, zeros);
-+
-+        const __vector unsigned short v4 = vec_splats((unsigned short)4);
-+        const __vector unsigned short v5 = vec_splats((unsigned short)5);
-+        maskLo = vec_add(maskLo, vec_sr(maskLo, v4));
-+        maskHi = vec_add(maskHi, vec_sr(maskHi, v4));
-+
-+        __vector signed short dstLo = (__vector signed short)vec_mergeh(dst, zeros);
-+        __vector signed short dstHi = (__vector signed short)vec_mergel(dst, zeros);
-+        __vector signed short srcS = (__vector signed short)src;
-+        __vector signed short mLoS = vec_mul((__vector signed short)maskLo, vec_sub(srcS, dstLo));
-+        __vector signed short mHiS = vec_mul((__vector signed short)maskHi, vec_sub(srcS, dstHi));
-+        mLoS = vec_sra(mLoS, (__vector unsigned short)v5);
-+        mHiS = vec_sra(mHiS, (__vector unsigned short)v5);
-+        __vector signed short resLo = vec_add(dstLo, mLoS);
-+        __vector signed short resHi = vec_add(dstHi, mHiS);
-+        return vec_packsu(resLo, resHi);
-+    }
-+
-+    void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src,
-+                        int width, SkPMColor) {
-+        if (width <= 0) {
-+            return;
-+        }
-+        int srcA = SkColorGetA(src);
-+        int srcR = SkColorGetR(src);
-+        int srcG = SkColorGetG(src);
-+        int srcB = SkColorGetB(src);
-+        srcA = SkAlpha255To256(srcA);
-+
-+        if (width >= 4) {
-+            SkASSERT(SkIsAlign4((uintptr_t) dst));
-+            while (!SkIsAlign16((uintptr_t) dst)) {
-+                *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
-+                mask++; dst++; width--;
-+            }
-+
-+            // Replicate source across 4 lanes, then unpack low half to interleaved 16-bit.
-+            uint32_t srcPM = SkPackARGB32(0xFF, srcR, srcG, srcB);
-+            __vector unsigned int src_v32 = vec_splats(srcPM);
-+            const __vector unsigned char zeros = vec_splats((unsigned char)0);
-+            __vector unsigned char src_v = vec_mergeh((__vector unsigned char)src_v32, zeros);
-+            __vector unsigned char srcA_v = (__vector unsigned char)vec_splats((unsigned short)srcA);
-+
-+            while (width >= 4) {
-+                __vector unsigned char dst_v = vec_xl(0, (const unsigned char*)dst);
-+                // Load 8 bytes (4x uint16 mask) into low half of vector.
-+                uint64_t mlo;
-+                memcpy(&mlo, mask, sizeof(mlo));
-+                __vector unsigned long long mask_low =
-+                    (__vector unsigned long long){mlo, 0};
-+                __vector unsigned char mask_v = (__vector unsigned char)mask_low;
-+
-+                // Check if all mask values are zero (skip blending if so).
-+                if (!vec_all_eq((__vector unsigned long long)mask_v,
-+                                vec_splats((unsigned long long)0))) {
-+                    // Unpack low 8 bytes of mask (4x uint16) into 4x uint32 (with zeros).
-+                    // Zero-extend the 4 uint16 masks to 4 uint32 (16-bit-granularity
-+                    // merge, matching SSE2's _mm_unpacklo_epi16); a char-granularity
-+                    // merge would byte-stretch the RGB565 value and misplace the shifts.
-+                    mask_v = (__vector unsigned char)vec_mergeh((__vector unsigned short)mask_v,
-+                                                               (__vector unsigned short)zeros);
-+                    __vector unsigned char result =
-+                        blend_lcd16_vsx(src_v, dst_v, mask_v, srcA_v);
-+                    vec_xst(result, 0, (unsigned char*)dst);
-+                }
-+                dst += 4; mask += 4; width -= 4;
-+            }
-+        }
-+
-+        while (width > 0) {
-+            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
-+            mask++; dst++; width--;
-+        }
-+    }
-+
-+    void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
-+                               SkColor src, int width, SkPMColor opaqueDst) {
-+        if (width <= 0) {
-+            return;
-+        }
-+        int srcR = SkColorGetR(src);
-+        int srcG = SkColorGetG(src);
-+        int srcB = SkColorGetB(src);
-+
-+        if (width >= 4) {
-+            SkASSERT(SkIsAlign4((uintptr_t) dst));
-+            while (!SkIsAlign16((uintptr_t) dst)) {
-+                *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
-+                mask++; dst++; width--;
-+            }
-+
-+            uint32_t srcPM = SkPackARGB32(0xFF, srcR, srcG, srcB);
-+            __vector unsigned int src_v32 = vec_splats(srcPM);
-+            const __vector unsigned char zeros = vec_splats((unsigned char)0);
-+            __vector unsigned char src_v = vec_mergeh((__vector unsigned char)src_v32, zeros);
-+
-+            while (width >= 4) {
-+                __vector unsigned char dst_v = vec_xl(0, (const unsigned char*)dst);
-+                uint64_t mlo;
-+                memcpy(&mlo, mask, sizeof(mlo));
-+                __vector unsigned long long mask_low =
-+                    (__vector unsigned long long){mlo, 0};
-+                __vector unsigned char mask_v = (__vector unsigned char)mask_low;
-+
-+                if (!vec_all_eq((__vector unsigned long long)mask_v,
-+                                vec_splats((unsigned long long)0))) {
-+                    // Zero-extend the 4 uint16 masks to 4 uint32 (16-bit-granularity
-+                    // merge, matching SSE2's _mm_unpacklo_epi16); a char-granularity
-+                    // merge would byte-stretch the RGB565 value and misplace the shifts.
-+                    mask_v = (__vector unsigned char)vec_mergeh((__vector unsigned short)mask_v,
-+                                                               (__vector unsigned short)zeros);
-+                    __vector unsigned char result =
-+                        blend_lcd16_opaque_vsx(src_v, dst_v, mask_v);
-+                    vec_xst(result, 0, (unsigned char*)dst);
-+                }
-+                dst += 4; mask += 4; width -= 4;
-+            }
-+        }
-+
-+        while (width > 0) {
-+            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
-+            mask++; dst++; width--;
-+        }
-+    }
-+
- #elif defined(SK_ARM_HAS_NEON)
-     #include <arm_neon.h>
- 
-diff --git a/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h b/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
-index 6d01a2f4458f..87b160ed7a1e 100644
---- a/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
-+++ b/gfx/skia/skia/src/opts/SkBitmapProcState_opts.h
-@@ -29,6 +29,8 @@
-     #include <lasxintrin.h>
- #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
-     #include <lsxintrin.h>
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+    #include <altivec.h>
- #endif
- 
- namespace SK_OPTS_NS {
-@@ -260,6 +262,168 @@ static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, O
-         }
-     }
- 
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+
-+    // Helper: scalar uint32_t -> 16-byte vector with x in low 32 bits, zero elsewhere.
-+    // Equivalent of x86's _mm_cvtsi32_si128.
-+    static inline __vector unsigned char vsx_cvt_u32_to_vec(uint32_t x) {
-+        __vector unsigned int v = (__vector unsigned int){x, 0, 0, 0};
-+        return (__vector unsigned char)v;
-+    }
-+
-+    // Helper: PPC64 VSX equivalent of x86's _mm_maddubs_epi16. Multiplies pairs of
-+    // (unsigned byte, signed byte) and adds adjacent pairs to produce 16-bit signed
-+    // values, saturating to int16. Implementation transcribes the GCC ppc_wrappers
-+    // tmmintrin.h sequence for endianness correctness on LE PPC64.
-+    static inline __vector signed short vsx_maddubs_epi16(__vector unsigned char A,
-+                                                            __vector signed char B) {
-+        __vector signed short __ff = vec_splats((signed short)0x00FF);
-+        __vector signed short __C = vec_and(vec_unpackh((__vector signed char)A), __ff);
-+        __vector signed short __D = vec_and(vec_unpackl((__vector signed char)A), __ff);
-+        __vector signed short __E = vec_unpackh(B);
-+        __vector signed short __F = vec_unpackl(B);
-+        __C = vec_mul(__C, __E);
-+        __D = vec_mul(__D, __F);
-+        const __vector unsigned char __odds  = (__vector unsigned char){
-+            0,1, 4,5, 8,9, 12,13,  16,17, 20,21, 24,25, 28,29
-+        };
-+        const __vector unsigned char __evens = (__vector unsigned char){
-+            2,3, 6,7, 10,11, 14,15,  18,19, 22,23, 26,27, 30,31
-+        };
-+        __E = (__vector signed short)vec_perm((__vector unsigned char)__C,
-+                                              (__vector unsigned char)__D, __odds);
-+        __F = (__vector signed short)vec_perm((__vector unsigned char)__C,
-+                                              (__vector unsigned char)__D, __evens);
-+        return vec_adds(__E, __F);
-+    }
-+
-+    /*not static*/ inline
-+    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
-+                                 const uint32_t* xy, int count, uint32_t* colors) {
-+        SkASSERT(count > 0 && colors != nullptr);
-+        SkASSERT(s.fBilerp);
-+        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
-+        SkASSERT(s.fAlphaScale <= 256);
-+
-+        // interpolate_in_x() is the crux of the implementation, interpolating in X
-+        // for up to two output pixels (A and B) using vsx_maddubs_epi16().
-+        auto interpolate_in_x = [](uint32_t A0, uint32_t A1,
-+                                   uint32_t B0, uint32_t B1,
-+                                   __vector signed char interlaced_x_weights) {
-+            // _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1))
-+            // = vec_mergeh on uchar, since the input vectors have only the low 32 bits set.
-+            __vector unsigned char interlaced_A = vec_mergeh(vsx_cvt_u32_to_vec(A0),
-+                                                              vsx_cvt_u32_to_vec(A1));
-+            __vector unsigned char interlaced_B = vec_mergeh(vsx_cvt_u32_to_vec(B0),
-+                                                              vsx_cvt_u32_to_vec(B1));
-+            // _mm_unpacklo_epi64 = vec_mergeh on long long.
-+            __vector long long lo64 = vec_mergeh((__vector long long)interlaced_A,
-+                                                 (__vector long long)interlaced_B);
-+            return vsx_maddubs_epi16((__vector unsigned char)lo64, interlaced_x_weights);
-+        };
-+
-+        // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
-+        // Returns two pixels, with each color channel in a 16-bit lane of the result.
-+        auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,
-+                                          uint32_t A2, uint32_t A3,
-+                                          uint32_t B0, uint32_t B1,
-+                                          uint32_t B2, uint32_t B3,
-+                                          __vector signed char interlaced_x_weights,
-+                                          int wy) {
-+            __vector signed short top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights);
-+            __vector signed short bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
-+
-+            // 16*top + (bot-top)*wy, mirroring the SSE2 form (saves one multiply vs. the
-+            // straightforward top*(16-wy) + bot*wy).
-+            __vector unsigned short v4 = vec_splats((unsigned short)4);
-+            __vector signed short wy_v = vec_splats((signed short)wy);
-+            __vector signed short px = vec_add(vec_sl(top, v4), vec_mul(vec_sub(bot, top), wy_v));
-+
-+            // Scale down by total max weight 16x16 = 256.
-+            px = (__vector signed short)vec_sr((__vector unsigned short)px, vec_splats((unsigned short)8));
-+
-+            // Scale by alpha if needed.
-+            if (s.fAlphaScale < 256) {
-+                __vector signed short scale_v = vec_splats((signed short)s.fAlphaScale);
-+                px = (__vector signed short)vec_sr((__vector unsigned short)vec_mul(px, scale_v),
-+                                                   vec_splats((unsigned short)8));
-+            }
-+            return px;
-+        };
-+
-+        // We're in _DX mode here, so we're only varying in X.
-+        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
-+        int y0, y1, wy;
-+        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
-+
-+        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
-+             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
-+
-+        while (count >= 4) {
-+            // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.
-+            int x0[4],
-+                x1[4];
-+            __vector unsigned int wx;
-+
-+            // decode_packed_coordinates_and_weight(), 4x.
-+            __vector unsigned int packed = (__vector unsigned int)vec_xl(0, (const unsigned char*)xy);
-+            __vector unsigned int x0_v = vec_sr(packed, vec_splats(18u));
-+            __vector unsigned int x1_v = vec_and(packed, vec_splats(0x3fffu));
-+            vec_xst((__vector unsigned char)x0_v, 0, (unsigned char*)x0);
-+            vec_xst((__vector unsigned char)x1_v, 0, (unsigned char*)x1);
-+            wx = vec_and(vec_sr(packed, vec_splats(14u)), vec_splats(0xfu));  // [0,15]
-+
-+            // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,
-+            // and sixteen minus that as wl for pixels on the left at x0.
-+            const __vector unsigned char wr_mask = (__vector unsigned char){
-+                0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12
-+            };
-+            __vector unsigned char wr = vec_perm((__vector unsigned char)wx,
-+                                                  (__vector unsigned char)wx, wr_mask);
-+            __vector unsigned char wl = vec_sub(vec_splats((unsigned char)16), wr);
-+
-+            // Interlace wl and wr for vsx_maddubs_epi16().
-+            __vector signed char interlaced_x_weights_AB = (__vector signed char)vec_mergeh(wl, wr);
-+            __vector signed char interlaced_x_weights_CD = (__vector signed char)vec_mergel(wl, wr);
-+
-+            enum { A,B,C,D };
-+
-+            __vector signed short AB = interpolate_in_x_and_y(
-+                    row0[x0[A]], row0[x1[A]], row1[x0[A]], row1[x1[A]],
-+                    row0[x0[B]], row0[x1[B]], row1[x0[B]], row1[x1[B]],
-+                    interlaced_x_weights_AB, wy);
-+            __vector signed short CD = interpolate_in_x_and_y(
-+                    row0[x0[C]], row0[x1[C]], row1[x0[C]], row1[x1[C]],
-+                    row0[x0[D]], row0[x1[D]], row1[x0[D]], row1[x1[D]],
-+                    interlaced_x_weights_CD, wy);
-+
-+            // Pack 16-bit signed -> 8-bit unsigned with saturation, write 4 pixels.
-+            __vector unsigned char packed_out = vec_packsu(AB, CD);
-+            vec_xst(packed_out, 0, (unsigned char*)colors);
-+            xy     += 4;
-+            colors += 4;
-+            count  -= 4;
-+        }
-+
-+        while (count --> 0) {
-+            // Same flow as the count >= 4 loop, but writing one pixel.
-+            int x0, x1, wx;
-+            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
-+
-+            __vector unsigned char wr = vec_splats((unsigned char)wx);
-+            __vector unsigned char wl = vec_sub(vec_splats((unsigned char)16), wr);
-+            __vector signed char interlaced_x_weights = (__vector signed char)vec_mergeh(wl, wr);
-+
-+            __vector signed short Av = interpolate_in_x_and_y(
-+                    row0[x0], row0[x1], row1[x0], row1[x1],
-+                    0, 0, 0, 0,
-+                    interlaced_x_weights, wy);
-+            __vector unsigned char packed_out = vec_packsu(Av,
-+                    (__vector signed short)(__vector unsigned char){0});
-+            *colors++ = ((__vector unsigned int)packed_out)[0];
-+        }
-+    }
-+
- #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
-     /*not static*/ inline
-     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
-diff --git a/gfx/skia/skia/src/opts/SkBlitRow_opts.h b/gfx/skia/skia/src/opts/SkBlitRow_opts.h
-index d1de5681a72e..d03908a03a32 100644
---- a/gfx/skia/skia/src/opts/SkBlitRow_opts.h
-+++ b/gfx/skia/skia/src/opts/SkBlitRow_opts.h
-@@ -68,6 +68,43 @@
-     }
- #endif
- 
-+#if defined(SK_CPU_PPC) && defined(__VSX__)
-+    #include <altivec.h>
-+
-+    // Native VSX/AltiVec port of SkPMSrcOver_SSE2.
-+    // Same algorithm: src + dst*(256-srcAlpha)/256.
-+    static inline __vector unsigned char SkPMSrcOver_VSX(__vector unsigned char src,
-+                                                         __vector unsigned char dst) {
-+        __vector unsigned int src_u32 = (__vector unsigned int)src;
-+        __vector unsigned int dst_u32 = (__vector unsigned int)dst;
-+
-+        // scale = 256 - (src >> 24)  (per 32-bit lane)
-+        __vector unsigned int scale = vec_sub(vec_splats((unsigned int)256),
-+                                              vec_sr(src_u32, vec_splats(24u)));
-+        // scale_x2 = (scale << 16) | scale  -- splat the scale into both 16-bit halves
-+        __vector unsigned int scale_x2 = vec_or(vec_sl(scale, vec_splats(16u)), scale);
-+
-+        const __vector unsigned int rb_mask = vec_splats(0x00FF00FFu);
-+
-+        // rb = (dst & 0x00FF00FF) * scale_x2 >> 8   (R and B channels in 16-bit lanes)
-+        __vector unsigned short rb = (__vector unsigned short)vec_and(rb_mask, dst_u32);
-+        rb = vec_mul(rb, (__vector unsigned short)scale_x2);
-+        rb = vec_sr(rb, vec_splats((unsigned short)8));
-+
-+        // ga = (dst >> 8) * scale_x2  then mask out the rb channels
-+        __vector unsigned short ga = vec_sr((__vector unsigned short)dst_u32,
-+                                            vec_splats((unsigned short)8));
-+        ga = vec_mul(ga, (__vector unsigned short)scale_x2);
-+        // andc(ga, rb_mask) = ga & ~rb_mask  -- keep only G and A channels in 16-bit lanes
-+        __vector unsigned int ga_u32 = vec_andc((__vector unsigned int)ga, rb_mask);
-+
-+        // result = src + adds_epu8(rb | ga)
-+        __vector unsigned char merged =
-+            (__vector unsigned char)vec_or((__vector unsigned int)rb, ga_u32);
-+        return vec_adds(src, merged);
-+    }
-+#endif
-+
- #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-     #include <immintrin.h>
- 
-@@ -176,6 +213,17 @@ inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len,
-     }
- #endif
- 
-+#if defined(SK_CPU_PPC) && defined(__VSX__)
-+    while (len >= 4) {
-+        __vector unsigned char vsrc = vec_xl(0, (const unsigned char*)src);
-+        __vector unsigned char vdst = vec_xl(0, (const unsigned char*)dst);
-+        vec_xst(SkPMSrcOver_VSX(vsrc, vdst), 0, (unsigned char*)dst);
-+        src += 4;
-+        dst += 4;
-+        len -= 4;
-+    }
-+#endif
-+
- #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-     while (len >= 4) {
-         _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src),
-diff --git a/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h b/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
-index 695b71434f8c..e2af0b94f392 100644
---- a/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
-+++ b/gfx/skia/skia/src/opts/SkRasterPipeline_opts.h
-@@ -87,6 +87,8 @@ using NoCtx = const void*;
-     #define SKRP_CPU_SCALAR
- #elif defined(SK_ARM_HAS_NEON)
-     #define SKRP_CPU_NEON
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+    #define SKRP_CPU_VSX
- #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
-     #define SKRP_CPU_SKX
- #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
-@@ -109,6 +111,8 @@ using NoCtx = const void*;
-     #include <math.h>
- #elif defined(SKRP_CPU_NEON)
-     #include <arm_neon.h>
-+#elif defined(SKRP_CPU_VSX)
-+    #include <altivec.h>
- #elif defined(SKRP_CPU_LASX)
-     #include <lasxintrin.h>
-     #include <lsxintrin.h>
-@@ -337,6 +341,239 @@ namespace SK_OPTS_NS {
-         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
-     }
- 
-+#elif defined(SKRP_CPU_VSX)
-+    // Reuse the file-scope Vec<N,T> defined above. It already handles the
-+    // GCC-vs-Clang divergence (ext_vector_type on Clang; vector_size via
-+    // VecHelper on GCC) and produces the right vector-register-passing ABI
-+    // on PPC64. The vec_* intrinsics in <altivec.h> accept either form.
-+    template <typename T> using V = Vec<4, T>;
-+    using F   = V<float   >;
-+    using I32 = V< int32_t>;
-+    using U64 = V<uint64_t>;
-+    using U32 = V<uint32_t>;
-+    using U16 = V<uint16_t>;
-+    using U8  = V<uint8_t >;
-+
-+    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
-+    SI F   min(F a, F b)     { return vec_min(a,b); }
-+    SI I32 min(I32 a, I32 b) { return vec_min(a,b); }
-+    SI U32 min(U32 a, U32 b) { return vec_min(a,b); }
-+    SI F   max(F a, F b)     { return vec_max(a,b); }
-+    SI I32 max(I32 a, I32 b) { return vec_max(a,b); }
-+    SI U32 max(U32 a, U32 b) { return vec_max(a,b); }
-+
-+    SI F   abs_  (F v)   { return vec_abs(v); }
-+    SI I32 abs_  (I32 v) { return vec_abs(v); }
-+    SI F   rcp_approx(F v) { return vec_re(v); }
-+    SI F   rcp_precise (F v) { F e = rcp_approx(v); return e * (2.0f - v * e); }
-+    SI F   rsqrt_approx (F v)   { return vec_rsqrte(v); }
-+
-+    SI U16 pack(U32 v)       { return __builtin_convertvector(v, U16); }
-+    SI U8  pack(U16 v)       { return __builtin_convertvector(v,  U8); }
-+
-+    SI F if_then_else(I32 c, F t, F e) {
-+        return vec_or((__vector float)vec_and((__vector float)c, (__vector float)t), (__vector float)vec_andc((__vector float)e, (__vector float)c));
-+    }
-+    SI I32 if_then_else(I32 c, I32 t, I32 e) {
-+        return (I32)vec_or((__vector unsigned int)vec_and((__vector unsigned int)c, (__vector unsigned int)t), (__vector unsigned int)vec_andc((__vector unsigned int)e, (__vector unsigned int)c));
-+    }
-+
-+    // In both AltiVec and SSE there is no horizontal element compare, unlike ARM.  Fall back to scalar operations here...
-+    SI bool any(I32 c) {
-+        if (vec_extract((U32)c, 0) != 0) return 1;
-+        if (vec_extract((U32)c, 1) != 0) return 1;
-+        if (vec_extract((U32)c, 2) != 0) return 1;
-+        if (vec_extract((U32)c, 3) != 0) return 1;
-+        return 0;
-+    }
-+    SI bool all(I32 c) {
-+        if (vec_extract((U32)c, 0) == 0) return 0;
-+        if (vec_extract((U32)c, 1) == 0) return 0;
-+        if (vec_extract((U32)c, 2) == 0) return 0;
-+        if (vec_extract((U32)c, 3) == 0) return 0;
-+        return 1;
-+    }
-+
-+    SI F     mad(F f, F m, F a) { return vec_madd(f,m,a); }
-+    SI F    nmad(F f, F m, F a) { return vec_nmsub(f,m,a); }
-+    SI F  floor_(F v) { return vec_floor(v); }
-+    SI F   ceil_(F v) { return vec_ceil(v); }
-+    SI F   sqrt_(F v) { return vec_sqrt(v); }
-+    SI I32 iround(F v) { return vec_cts((__vector float)vec_rint(v), 0); }
-+    SI U32 round(F v)  { return vec_ctu((__vector float)vec_rint(v), 0); }
-+    SI U32 round(F v, F scale) { return (U32)vec_cts((__vector float)vec_rint(v*scale), 0); }
-+
-+    template <typename T>
-+    SI V<T> gather(const T* p, U32 ix) {
-+        return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
-+    }
-+    template <typename T>
-+    SI V<T> gather_unaligned(const T* ptr, U32 ix) {
-+        // This tells the compiler ptr might not be aligned appropriately, so
-+        // it generates better assembly.
-+        typedef T __attribute__ ((aligned (1))) unaligned_ptr;
-+        const unaligned_ptr* uptr = static_cast<const unaligned_ptr*>(ptr);
-+        return V<T>{uptr[ix[0]], uptr[ix[1]], uptr[ix[2]], uptr[ix[3]]};
-+    }
-+    template <typename V, typename S>
-+    SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
-+        V before = gather(dst, ix);
-+        V after = if_then_else(mask, src, before);
-+        dst[ix[0]] = after[0];
-+        dst[ix[1]] = after[1];
-+        dst[ix[2]] = after[2];
-+        dst[ix[3]] = after[3];
-+    }
-+
-+    // Native VSX/AltiVec ports of the load2/store2/load3/load4/store4 helpers.
-+    // Each uses vec_xl/vec_xst for unaligned 16-byte loads/stores, vec_mergeh/
-+    // vec_mergel for SSE-style epi16/epi32/ps unpack ops, and vec_perm with a
-+    // byte-mask for the SSE shufflelo/shufflehi/shuffle/srli_si128 ops. The
-+    // PPC64 LE register-to-memory byte order matches x86 LE, so the byte-mask
-+    // patterns are identical to the corresponding _mm_setr_epi8 forms.
-+
-+    SI void load2(const uint16_t* ptr, U16* r, U16* g) {
-+        // Load 8 uint16: r0 g0 r1 g1 r2 g2 r3 g3 (in LE memory order).
-+        __vector unsigned char v = vec_xl(0, (const unsigned char*)ptr);
-+        // Extract every-other 16-bit value via vec_perm (high half of result is unused
-+        // but written; sk_unaligned_load below picks up the low 8 bytes).
-+        const __vector unsigned char r_mask = (__vector unsigned char){
-+            0,1, 4,5, 8,9, 12,13,  0,0,0,0,0,0,0,0
-+        };
-+        const __vector unsigned char g_mask = (__vector unsigned char){
-+            2,3, 6,7, 10,11, 14,15,  0,0,0,0,0,0,0,0
-+        };
-+        __vector unsigned char R_v = vec_perm(v, v, r_mask);
-+        __vector unsigned char G_v = vec_perm(v, v, g_mask);
-+        *r = sk_unaligned_load<U16>(&R_v);
-+        *g = sk_unaligned_load<U16>(&G_v);
-+    }
-+
-+    SI void store2(uint16_t* ptr, U16 r, U16 g) {
-+        // Interleave: rg = r0 g0 r1 g1 r2 g2 r3 g3.
-+        // r and g are 8-byte vectors; widen to 16 and vec_mergeh on ushort takes
-+        // the low 4 lanes of each.
-+        __vector unsigned short rw = widen_cast<__vector unsigned short>(r);
-+        __vector unsigned short gw = widen_cast<__vector unsigned short>(g);
-+        __vector unsigned short rg = vec_mergeh(rw, gw);
-+        vec_xst((__vector unsigned char)rg, 0, (unsigned char*)ptr);
-+    }
-+
-+    SI void load3(const uint16_t* ptr, U16* r, U16* g, U16* b) {
-+        // 4 pixels x 3 channels x 2 bytes = 24 bytes. Two 16-byte loads with overlap
-+        // avoid reading past the 24-byte source.
-+        __vector unsigned char v01 = vec_xl(0, (const unsigned char*)(ptr + 0));
-+        __vector unsigned char v23_raw = vec_xl(0, (const unsigned char*)(ptr + 4));
-+        const __vector unsigned char zero = vec_splats((unsigned char)0);
-+        // v23 = v23_raw >> 4 bytes (drops the overlapping pixel-1 trailing R).
-+        const __vector unsigned char shift4 = (__vector unsigned char){
-+            4,5,6,7, 8,9,10,11, 12,13,14,15, 16,16,16,16
-+        };
-+        __vector unsigned char v23 = vec_perm(v23_raw, zero, shift4);
-+        // _N holds R,G,B for pixel N in its lower 3 lanes. shift6 advances to the next pixel.
-+        const __vector unsigned char shift6 = (__vector unsigned char){
-+            6,7,8,9, 10,11,12,13, 14,15, 16,16, 16,16, 16,16
-+        };
-+        __vector unsigned char _0 = v01;
-+        __vector unsigned char _1 = vec_perm(v01, zero, shift6);
-+        __vector unsigned char _2 = v23;
-+        __vector unsigned char _3 = vec_perm(v23, zero, shift6);
-+        // De-interlace to R,G,B per the SSE flow.
-+        __vector unsigned short _02 = vec_mergeh((__vector unsigned short)_0,
-+                                                  (__vector unsigned short)_2);
-+        __vector unsigned short _13 = vec_mergeh((__vector unsigned short)_1,
-+                                                  (__vector unsigned short)_3);
-+        __vector unsigned short R_v = vec_mergeh(_02, _13);
-+        const __vector unsigned char shift8 = (__vector unsigned char){
-+            8,9,10,11, 12,13,14,15, 16,16,16,16, 16,16,16,16
-+        };
-+        __vector unsigned char G_v = vec_perm((__vector unsigned char)R_v, zero, shift8);
-+        __vector unsigned short B_v = vec_mergel(_02, _13);
-+        *r = sk_unaligned_load<U16>(&R_v);
-+        *g = sk_unaligned_load<U16>(&G_v);
-+        *b = sk_unaligned_load<U16>(&B_v);
-+    }
-+
-+    SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
-+        __vector unsigned short v01 = (__vector unsigned short)
-+            vec_xl(0, (const unsigned char*)ptr);            // r0 g0 b0 a0 r1 g1 b1 a1
-+        __vector unsigned short v23 = (__vector unsigned short)
-+            vec_xl(0, (const unsigned char*)(ptr + 8));      // r2 g2 b2 a2 r3 g3 b3 a3
-+        __vector unsigned short _02 = vec_mergeh(v01, v23);  // r0 r2 g0 g2 b0 b2 a0 a2
-+        __vector unsigned short _13 = vec_mergel(v01, v23);  // r1 r3 g1 g3 b1 b3 a1 a3
-+        __vector unsigned short rg  = vec_mergeh(_02, _13);  // r0 r1 r2 r3 g0 g1 g2 g3
-+        __vector unsigned short ba  = vec_mergel(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
-+        *r = sk_unaligned_load<U16>((const uint16_t*)&rg + 0);
-+        *g = sk_unaligned_load<U16>((const uint16_t*)&rg + 4);
-+        *b = sk_unaligned_load<U16>((const uint16_t*)&ba + 0);
-+        *a = sk_unaligned_load<U16>((const uint16_t*)&ba + 4);
-+    }
-+
-+    SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
-+        __vector unsigned short rw = widen_cast<__vector unsigned short>(r);
-+        __vector unsigned short gw = widen_cast<__vector unsigned short>(g);
-+        __vector unsigned short bw = widen_cast<__vector unsigned short>(b);
-+        __vector unsigned short aw = widen_cast<__vector unsigned short>(a);
-+        __vector unsigned short rg = vec_mergeh(rw, gw);  // r0 g0 r1 g1 r2 g2 r3 g3
-+        __vector unsigned short ba = vec_mergeh(bw, aw);  // b0 a0 b1 a1 b2 a2 b3 a3
-+        // Now interleave 32-bit lanes (each rg pair = 1 lane, each ba pair = 1 lane).
-+        __vector unsigned int rgba_lo = vec_mergeh((__vector unsigned int)rg,
-+                                                    (__vector unsigned int)ba);
-+        __vector unsigned int rgba_hi = vec_mergel((__vector unsigned int)rg,
-+                                                    (__vector unsigned int)ba);
-+        vec_xst((__vector unsigned char)rgba_lo, 0, (unsigned char*)ptr);
-+        vec_xst((__vector unsigned char)rgba_hi, 0, (unsigned char*)(ptr + 8));
-+    }
-+
-+    SI void load2(const float* ptr, F* r, F* g) {
-+        __vector float _01 = vec_xl(0, ptr);          // r0 g0 r1 g1
-+        __vector float _23 = vec_xl(0, ptr + 4);      // r2 g2 r3 g3
-+        // r = lanes {_01[0], _01[2], _23[0], _23[2]}; g = {_01[1], _01[3], _23[1], _23[3]}.
-+        const __vector unsigned char r_mask = (__vector unsigned char){
-+            0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27
-+        };
-+        const __vector unsigned char g_mask = (__vector unsigned char){
-+            4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31
-+        };
-+        *r = (F)vec_perm((__vector unsigned char)_01, (__vector unsigned char)_23, r_mask);
-+        *g = (F)vec_perm((__vector unsigned char)_01, (__vector unsigned char)_23, g_mask);
-+    }
-+
-+    SI void store2(float* ptr, F r, F g) {
-+        __vector float _01 = vec_mergeh((__vector float)r, (__vector float)g);   // r0 g0 r1 g1
-+        __vector float _23 = vec_mergel((__vector float)r, (__vector float)g);   // r2 g2 r3 g3
-+        vec_xst((__vector unsigned char)_01, 0, (unsigned char*)ptr);
-+        vec_xst((__vector unsigned char)_23, 0, (unsigned char*)(ptr + 4));
-+    }
-+
-+    SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
-+        // 4x4 float matrix transpose: rows -> columns.
-+        __vector float row0 = vec_xl(0, ptr +  0);
-+        __vector float row1 = vec_xl(0, ptr +  4);
-+        __vector float row2 = vec_xl(0, ptr +  8);
-+        __vector float row3 = vec_xl(0, ptr + 12);
-+        __vector float T0 = vec_mergeh(row0, row2);  // {row0[0], row2[0], row0[1], row2[1]}
-+        __vector float T1 = vec_mergeh(row1, row3);
-+        __vector float T2 = vec_mergel(row0, row2);
-+        __vector float T3 = vec_mergel(row1, row3);
-+        *r = (F)vec_mergeh(T0, T1);  // {row0[0], row1[0], row2[0], row3[0]}
-+        *g = (F)vec_mergel(T0, T1);
-+        *b = (F)vec_mergeh(T2, T3);
-+        *a = (F)vec_mergel(T2, T3);
-+    }
-+
-+    SI void store4(float* ptr, F r, F g, F b, F a) {
-+        // 4x4 float matrix transpose, then store rows.
-+        __vector float T0 = vec_mergeh((__vector float)r, (__vector float)b);
-+        __vector float T1 = vec_mergeh((__vector float)g, (__vector float)a);
-+        __vector float T2 = vec_mergel((__vector float)r, (__vector float)b);
-+        __vector float T3 = vec_mergel((__vector float)g, (__vector float)a);
-+        vec_xst((__vector unsigned char)vec_mergeh(T0, T1), 0, (unsigned char*)(ptr +  0));
-+        vec_xst((__vector unsigned char)vec_mergel(T0, T1), 0, (unsigned char*)(ptr +  4));
-+        vec_xst((__vector unsigned char)vec_mergeh(T2, T3), 0, (unsigned char*)(ptr +  8));
-+        vec_xst((__vector unsigned char)vec_mergel(T2, T3), 0, (unsigned char*)(ptr + 12));
-+    }
-+
- #elif defined(SKRP_CPU_SKX)
-     template <typename T> using V = Vec<16, T>;
-     using F   = V<float   >;
-diff --git a/gfx/skia/skia/src/opts/SkSwizzler_opts.inc b/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
-index 671db3f05f61..c578238a9e58 100644
---- a/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
-+++ b/gfx/skia/skia/src/opts/SkSwizzler_opts.inc
-@@ -84,6 +84,29 @@ SI float reciprocal_alpha(float a) {
-     auto q = F4{1.0f} / vA;
-     return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
- }
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+// -- VSX -- Harden against timing attacks.
-+// vec_splats / vec_div / vec_cmpgt / vec_and each map to a single VSX op on
-+// both GCC and Clang. vec_cmpgt(vA, 0) is exact for the non-negative-alpha
-+// contract (0 <= a) and avoids Clang's static_cast<float>(vector) extension
-+// that GCC does not support.
-+SK_NO_SANITIZE("float-divide-by-zero")
-+SI float reciprocal_alpha_times_255(float a) {
-+    SkASSERT(0 <= a && a <= 255);
-+    __vector float vA = vec_splats(a);
-+    __vector float q = vec_div(vec_splats(255.0f), vA);
-+    __vector float vMask = (__vector float)vec_cmpgt(vA, vec_splats(0.0f));
-+    return vec_and(vMask, q)[0];
-+}
-+
-+SK_NO_SANITIZE("float-divide-by-zero")
-+SI float reciprocal_alpha(float a) {
-+    SkASSERT(0 <= a && a <= 1);
-+    __vector float vA = vec_splats(a);
-+    __vector float q = vec_div(vec_splats(1.0f), vA);
-+    __vector float vMask = (__vector float)vec_cmpgt(vA, vec_splats(0.0f));
-+    return vec_and(vMask, q)[0];
-+}
- #else
- // -- Portable -- *Not* hardened against timing attacks
- SI float reciprocal_alpha_times_255(float a) {
-@@ -1085,6 +1108,208 @@ void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
-     rgbA_to_BGRA_portable(dst, src, count);
- }
- 
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+// -- VSX -- Native Power VSX/AltiVec ports of the SSSE3 swizzlers below.
-+// Each _mm_* operation is replaced by the corresponding vec_* sequence per
-+// the GCC ppc_wrappers translation pattern (vec_mergeh/l, vec_perm, and the
-+// vec_vmuleuh/vmulouh + permute idiom for _mm_mulhi_epu16). The permute
-+// masks for byte-shuffles use the same byte-order layout as the SSE
-+// _mm_setr_epi8 forms because PPC64 LE register-to-memory byte order is the
-+// same as x86 LE.
-+
-+// Scale: ((x*y) + 128) * 257 >> 16, per 16-bit lane (matches the SSSE3 form).
-+static inline __vector unsigned short scale(__vector unsigned short x, __vector unsigned short y) {
-+    const __vector unsigned short v128 = vec_splats((unsigned short)128);
-+    const __vector unsigned short v257 = vec_splats((unsigned short)257);
-+    __vector unsigned short summ = (__vector unsigned short)((__vector unsigned short)(x * y) + v128);
-+    // _mm_mulhi_epu16 equivalent: 16x16 -> high 16 bits, via mule+mulo+permute.
-+    __vector unsigned int even = vec_vmuleuh(summ, v257);
-+    __vector unsigned int odd  = vec_vmulouh(summ, v257);
-+    const __vector unsigned char xform = (__vector unsigned char){
-+        0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
-+        0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
-+    };
-+    return (__vector unsigned short)vec_perm((__vector unsigned char)even,
-+                                             (__vector unsigned char)odd, xform);
-+}
-+
-+static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
-+    auto premul8 = [=](__vector unsigned char* lo, __vector unsigned char* hi) {
-+        const __vector unsigned char zeros = (__vector unsigned char){0};
-+        const __vector unsigned char planar = kSwapRB
-+            ? (__vector unsigned char){2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15}
-+            : (__vector unsigned char){0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15};
-+
-+        // Swizzle each 16-byte chunk into 8-bit planar layout.
-+        *lo = vec_perm(*lo, *lo, planar);                        // rrrrgggg bbbbaaaa
-+        *hi = vec_perm(*hi, *hi, planar);                        // RRRRGGGG BBBBAAAA
-+
-+        // Interleave the two halves at 32-bit granularity.
-+        __vector unsigned char rg = (__vector unsigned char)
-+            vec_mergeh((__vector unsigned int)*lo, (__vector unsigned int)*hi);  // rrrrRRRR ggggGGGG
-+        __vector unsigned char ba = (__vector unsigned char)
-+            vec_mergel((__vector unsigned int)*lo, (__vector unsigned int)*hi);  // bbbbBBBB aaaaAAAA
-+
-+        // Unpack to 16-bit planar.
-+        __vector unsigned short r = (__vector unsigned short)vec_mergeh(rg, zeros);
-+        __vector unsigned short g = (__vector unsigned short)vec_mergel(rg, zeros);
-+        __vector unsigned short b = (__vector unsigned short)vec_mergeh(ba, zeros);
-+        __vector unsigned short a = (__vector unsigned short)vec_mergel(ba, zeros);
-+
-+        // Premultiply each colour channel by alpha.
-+        r = scale(r, a);
-+        g = scale(g, a);
-+        b = scale(b, a);
-+
-+        // Repack into interlaced pixels.
-+        const __vector unsigned short v8 = vec_splats((unsigned short)8);
-+        __vector unsigned short rg2 = vec_or(r, vec_sl(g, v8));
-+        __vector unsigned short ba2 = vec_or(b, vec_sl(a, v8));
-+        *lo = (__vector unsigned char)vec_mergeh(rg2, ba2);
-+        *hi = (__vector unsigned char)vec_mergel(rg2, ba2);
-+    };
-+
-+    while (count >= 8) {
-+        __vector unsigned char lo = vec_xl(0, (const unsigned char*)(src + 0));
-+        __vector unsigned char hi = vec_xl(0, (const unsigned char*)(src + 4));
-+        premul8(&lo, &hi);
-+        vec_xst(lo, 0, (unsigned char*)(dst + 0));
-+        vec_xst(hi, 0, (unsigned char*)(dst + 4));
-+        src += 8; dst += 8; count -= 8;
-+    }
-+
-+    if (count >= 4) {
-+        __vector unsigned char lo = vec_xl(0, (const unsigned char*)src);
-+        __vector unsigned char hi = (__vector unsigned char){0};
-+        premul8(&lo, &hi);
-+        vec_xst(lo, 0, (unsigned char*)dst);
-+        src += 4; dst += 4; count -= 4;
-+    }
-+
-+    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
-+    proc(dst, src, count);
-+}
-+
-+void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
-+    premul_should_swapRB(false, dst, src, count);
-+}
-+
-+void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
-+    premul_should_swapRB(true, dst, src, count);
-+}
-+
-+void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
-+    const __vector unsigned char swapRB = (__vector unsigned char){
-+        2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15
-+    };
-+    while (count >= 4) {
-+        __vector unsigned char rgba = vec_xl(0, (const unsigned char*)src);
-+        __vector unsigned char bgra = vec_perm(rgba, rgba, swapRB);
-+        vec_xst(bgra, 0, (unsigned char*)dst);
-+        src += 4; dst += 4; count -= 4;
-+    }
-+    RGBA_to_BGRA_portable(dst, src, count);
-+}
-+
-+void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
-+    while (count >= 8) {
-+        __vector unsigned short ga = (__vector unsigned short)vec_xl(0, src);
-+        __vector unsigned short gg = vec_or(
-+                vec_and(ga, vec_splats((unsigned short)0x00FF)),
-+                vec_sl (ga, vec_splats((unsigned short)8)));
-+        __vector unsigned short ggga_lo = vec_mergeh(gg, ga);
-+        __vector unsigned short ggga_hi = vec_mergel(gg, ga);
-+        vec_xst((__vector unsigned char)ggga_lo, 0, (unsigned char*)(dst + 0));
-+        vec_xst((__vector unsigned char)ggga_hi, 0, (unsigned char*)(dst + 4));
-+        src += 8 * 2; dst += 8; count -= 8;
-+    }
-+    grayA_to_RGBA_portable(dst, src, count);
-+}
-+
-+void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
-+    while (count >= 8) {
-+        __vector unsigned short grayA = (__vector unsigned short)vec_xl(0, src);
-+        __vector unsigned short g0 = vec_and(grayA, vec_splats((unsigned short)0x00FF));
-+        __vector unsigned short a0 = vec_sr   (grayA, vec_splats((unsigned short)8));
-+        g0 = scale(g0, a0);
-+        const __vector unsigned short v8 = vec_splats((unsigned short)8);
-+        __vector unsigned short gg = vec_or(g0, vec_sl(g0, v8));
-+        __vector unsigned short ga = vec_or(g0, vec_sl(a0, v8));
-+        __vector unsigned short ggga_lo = vec_mergeh(gg, ga);
-+        __vector unsigned short ggga_hi = vec_mergel(gg, ga);
-+        vec_xst((__vector unsigned char)ggga_lo, 0, (unsigned char*)(dst + 0));
-+        vec_xst((__vector unsigned char)ggga_hi, 0, (unsigned char*)(dst + 4));
-+        src += 8 * 2; dst += 8; count -= 8;
-+    }
-+    grayA_to_rgbA_portable(dst, src, count);
-+}
-+
-+enum Format { kRGB1, kBGR1 };
-+static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
-+    auto convert8 = [=](__vector unsigned char* lo, __vector unsigned char* hi) {
-+        const __vector unsigned char zeros = (__vector unsigned char){0};
-+        const __vector unsigned char planar = (kBGR1 == format)
-+            ? (__vector unsigned char){2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15}
-+            : (__vector unsigned char){0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15};
-+
-+        *lo = vec_perm(*lo, *lo, planar);                        // ccccmmmm yyyykkkk
-+        *hi = vec_perm(*hi, *hi, planar);                        // CCCCMMMM YYYYKKKK
-+        __vector unsigned char cm = (__vector unsigned char)
-+            vec_mergeh((__vector unsigned int)*lo, (__vector unsigned int)*hi);
-+        __vector unsigned char yk = (__vector unsigned char)
-+            vec_mergel((__vector unsigned int)*lo, (__vector unsigned int)*hi);
-+
-+        __vector unsigned short c = (__vector unsigned short)vec_mergeh(cm, zeros);
-+        __vector unsigned short m = (__vector unsigned short)vec_mergel(cm, zeros);
-+        __vector unsigned short y = (__vector unsigned short)vec_mergeh(yk, zeros);
-+        __vector unsigned short k = (__vector unsigned short)vec_mergel(yk, zeros);
-+
-+        __vector unsigned short r = scale(c, k);
-+        __vector unsigned short g = scale(m, k);
-+        __vector unsigned short b = scale(y, k);
-+
-+        const __vector unsigned short v8 = vec_splats((unsigned short)8);
-+        __vector unsigned short rg = vec_or(r, vec_sl(g, v8));
-+        __vector unsigned short ba = vec_or(b, vec_splats((unsigned short)0xFF00));
-+        *lo = (__vector unsigned char)vec_mergeh(rg, ba);
-+        *hi = (__vector unsigned char)vec_mergel(rg, ba);
-+    };
-+
-+    while (count >= 8) {
-+        __vector unsigned char lo = vec_xl(0, (const unsigned char*)(src + 0));
-+        __vector unsigned char hi = vec_xl(0, (const unsigned char*)(src + 4));
-+        convert8(&lo, &hi);
-+        vec_xst(lo, 0, (unsigned char*)(dst + 0));
-+        vec_xst(hi, 0, (unsigned char*)(dst + 4));
-+        src += 8; dst += 8; count -= 8;
-+    }
-+    if (count >= 4) {
-+        __vector unsigned char lo = vec_xl(0, (const unsigned char*)src);
-+        __vector unsigned char hi = (__vector unsigned char){0};
-+        convert8(&lo, &hi);
-+        vec_xst(lo, 0, (unsigned char*)dst);
-+        src += 4; dst += 4; count -= 4;
-+    }
-+    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
-+    proc(dst, src, count);
-+}
-+
-+void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
-+    inverted_cmyk_to(kRGB1, dst, src, count);
-+}
-+
-+void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
-+    inverted_cmyk_to(kBGR1, dst, src, count);
-+}
-+
-+void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
-+    rgbA_to_RGBA_portable(dst, src, count);
-+}
-+
-+void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
-+    rgbA_to_BGRA_portable(dst, src, count);
-+}
-+
- #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
- // -- LASX ----------------------------------------------------------------------------------------
- 
-@@ -1736,6 +1961,39 @@ static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count)
-         }
-         gray_to_RGB1_portable(dst, src, count);
-     }
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-+        const __vector unsigned char alphas = vec_splats((unsigned char)0xFF);
-+        while (count >= 16) {
-+            __vector unsigned char grays = vec_xl(0, src);
-+
-+            // Replicate gray byte: gg = unpack(gray, gray) per 8-bit lane.
-+            __vector unsigned char gg_lo = vec_mergeh(grays, grays);
-+            __vector unsigned char gg_hi = vec_mergel(grays, grays);
-+            __vector unsigned char ga_lo = vec_mergeh(grays, alphas);
-+            __vector unsigned char ga_hi = vec_mergel(grays, alphas);
-+
-+            // Interleave g-pairs and ga-pairs at 16-bit granularity.
-+            __vector unsigned short ggga0 = vec_mergeh((__vector unsigned short)gg_lo,
-+                                                       (__vector unsigned short)ga_lo);
-+            __vector unsigned short ggga1 = vec_mergel((__vector unsigned short)gg_lo,
-+                                                       (__vector unsigned short)ga_lo);
-+            __vector unsigned short ggga2 = vec_mergeh((__vector unsigned short)gg_hi,
-+                                                       (__vector unsigned short)ga_hi);
-+            __vector unsigned short ggga3 = vec_mergel((__vector unsigned short)gg_hi,
-+                                                       (__vector unsigned short)ga_hi);
-+
-+            vec_xst((__vector unsigned char)ggga0, 0, (unsigned char*)(dst +  0));
-+            vec_xst((__vector unsigned char)ggga1, 0, (unsigned char*)(dst +  4));
-+            vec_xst((__vector unsigned char)ggga2, 0, (unsigned char*)(dst +  8));
-+            vec_xst((__vector unsigned char)ggga3, 0, (unsigned char*)(dst + 12));
-+
-+            src += 16;
-+            dst += 16;
-+            count -= 16;
-+        }
-+        gray_to_RGB1_portable(dst, src, count);
-+    }
- #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
-     /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-         const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
-@@ -1920,6 +2178,37 @@ static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count)
-         proc(dst, src, count);
-     }
- 
-+    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-+        insert_alpha_should_swaprb(false, dst, src, count);
-+    }
-+    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
-+        insert_alpha_should_swaprb(true, dst, src, count);
-+    }
-+#elif defined(SK_CPU_PPC) && defined(__VSX__)
-+    static void insert_alpha_should_swaprb(bool kSwapRB,
-+                                           uint32_t dst[], const uint8_t* src, int count) {
-+        // alphaMask = 0xFF000000 per 32-bit lane -> bytes (in LE memory layout) are
-+        // {00,00,00,FF, 00,00,00,FF, ...}.
-+        const __vector unsigned char alphaMask = (__vector unsigned char){
-+            0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF
-+        };
-+        // 'X' (= 0) is irrelevant: the alphaMask OR overwrites those lanes with FF.
-+        const __vector unsigned char expand = kSwapRB
-+            ? (__vector unsigned char){2,1,0,0, 5,4,3,0, 8,7,6,0, 11,10,9,0}
-+            : (__vector unsigned char){0,1,2,0, 3,4,5,0, 6,7,8,0,  9,10,11,0};
-+
-+        while (count >= 6) {
-+            __vector unsigned char rgb = vec_xl(0, src);
-+            __vector unsigned char rgba = vec_or(vec_perm(rgb, rgb, expand), alphaMask);
-+            vec_xst(rgba, 0, (unsigned char*)dst);
-+            src += 4*3;
-+            dst += 4;
-+            count -= 4;
-+        }
-+        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
-+        proc(dst, src, count);
-+    }
-+
-     void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
-         insert_alpha_should_swaprb(false, dst, src, count);
-     }
-
-base-commit: a8d530ac13f0ce7e937c047f01f0d36764f5d34e
--- 
-2.52.0
-

diff --git a/0002-Add-VSX-instructions-for-libwebp.patch b/0002-Add-VSX-instructions-for-libwebp.patch
deleted file mode 100644
index 1f857a7..0000000
--- a/0002-Add-VSX-instructions-for-libwebp.patch
+++ /dev/null
@@ -1,2524 +0,0 @@
-From b9e116898830a0f9edd1b0566651ce2d4989618d Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
-Date: Fri, 12 Jun 2026 15:30:13 +1000
-Subject: [PATCH 2/3] Add VSX instructions for libwebp
-
-Assisted-by: Lance Albertson <lance@osuosl.org>
-Assisted-by: Thushan Fernando <thushan@thushanfernando.com>
----
- media/libwebp/src/dsp/alpha_processing.c     |   6 +
- media/libwebp/src/dsp/alpha_processing_vsx.c | 246 +++++++
- media/libwebp/src/dsp/cpu.h                  |  14 +-
- media/libwebp/src/dsp/dec.c                  |   6 +
- media/libwebp/src/dsp/dec_vsx.c              | 737 +++++++++++++++++++
- media/libwebp/src/dsp/filters.c              |   6 +
- media/libwebp/src/dsp/filters_vsx.c          | 162 ++++
- media/libwebp/src/dsp/lossless.c             |   6 +
- media/libwebp/src/dsp/lossless_vsx.c         | 449 +++++++++++
- media/libwebp/src/dsp/moz.build              |  14 +
- media/libwebp/src/dsp/rescaler.c             |   6 +
- media/libwebp/src/dsp/rescaler_vsx.c         | 201 +++++
- media/libwebp/src/dsp/upsampling.c           |  12 +
- media/libwebp/src/dsp/upsampling_vsx.c       | 151 ++++
- media/libwebp/src/dsp/yuv.c                  |   6 +
- media/libwebp/src/dsp/yuv.h                  |  21 +
- media/libwebp/src/dsp/yuv_vsx.c              | 206 ++++++
- media/libwebp/src/moz/cpu.cpp                |   4 +
- 18 files changed, 2252 insertions(+), 1 deletion(-)
- create mode 100644 media/libwebp/src/dsp/alpha_processing_vsx.c
- create mode 100644 media/libwebp/src/dsp/dec_vsx.c
- create mode 100644 media/libwebp/src/dsp/filters_vsx.c
- create mode 100644 media/libwebp/src/dsp/lossless_vsx.c
- create mode 100644 media/libwebp/src/dsp/rescaler_vsx.c
- create mode 100644 media/libwebp/src/dsp/upsampling_vsx.c
- create mode 100644 media/libwebp/src/dsp/yuv_vsx.c
-
-diff --git a/media/libwebp/src/dsp/alpha_processing.c b/media/libwebp/src/dsp/alpha_processing.c
-index 4927e73e81bf..5f9152bf701a 100644
---- a/media/libwebp/src/dsp/alpha_processing.c
-+++ b/media/libwebp/src/dsp/alpha_processing.c
-@@ -434,6 +434,7 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void);
- extern void WebPInitAlphaProcessingSSE2(void);
- extern void WebPInitAlphaProcessingSSE41(void);
- extern void WebPInitAlphaProcessingNEON(void);
-+extern void WebPInitAlphaProcessingVSX(void);
- 
- WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
-   WebPMultARGBRow = WebPMultARGBRow_C;
-@@ -472,6 +473,11 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
-     if (VP8GetCPUInfo(kMIPSdspR2)) {
-       WebPInitAlphaProcessingMIPSdspR2();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      WebPInitAlphaProcessingVSX();
-+    }
- #endif
-   }
- 
-diff --git a/media/libwebp/src/dsp/alpha_processing_vsx.c b/media/libwebp/src/dsp/alpha_processing_vsx.c
-new file mode 100644
-index 000000000000..2aad1cd8b648
---- /dev/null
-+++ b/media/libwebp/src/dsp/alpha_processing_vsx.c
-@@ -0,0 +1,246 @@
-+// Copyright 2014 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of alpha processing functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX)
-+
-+#include <altivec.h>
-+#include <string.h>
-+
-+#include "src/dsp/cpu.h"
-+#include "src/webp/types.h"
-+
-+typedef __vector unsigned char u8x16;
-+typedef __vector unsigned short u16x8;
-+typedef __vector signed short i16x8;
-+typedef __vector unsigned int u32x4;
-+typedef __vector signed int i32x4;
-+
-+//------------------------------------------------------------------------------
-+// Alpha dispatch / extraction.
-+
-+static int DispatchAlpha_VSX(const uint8_t* WEBP_RESTRICT alpha,
-+                             int alpha_stride, int width, int height,
-+                             uint8_t* WEBP_RESTRICT dst, int dst_stride) {
-+  uint32_t alpha_and = 0xff;
-+  int i, j, k;
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u16x8 z16 = vec_splats((unsigned short)0);
-+  const u32x4 a_mask = vec_splats((uint32_t)0xff);  // selects the low byte
-+  u8x16 all_and = vec_splats((unsigned char)0xff);
-+  const int limit = width & ~15;
-+
-+  for (j = 0; j < height; ++j) {
-+    uint8_t* ptr = dst;
-+    for (i = 0; i < limit; i += 16) {
-+      const u8x16 a0 = vec_xl(0, (unsigned char*)&alpha[i]);
-+      // Spread the 16 alpha bytes to the low byte of 16 32-bit lanes.
-+      const u16x8 a1_lo = (u16x8)vec_mergeh(a0, zero);
-+      const u16x8 a1_hi = (u16x8)vec_mergel(a0, zero);
-+      const u32x4 s0 = (u32x4)vec_mergeh(a1_lo, z16);
-+      const u32x4 s1 = (u32x4)vec_mergel(a1_lo, z16);
-+      const u32x4 s2 = (u32x4)vec_mergeh(a1_hi, z16);
-+      const u32x4 s3 = (u32x4)vec_mergel(a1_hi, z16);
-+      const u32x4* spread[4] = {&s0, &s1, &s2, &s3};
-+      for (k = 0; k < 4; ++k) {
-+        const u32x4 d = vec_xl(0, (uint32_t*)(ptr + 16 * k));
-+        vec_xst(vec_sel(d, *spread[k], a_mask), 0, (uint32_t*)(ptr + 16 * k));
-+      }
-+      all_and = vec_and(all_and, a0);
-+      ptr += 64;
-+    }
-+    for (; i < width; ++i) {
-+      const uint32_t alpha_value = alpha[i];
-+      dst[4 * i] = alpha_value;
-+      alpha_and &= alpha_value;
-+    }
-+    alpha += alpha_stride;
-+    dst += dst_stride;
-+  }
-+  {
-+    unsigned char tmp[16];
-+    memcpy(tmp, &all_and, 16);
-+    for (k = 0; k < 16; ++k) alpha_and &= tmp[k];
-+  }
-+  return (alpha_and != 0xff);
-+}
-+
-+static void DispatchAlphaToGreen_VSX(const uint8_t* WEBP_RESTRICT alpha,
-+                                     int alpha_stride, int width, int height,
-+                                     uint32_t* WEBP_RESTRICT dst,
-+                                     int dst_stride) {
-+  int i, j;
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u16x8 z16 = vec_splats((unsigned short)0);
-+  const int limit = width & ~15;
-+  for (j = 0; j < height; ++j) {
-+    for (i = 0; i < limit; i += 16) {
-+      const u8x16 a0 = vec_xl(0, (unsigned char*)&alpha[i]);
-+      // Place each alpha byte into the green slot (<< 8) of a 32-bit lane.
-+      const u16x8 a1_lo = (u16x8)vec_mergeh(zero, a0);  // note the 'zero' first
-+      const u16x8 a1_hi = (u16x8)vec_mergel(zero, a0);
-+      const u32x4 g0 = (u32x4)vec_mergeh(a1_lo, z16);
-+      const u32x4 g1 = (u32x4)vec_mergel(a1_lo, z16);
-+      const u32x4 g2 = (u32x4)vec_mergeh(a1_hi, z16);
-+      const u32x4 g3 = (u32x4)vec_mergel(a1_hi, z16);
-+      vec_xst(g0, 0, &dst[i + 0]);
-+      vec_xst(g1, 0, &dst[i + 4]);
-+      vec_xst(g2, 0, &dst[i + 8]);
-+      vec_xst(g3, 0, &dst[i + 12]);
-+    }
-+    for (; i < width; ++i) dst[i] = alpha[i] << 8;
-+    alpha += alpha_stride;
-+    dst += dst_stride;
-+  }
-+}
-+
-+static int ExtractAlpha_VSX(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
-+                            int width, int height, uint8_t* WEBP_RESTRICT alpha,
-+                            int alpha_stride) {
-+  uint32_t alpha_and = 0xff;
-+  int i, j, k;
-+  const u32x4 a_mask = vec_splats((uint32_t)0xff);  // keeps the low byte
-+  u8x16 all_and = vec_splats((unsigned char)0xff);
-+  const int limit = width & ~7;
-+
-+  for (j = 0; j < height; ++j) {
-+    const uint32_t* src = (const uint32_t*)argb;
-+    for (i = 0; i < limit; i += 8) {
-+      const u32x4 a0 = vec_and(vec_xl(0, (uint32_t*)(src + 0)), a_mask);
-+      const u32x4 a1 = vec_and(vec_xl(0, (uint32_t*)(src + 4)), a_mask);
-+      const i16x8 c0 = vec_packs((i32x4)a0, (i32x4)a1);
-+      const u8x16 d0 = vec_packsu(c0, c0);  // 8 alpha bytes in the low half
-+      memcpy(&alpha[i], &d0, 8);
-+      all_and = vec_and(all_and, d0);
-+      src += 8;
-+    }
-+    for (; i < width; ++i) {
-+      const uint32_t alpha_value = argb[4 * i];
-+      alpha[i] = alpha_value;
-+      alpha_and &= alpha_value;
-+    }
-+    argb += argb_stride;
-+    alpha += alpha_stride;
-+  }
-+  {
-+    unsigned char tmp[16];
-+    memcpy(tmp, &all_and, 16);
-+    for (k = 0; k < 8; ++k) alpha_and &= tmp[k];
-+  }
-+  return (alpha_and == 0xff);
-+}
-+
-+static void ExtractGreen_VSX(const uint32_t* WEBP_RESTRICT argb,
-+                             uint8_t* WEBP_RESTRICT alpha, int size) {
-+  int i;
-+  const u32x4 mask = vec_splats((uint32_t)0xff);
-+  const u32x4 sh8 = vec_splats((uint32_t)8);
-+  for (i = 0; i + 16 <= size; i += 16) {
-+    const u32x4 a0 =
-+        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 0)), sh8), mask);
-+    const u32x4 a1 =
-+        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 4)), sh8), mask);
-+    const u32x4 a2 =
-+        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 8)), sh8), mask);
-+    const u32x4 a3 =
-+        vec_and(vec_sr(vec_xl(0, (uint32_t*)(argb + i + 12)), sh8), mask);
-+    const i16x8 d0 = vec_packs((i32x4)a0, (i32x4)a1);
-+    const i16x8 d1 = vec_packs((i32x4)a2, (i32x4)a3);
-+    const u8x16 e = vec_packsu(d0, d1);
-+    vec_xst(e, 0, &alpha[i]);
-+  }
-+  for (; i < size; ++i) alpha[i] = argb[i] >> 8;
-+}
-+
-+//------------------------------------------------------------------------------
-+// Premultiply.
-+
-+#define MULTIPLIER(a) ((a) * 32897U)
-+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
-+
-+// Spreads the alpha lane across r/g/b and inserts 0xff in the alpha lane, for
-+// the two pixels packed in a 16-bit-per-channel vector. Built against the
-+// little-endian byte order; src is the channel vector, the second operand is
-+// an all-0xff vector.
-+static const u8x16 kSpreadAlphaLast = {6,  7,  6,  7,  6,  7,  16, 7,
-+                                       14, 15, 14, 15, 14, 15, 16, 15};
-+static const u8x16 kSpreadAlphaFirst = {16, 1, 0, 1, 0, 1, 0, 1,
-+                                        16, 9, 8, 9, 8, 9, 8, 9};
-+
-+static WEBP_INLINE u16x8 MulHi16(u16x8 a, u16x8 b) {
-+  const u32x4 sh = vec_splats((unsigned int)16);
-+  const u32x4 e = vec_sr(vec_mule(a, b), sh);
-+  const u32x4 o = vec_sr(vec_mulo(a, b), sh);
-+  return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
-+}
-+
-+static void ApplyAlphaMultiply_VSX(uint8_t* rgba, int alpha_first, int w, int h,
-+                                   int stride) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u8x16 allff = vec_splats((unsigned char)0xff);
-+  const u16x8 z16 = vec_splats((unsigned short)0);
-+  const u16x8 kMult = vec_splats((unsigned short)0x8081);
-+  const u16x8 sh7 = vec_splats((unsigned short)7);
-+  const u8x16 ctrl = alpha_first ? kSpreadAlphaFirst : kSpreadAlphaLast;
-+  const int kSpan = 4;
-+  while (h-- > 0) {
-+    uint8_t* const rgbx = rgba;
-+    int i;
-+    for (i = 0; i + kSpan <= w; i += kSpan) {
-+      const u8x16 argb0 = vec_xl(0, (unsigned char*)(rgbx + 4 * i));
-+      const u16x8 lo = (u16x8)vec_mergeh(argb0, zero);
-+      const u16x8 hi = (u16x8)vec_mergel(argb0, zero);
-+      const u16x8 a_lo = (u16x8)vec_perm((u8x16)lo, allff, ctrl);
-+      const u16x8 a_hi = (u16x8)vec_perm((u8x16)hi, allff, ctrl);
-+      const u16x8 A0lo = vec_mladd(a_lo, lo, z16);
-+      const u16x8 A0hi = vec_mladd(a_hi, hi, z16);
-+      const u16x8 A2lo = vec_sr(MulHi16(A0lo, kMult), sh7);
-+      const u16x8 A2hi = vec_sr(MulHi16(A0hi, kMult), sh7);
-+      const u8x16 out = vec_packsu((i16x8)A2lo, (i16x8)A2hi);
-+      vec_xst(out, 0, (unsigned char*)(rgbx + 4 * i));
-+    }
-+    // Finish with left-overs.
-+    for (; i < w; ++i) {
-+      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
-+      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
-+      const uint32_t a = alpha[4 * i];
-+      if (a != 0xff) {
-+        const uint32_t mult = MULTIPLIER(a);
-+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
-+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
-+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
-+      }
-+    }
-+    rgba += stride;
-+  }
-+}
-+
-+#undef MULTIPLIER
-+#undef PREMULTIPLY
-+
-+//------------------------------------------------------------------------------
-+
-+extern void WebPInitAlphaProcessingVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingVSX(void) {
-+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_VSX;
-+  WebPDispatchAlpha = DispatchAlpha_VSX;
-+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_VSX;
-+  WebPExtractAlpha = ExtractAlpha_VSX;
-+  WebPExtractGreen = ExtractGreen_VSX;
-+}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/dsp/cpu.h b/media/libwebp/src/dsp/cpu.h
-index 17c4db971c7f..d1d4b3127c84 100644
---- a/media/libwebp/src/dsp/cpu.h
-+++ b/media/libwebp/src/dsp/cpu.h
-@@ -154,6 +154,17 @@
- #define WEBP_USE_MSA
- #endif
- 
-+//------------------------------------------------------------------------------
-+// PPC64 / Power VSX (ISA 2.07 / POWER8 baseline).
-+
-+#if defined(__powerpc64__) && defined(__VSX__)
-+#define WEBP_USE_VSX
-+#endif
-+
-+#if defined(WEBP_USE_VSX) && !defined(WEBP_HAVE_VSX)
-+#define WEBP_HAVE_VSX
-+#endif
-+
- //------------------------------------------------------------------------------
- 
- #ifndef WEBP_DSP_OMIT_C_CODE
-@@ -308,7 +319,8 @@ typedef enum {
-   kNEON,
-   kMIPS32,
-   kMIPSdspR2,
--  kMSA
-+  kMSA,
-+  kVSX
- } CPUFeature;
- 
- // returns true if the CPU supports the feature.
-diff --git a/media/libwebp/src/dsp/dec.c b/media/libwebp/src/dsp/dec.c
-index 4f38309980ce..f34276ba7316 100644
---- a/media/libwebp/src/dsp/dec.c
-+++ b/media/libwebp/src/dsp/dec.c
-@@ -752,6 +752,7 @@ extern void VP8DspInitNEON(void);
- extern void VP8DspInitMIPS32(void);
- extern void VP8DspInitMIPSdspR2(void);
- extern void VP8DspInitMSA(void);
-+extern void VP8DspInitVSX(void);
- 
- WEBP_DSP_INIT_FUNC(VP8DspInit) {
-   VP8InitClipTables();
-@@ -843,6 +844,11 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
-     if (VP8GetCPUInfo(kMSA)) {
-       VP8DspInitMSA();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      VP8DspInitVSX();
-+    }
- #endif
-   }
- 
-diff --git a/media/libwebp/src/dsp/dec_vsx.c b/media/libwebp/src/dsp/dec_vsx.c
-new file mode 100644
-index 000000000000..e0c1cbc3b71b
---- /dev/null
-+++ b/media/libwebp/src/dsp/dec_vsx.c
-@@ -0,0 +1,737 @@
-+// Copyright 2011 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of decoding functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX)
-+
-+#include <altivec.h>
-+#include <string.h>
-+
-+typedef __vector signed short     i16x8;
-+typedef __vector unsigned short   u16x8;
-+typedef __vector signed int       i32x4;
-+typedef __vector unsigned int     u32x4;
-+typedef __vector unsigned char    u8x16;
-+typedef __vector signed char      i8x16;
-+typedef __vector signed long long i64x2;
-+
-+// Signed multiply-high of packed 16-bit lanes (POWER8 has no vmulhsh).
-+static WEBP_INLINE i16x8 MulHi16_S(i16x8 a, i16x8 b) {
-+  const u32x4 sh = vec_splats((unsigned int)16);
-+  const i32x4 e = vec_sra(vec_mule(a, b), sh);
-+  const i32x4 o = vec_sra(vec_mulo(a, b), sh);
-+  return (i16x8)vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
-+}
-+
-+// Transpose two interleaved 4x4 blocks of 16-bit values.
-+static WEBP_INLINE void Transpose2_4x4(i16x8 in0, i16x8 in1, i16x8 in2,
-+                                       i16x8 in3, i16x8* out0, i16x8* out1,
-+                                       i16x8* out2, i16x8* out3) {
-+  const i16x8 t0 = (i16x8)vec_mergeh(in0, in1);
-+  const i16x8 t1 = (i16x8)vec_mergeh(in2, in3);
-+  const i16x8 t2 = (i16x8)vec_mergel(in0, in1);
-+  const i16x8 t3 = (i16x8)vec_mergel(in2, in3);
-+  const i32x4 u0 = vec_mergeh((i32x4)t0, (i32x4)t1);
-+  const i32x4 u1 = vec_mergeh((i32x4)t2, (i32x4)t3);
-+  const i32x4 u2 = vec_mergel((i32x4)t0, (i32x4)t1);
-+  const i32x4 u3 = vec_mergel((i32x4)t2, (i32x4)t3);
-+  *out0 = (i16x8)vec_mergeh((i64x2)u0, (i64x2)u1);
-+  *out1 = (i16x8)vec_mergel((i64x2)u0, (i64x2)u1);
-+  *out2 = (i16x8)vec_mergeh((i64x2)u2, (i64x2)u3);
-+  *out3 = (i16x8)vec_mergel((i64x2)u2, (i64x2)u3);
-+}
-+
-+// Bounded 4-coefficient load into the low half of a 16-bit vector.
-+static WEBP_INLINE i16x8 Load4Coeffs(const int16_t* WEBP_RESTRICT p) {
-+  int16_t tmp[8] = {0};
-+  memcpy(tmp, p, 4 * sizeof(int16_t));
-+  return *(const i16x8*)tmp;
-+}
-+
-+// Bounded load of n pixels, zero-extended to 16-bit lanes.
-+static WEBP_INLINE i16x8 LoadDst(const uint8_t* WEBP_RESTRICT p, int n) {
-+  unsigned char tmp[16] = {0};
-+  memcpy(tmp, p, n);
-+  return (i16x8)vec_mergeh(vec_xl(0, tmp), vec_splats((unsigned char)0));
-+}
-+
-+static void Transform_VSX(const int16_t* WEBP_RESTRICT in,
-+                          uint8_t* WEBP_RESTRICT dst, int do_two) {
-+  const i16x8 k1 = vec_splats((short)20091);
-+  const i16x8 k2 = vec_splats((short)-30068);
-+  const u16x8 three = vec_splats((unsigned short)3);
-+  i16x8 in0 = Load4Coeffs(in + 0), in1 = Load4Coeffs(in + 4);
-+  i16x8 in2 = Load4Coeffs(in + 8), in3 = Load4Coeffs(in + 12);
-+  i16x8 T0, T1, T2, T3;
-+
-+  if (do_two) {
-+    in0 = (i16x8)vec_mergeh((i64x2)in0, (i64x2)Load4Coeffs(in + 16));
-+    in1 = (i16x8)vec_mergeh((i64x2)in1, (i64x2)Load4Coeffs(in + 20));
-+    in2 = (i16x8)vec_mergeh((i64x2)in2, (i64x2)Load4Coeffs(in + 24));
-+    in3 = (i16x8)vec_mergeh((i64x2)in3, (i64x2)Load4Coeffs(in + 28));
-+  }
-+
-+  {  // Vertical pass + transpose.
-+    const i16x8 a = vec_add(in0, in2);
-+    const i16x8 b = vec_sub(in0, in2);
-+    const i16x8 c = vec_add(vec_sub(in1, in3),
-+                            vec_sub(MulHi16_S(in1, k2), MulHi16_S(in3, k1)));
-+    const i16x8 d = vec_add(vec_add(in1, in3),
-+                            vec_add(MulHi16_S(in1, k1), MulHi16_S(in3, k2)));
-+    Transpose2_4x4(vec_add(a, d), vec_add(b, c), vec_sub(b, c), vec_sub(a, d),
-+                   &T0, &T1, &T2, &T3);
-+  }
-+  {  // Horizontal pass + transpose.
-+    const i16x8 dc = vec_add(T0, vec_splats((short)4));
-+    const i16x8 a = vec_add(dc, T2);
-+    const i16x8 b = vec_sub(dc, T2);
-+    const i16x8 c = vec_add(vec_sub(T1, T3),
-+                            vec_sub(MulHi16_S(T1, k2), MulHi16_S(T3, k1)));
-+    const i16x8 d = vec_add(vec_add(T1, T3),
-+                            vec_add(MulHi16_S(T1, k1), MulHi16_S(T3, k2)));
-+    const i16x8 s0 = vec_sra(vec_add(a, d), three);
-+    const i16x8 s1 = vec_sra(vec_add(b, c), three);
-+    const i16x8 s2 = vec_sra(vec_sub(b, c), three);
-+    const i16x8 s3 = vec_sra(vec_sub(a, d), three);
-+    Transpose2_4x4(s0, s1, s2, s3, &T0, &T1, &T2, &T3);
-+  }
-+  {  // Add to the reference pixels and store with saturation.
-+    const int n = do_two ? 8 : 4;
-+    const i16x8 d0 = LoadDst(dst + 0 * BPS, n);
-+    const i16x8 d1 = LoadDst(dst + 1 * BPS, n);
-+    const i16x8 d2 = LoadDst(dst + 2 * BPS, n);
-+    const i16x8 d3 = LoadDst(dst + 3 * BPS, n);
-+    const u8x16 r0 = vec_packsu(vec_add(d0, T0), vec_add(d0, T0));
-+    const u8x16 r1 = vec_packsu(vec_add(d1, T1), vec_add(d1, T1));
-+    const u8x16 r2 = vec_packsu(vec_add(d2, T2), vec_add(d2, T2));
-+    const u8x16 r3 = vec_packsu(vec_add(d3, T3), vec_add(d3, T3));
-+    unsigned char b0[16], b1[16], b2[16], b3[16];
-+    memcpy(b0, &r0, 16); memcpy(b1, &r1, 16);
-+    memcpy(b2, &r2, 16); memcpy(b3, &r3, 16);
-+    memcpy(dst + 0 * BPS, b0, n); memcpy(dst + 1 * BPS, b1, n);
-+    memcpy(dst + 2 * BPS, b2, n); memcpy(dst + 3 * BPS, b3, n);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// Simple in-loop edge filtering.
-+
-+#define ABSU(p, q) vec_or(vec_subs((u8x16)(q), (u8x16)(p)), \
-+                          vec_subs((u8x16)(p), (u8x16)(q)))
-+
-+// Per-byte signed arithmetic >>3, packed with saturation.
-+static WEBP_INLINE i8x16 SignedShift3(i8x16 x) {
-+  const u8x16 z = vec_splats((unsigned char)0);
-+  const u16x8 sh = vec_splats((unsigned short)(3 + 8));
-+  const i16x8 lo = vec_sra((i16x8)vec_mergeh(z, (u8x16)x), sh);
-+  const i16x8 hi = vec_sra((i16x8)vec_mergel(z, (u8x16)x), sh);
-+  return (i8x16)vec_packs(lo, hi);
-+}
-+
-+static WEBP_INLINE void DoFilter2_VSX(u8x16* WEBP_RESTRICT p1,
-+                                      u8x16* WEBP_RESTRICT p0,
-+                                      u8x16* WEBP_RESTRICT q0,
-+                                      u8x16* WEBP_RESTRICT q1, int thresh) {
-+  const u8x16 sign = vec_splats((unsigned char)0x80);
-+  const u8x16 t1 = ABSU(*p1, *q1);
-+  const u8x16 t2 = vec_and(t1, vec_splats((unsigned char)0xFE));
-+  const u8x16 t3 = (u8x16)vec_sr((u16x8)t2, vec_splats((unsigned short)1));
-+  const u8x16 t4 = ABSU(*p0, *q0);
-+  const u8x16 t6 = vec_adds(vec_adds(t4, t4), t3);
-+  const u8x16 t7 = vec_subs(t6, vec_splats((unsigned char)thresh));
-+  const u8x16 mask = (u8x16)vec_cmpeq(t7, vec_splats((unsigned char)0));
-+
-+  const i8x16 p1s = (i8x16)vec_xor(*p1, sign);
-+  const i8x16 q1s = (i8x16)vec_xor(*q1, sign);
-+  i8x16 P0 = (i8x16)vec_xor(*p0, sign);
-+  i8x16 Q0 = (i8x16)vec_xor(*q0, sign);
-+
-+  const i8x16 d0 = vec_subs(Q0, P0);
-+  const i8x16 s1 = vec_adds(vec_subs(p1s, q1s), d0);
-+  i8x16 a = vec_adds(d0, vec_adds(d0, s1));
-+  a = vec_and(a, (i8x16)mask);
-+  const i8x16 v3 = SignedShift3(vec_adds(a, vec_splats((signed char)3)));
-+  const i8x16 v4 = SignedShift3(vec_adds(a, vec_splats((signed char)4)));
-+  Q0 = vec_subs(Q0, v4);
-+  P0 = vec_adds(P0, v3);
-+  *p0 = vec_xor((u8x16)P0, sign);
-+  *q0 = vec_xor((u8x16)Q0, sign);
-+}
-+
-+static void SimpleVFilter16_VSX(uint8_t* p, int stride, int thresh) {
-+  u8x16 p1 = vec_xl(0, p - 2 * stride);
-+  u8x16 p0 = vec_xl(0, p - stride);
-+  u8x16 q0 = vec_xl(0, p);
-+  u8x16 q1 = vec_xl(0, p + stride);
-+  DoFilter2_VSX(&p1, &p0, &q0, &q1, thresh);
-+  vec_xst(p0, 0, p - stride);
-+  vec_xst(q0, 0, p);
-+}
-+
-+static void SimpleVFilter16i_VSX(uint8_t* p, int stride, int thresh) {
-+  int k;
-+  for (k = 3; k > 0; --k) {
-+    p += 4 * stride;
-+    SimpleVFilter16_VSX(p, stride, thresh);
-+  }
-+}
-+
-+// Transpose four columns out of / into 16 rows for horizontal-edge filtering.
-+static WEBP_INLINE void Load8x4(const uint8_t* WEBP_RESTRICT b, int s,
-+                                u8x16* WEBP_RESTRICT p, u8x16* WEBP_RESTRICT q) {
-+  uint32_t a0[4], a1[4];
-+  memcpy(&a0[0], b + 0 * s, 4); memcpy(&a0[1], b + 4 * s, 4);
-+  memcpy(&a0[2], b + 2 * s, 4); memcpy(&a0[3], b + 6 * s, 4);
-+  memcpy(&a1[0], b + 1 * s, 4); memcpy(&a1[1], b + 5 * s, 4);
-+  memcpy(&a1[2], b + 3 * s, 4); memcpy(&a1[3], b + 7 * s, 4);
-+  const u8x16 A0 = vec_xl(0, (unsigned char*)a0);
-+  const u8x16 A1 = vec_xl(0, (unsigned char*)a1);
-+  const u8x16 B0 = vec_mergeh(A0, A1), B1 = vec_mergel(A0, A1);
-+  const u16x8 C0 = vec_mergeh((u16x8)B0, (u16x8)B1);
-+  const u16x8 C1 = vec_mergel((u16x8)B0, (u16x8)B1);
-+  *p = (u8x16)vec_mergeh((u32x4)C0, (u32x4)C1);
-+  *q = (u8x16)vec_mergel((u32x4)C0, (u32x4)C1);
-+}
-+
-+static WEBP_INLINE void Load16x4(const uint8_t* WEBP_RESTRICT r0,
-+                                 const uint8_t* WEBP_RESTRICT r8, int s,
-+                                 u8x16* p1, u8x16* p0, u8x16* q0, u8x16* q1) {
-+  Load8x4(r0, s, p1, q0);
-+  Load8x4(r8, s, p0, q1);
-+  const u8x16 t1 = *p1, t2 = *q0;
-+  *p1 = (u8x16)vec_mergeh((i64x2)t1, (i64x2)*p0);
-+  *p0 = (u8x16)vec_mergel((i64x2)t1, (i64x2)*p0);
-+  *q0 = (u8x16)vec_mergeh((i64x2)t2, (i64x2)*q1);
-+  *q1 = (u8x16)vec_mergel((i64x2)t2, (i64x2)*q1);
-+}
-+
-+static WEBP_INLINE void Store4x4(u8x16 x, uint8_t* WEBP_RESTRICT dst, int s) {
-+  unsigned char b[16];
-+  int i;
-+  memcpy(b, &x, 16);
-+  for (i = 0; i < 4; ++i) memcpy(dst + i * s, b + 4 * i, 4);
-+}
-+
-+static WEBP_INLINE void Store16x4(u8x16 p1, u8x16 p0, u8x16 q0, u8x16 q1,
-+                                  uint8_t* WEBP_RESTRICT r0,
-+                                  uint8_t* WEBP_RESTRICT r8, int s) {
-+  u8x16 t = p0;
-+  u8x16 p0s = vec_mergeh(p1, t), p1s = vec_mergel(p1, t);
-+  t = q0;
-+  u8x16 q0s = vec_mergeh(t, q1), q1s = vec_mergel(t, q1);
-+  t = p0s;
-+  p0s = (u8x16)vec_mergeh((u16x8)t, (u16x8)q0s);
-+  q0s = (u8x16)vec_mergel((u16x8)t, (u16x8)q0s);
-+  t = p1s;
-+  p1s = (u8x16)vec_mergeh((u16x8)t, (u16x8)q1s);
-+  q1s = (u8x16)vec_mergel((u16x8)t, (u16x8)q1s);
-+  Store4x4(p0s, r0, s); Store4x4(q0s, r0 + 4 * s, s);
-+  Store4x4(p1s, r8, s); Store4x4(q1s, r8 + 4 * s, s);
-+}
-+
-+static void SimpleHFilter16_VSX(uint8_t* p, int stride, int thresh) {
-+  u8x16 p1, p0, q0, q1;
-+  p -= 2;  // beginning of p1
-+  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
-+  DoFilter2_VSX(&p1, &p0, &q0, &q1, thresh);
-+  Store16x4(p1, p0, q0, q1, p, p + 8 * stride, stride);
-+}
-+
-+static void SimpleHFilter16i_VSX(uint8_t* p, int stride, int thresh) {
-+  int k;
-+  for (k = 3; k > 0; --k) {
-+    p += 4;
-+    SimpleHFilter16_VSX(p, stride, thresh);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// Complex in-loop edge filtering (vertical/luma).
-+
-+static const u8x16 kSignBit = {
-+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-+    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-+#define FLIPB(x) ((x) = (i8x16)vec_xor((u8x16)(x), kSignBit))
-+
-+static WEBP_INLINE u8x16 GetNotHEV(u8x16 p1, u8x16 p0, u8x16 q0, u8x16 q1,
-+                                   int hev_thresh) {
-+  const u8x16 d = vec_subs(vec_max(ABSU(p1, p0), ABSU(q1, q0)),
-+                           vec_splats((unsigned char)hev_thresh));
-+  return (u8x16)vec_cmpeq(d, vec_splats((unsigned char)0));
-+}
-+
-+static WEBP_INLINE i8x16 GetBaseDelta(i8x16 p1, i8x16 p0, i8x16 q0, i8x16 q1) {
-+  const i8x16 d = vec_subs(q0, p0);
-+  const i8x16 s1 = vec_adds(vec_subs(p1, q1), d);
-+  return vec_adds(d, vec_adds(d, s1));
-+}
-+
-+static WEBP_INLINE void DoSimpleFilterS(i8x16* p0, i8x16* q0, i8x16 f) {
-+  *q0 = vec_subs(*q0, SignedShift3(vec_adds(f, vec_splats((signed char)4))));
-+  *p0 = vec_adds(*p0, SignedShift3(vec_adds(f, vec_splats((signed char)3))));
-+}
-+
-+static WEBP_INLINE void Update2Pixels(i8x16* pi, i8x16* qi, i16x8 lo, i16x8 hi) {
-+  const u16x8 s7 = vec_splats((unsigned short)7);
-+  const i8x16 d = (i8x16)vec_packs(vec_sra(lo, s7), vec_sra(hi, s7));
-+  *pi = vec_adds(*pi, d);
-+  *qi = vec_subs(*qi, d);
-+  FLIPB(*pi);
-+  FLIPB(*qi);
-+}
-+
-+// mask = (max inner abs-diff <= ithresh) && NeedsFilter(thresh).
-+static WEBP_INLINE u8x16 ComplexMask(u8x16 p3, u8x16 p2, u8x16 p1, u8x16 p0,
-+                                     u8x16 q0, u8x16 q1, u8x16 q2, u8x16 q3,
-+                                     int thresh, int ithresh) {
-+  u8x16 m = ABSU(p1, p0);
-+  m = vec_max(m, ABSU(p3, p2));
-+  m = vec_max(m, ABSU(p2, p1));
-+  m = vec_max(m, ABSU(q1, q0));
-+  m = vec_max(m, ABSU(q3, q2));
-+  m = vec_max(m, ABSU(q2, q1));
-+  const u8x16 tm = (u8x16)vec_cmpeq(
-+      vec_subs(m, vec_splats((unsigned char)ithresh)),
-+      vec_splats((unsigned char)0));
-+  const u8x16 t2 = vec_and(ABSU(p1, q1), vec_splats((unsigned char)0xFE));
-+  const u8x16 t3 = (u8x16)vec_sr((u16x8)t2, vec_splats((unsigned short)1));
-+  const u8x16 t6 = vec_adds(vec_adds(ABSU(p0, q0), ABSU(p0, q0)), t3);
-+  const u8x16 fm = (u8x16)vec_cmpeq(
-+      vec_subs(t6, vec_splats((unsigned char)thresh)),
-+      vec_splats((unsigned char)0));
-+  return vec_and(tm, fm);
-+}
-+
-+static WEBP_INLINE void DoFilter4(u8x16* p1u, u8x16* p0u, u8x16* q0u,
-+                                  u8x16* q1u, u8x16 mask, int hev_thresh) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u8x16 not_hev = GetNotHEV(*p1u, *p0u, *q0u, *q1u, hev_thresh);
-+  i8x16 p1 = (i8x16)vec_xor(*p1u, kSignBit), p0 = (i8x16)vec_xor(*p0u, kSignBit);
-+  i8x16 q0 = (i8x16)vec_xor(*q0u, kSignBit), q1 = (i8x16)vec_xor(*q1u, kSignBit);
-+  i8x16 t1 = vec_andc(vec_subs(p1, q1), (i8x16)not_hev);
-+  const i8x16 t2 = vec_subs(q0, p0);
-+  t1 = vec_adds(t1, t2); t1 = vec_adds(t1, t2); t1 = vec_adds(t1, t2);
-+  t1 = vec_and(t1, (i8x16)mask);
-+  const i8x16 a3 = SignedShift3(vec_adds(t1, vec_splats((signed char)4)));
-+  p0 = vec_adds(p0, SignedShift3(vec_adds(t1, vec_splats((signed char)3))));
-+  q0 = vec_subs(q0, a3);
-+  FLIPB(p0); FLIPB(q0);
-+  const i8x16 t = vec_add(a3, (i8x16)kSignBit);
-+  i8x16 t3 = vec_sub((i8x16)vec_avg((u8x16)t, zero), vec_splats((signed char)64));
-+  t3 = vec_and((i8x16)not_hev, t3);
-+  q1 = vec_subs(q1, t3); p1 = vec_adds(p1, t3);
-+  FLIPB(p1); FLIPB(q1);
-+  *p1u = (u8x16)p1; *p0u = (u8x16)p0; *q0u = (u8x16)q0; *q1u = (u8x16)q1;
-+}
-+
-+static WEBP_INLINE void DoFilter6(u8x16* p2u, u8x16* p1u, u8x16* p0u,
-+                                  u8x16* q0u, u8x16* q1u, u8x16* q2u,
-+                                  u8x16 mask, int hev_thresh) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u8x16 not_hev = GetNotHEV(*p1u, *p0u, *q0u, *q1u, hev_thresh);
-+  i8x16 p2 = (i8x16)vec_xor(*p2u, kSignBit), p1 = (i8x16)vec_xor(*p1u, kSignBit);
-+  i8x16 p0 = (i8x16)vec_xor(*p0u, kSignBit), q0 = (i8x16)vec_xor(*q0u, kSignBit);
-+  i8x16 q1 = (i8x16)vec_xor(*q1u, kSignBit), q2 = (i8x16)vec_xor(*q2u, kSignBit);
-+  const i8x16 a = GetBaseDelta(p1, p0, q0, q1);
-+  {  // hev pixels: simple filter
-+    const i8x16 f = vec_and(a, (i8x16)vec_andc(mask, not_hev));
-+    DoSimpleFilterS(&p0, &q0, f);
-+  }
-+  {  // non-hev pixels: strong filter
-+    const i8x16 f = vec_and(a, vec_and((i8x16)not_hev, (i8x16)mask));
-+    const i16x8 k9 = vec_splats((short)0x0900), k63 = vec_splats((short)63);
-+    const i16x8 f9lo = MulHi16_S((i16x8)vec_mergeh(zero, (u8x16)f), k9);
-+    const i16x8 f9hi = MulHi16_S((i16x8)vec_mergel(zero, (u8x16)f), k9);
-+    const i16x8 a2lo = vec_add(f9lo, k63), a2hi = vec_add(f9hi, k63);
-+    const i16x8 a1lo = vec_add(a2lo, f9lo), a1hi = vec_add(a2hi, f9hi);
-+    const i16x8 a0lo = vec_add(a1lo, f9lo), a0hi = vec_add(a1hi, f9hi);
-+    Update2Pixels(&p2, &q2, a2lo, a2hi);
-+    Update2Pixels(&p1, &q1, a1lo, a1hi);
-+    Update2Pixels(&p0, &q0, a0lo, a0hi);
-+  }
-+  *p2u = (u8x16)p2; *p1u = (u8x16)p1; *p0u = (u8x16)p0;
-+  *q0u = (u8x16)q0; *q1u = (u8x16)q1; *q2u = (u8x16)q2;
-+}
-+
-+static void VFilter16_VSX(uint8_t* p, int s, int thresh, int ithresh,
-+                          int hev_thresh) {
-+  u8x16 p3 = vec_xl(0, p - 4 * s), p2 = vec_xl(0, p - 3 * s);
-+  u8x16 p1 = vec_xl(0, p - 2 * s), p0 = vec_xl(0, p - s);
-+  u8x16 q0 = vec_xl(0, p), q1 = vec_xl(0, p + s);
-+  u8x16 q2 = vec_xl(0, p + 2 * s), q3 = vec_xl(0, p + 3 * s);
-+  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
-+  vec_xst(p2, 0, p - 3 * s); vec_xst(p1, 0, p - 2 * s); vec_xst(p0, 0, p - s);
-+  vec_xst(q0, 0, p); vec_xst(q1, 0, p + s); vec_xst(q2, 0, p + 2 * s);
-+}
-+
-+static void VFilter16i_VSX(uint8_t* p, int s, int thresh, int ithresh,
-+                           int hev_thresh) {
-+  int k;
-+  for (k = 3; k > 0; --k) {
-+    p += 4 * s;
-+    u8x16 p3 = vec_xl(0, p - 4 * s), p2 = vec_xl(0, p - 3 * s);
-+    u8x16 p1 = vec_xl(0, p - 2 * s), p0 = vec_xl(0, p - s);
-+    u8x16 q0 = vec_xl(0, p), q1 = vec_xl(0, p + s);
-+    u8x16 q2 = vec_xl(0, p + 2 * s), q3 = vec_xl(0, p + 3 * s);
-+    const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+    DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
-+    vec_xst(p1, 0, p - 2 * s); vec_xst(p0, 0, p - s);
-+    vec_xst(q0, 0, p); vec_xst(q1, 0, p + s);
-+  }
-+}
-+
-+// Complex horizontal luma: two 16x4 transposes around the vertical edge feed
-+// the same DoFilter4/DoFilter6 used by the vertical variants.
-+static void HFilter16_VSX(uint8_t* p, int s, int thresh, int ithresh,
-+                          int hev_thresh) {
-+  uint8_t* const b = p - 4;
-+  u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
-+  Load16x4(b, b + 8 * s, s, &p3, &p2, &p1, &p0);
-+  Load16x4(p, p + 8 * s, s, &q0, &q1, &q2, &q3);
-+  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
-+  Store16x4(p3, p2, p1, p0, b, b + 8 * s, s);
-+  Store16x4(q0, q1, q2, q3, p, p + 8 * s, s);
-+}
-+
-+static void HFilter16i_VSX(uint8_t* p, int s, int thresh, int ithresh,
-+                           int hev_thresh) {
-+  int k;
-+  for (k = 3; k > 0; --k) {
-+    p += 4;
-+    u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
-+    Load16x4(p - 4, p - 4 + 8 * s, s, &p3, &p2, &p1, &p0);
-+    Load16x4(p, p + 8 * s, s, &q0, &q1, &q2, &q3);
-+    const u8x16 m =
-+        ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+    DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
-+    Store16x4(p1, p0, q0, q1, p - 2, p - 2 + 8 * s, s);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// Complex chroma filtering: operate on the u and v planes (8 wide) together.
-+
-+// Pack 8 u-bytes into the low half and 8 v-bytes into the high half.
-+static WEBP_INLINE u8x16 LoadUV(const uint8_t* WEBP_RESTRICT u,
-+                                const uint8_t* WEBP_RESTRICT v) {
-+  unsigned char b[16];
-+  memcpy(b, u, 8);
-+  memcpy(b + 8, v, 8);
-+  return vec_xl(0, b);
-+}
-+
-+static WEBP_INLINE void StoreUV(u8x16 x, uint8_t* WEBP_RESTRICT u,
-+                                uint8_t* WEBP_RESTRICT v) {
-+  unsigned char b[16];
-+  memcpy(b, &x, 16);
-+  memcpy(u, b, 8);
-+  memcpy(v, b + 8, 8);
-+}
-+
-+static void VFilter8_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-+                         int s, int thresh, int ithresh, int hev_thresh) {
-+  u8x16 p3 = LoadUV(u - 4 * s, v - 4 * s), p2 = LoadUV(u - 3 * s, v - 3 * s);
-+  u8x16 p1 = LoadUV(u - 2 * s, v - 2 * s), p0 = LoadUV(u - s, v - s);
-+  u8x16 q0 = LoadUV(u, v), q1 = LoadUV(u + s, v + s);
-+  u8x16 q2 = LoadUV(u + 2 * s, v + 2 * s), q3 = LoadUV(u + 3 * s, v + 3 * s);
-+  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
-+  StoreUV(p2, u - 3 * s, v - 3 * s); StoreUV(p1, u - 2 * s, v - 2 * s);
-+  StoreUV(p0, u - s, v - s); StoreUV(q0, u, v);
-+  StoreUV(q1, u + s, v + s); StoreUV(q2, u + 2 * s, v + 2 * s);
-+}
-+
-+static void VFilter8i_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-+                          int s, int thresh, int ithresh, int hev_thresh) {
-+  u += 4 * s; v += 4 * s;
-+  u8x16 p3 = LoadUV(u - 4 * s, v - 4 * s), p2 = LoadUV(u - 3 * s, v - 3 * s);
-+  u8x16 p1 = LoadUV(u - 2 * s, v - 2 * s), p0 = LoadUV(u - s, v - s);
-+  u8x16 q0 = LoadUV(u, v), q1 = LoadUV(u + s, v + s);
-+  u8x16 q2 = LoadUV(u + 2 * s, v + 2 * s), q3 = LoadUV(u + 3 * s, v + 3 * s);
-+  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+  DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
-+  StoreUV(p1, u - 2 * s, v - 2 * s); StoreUV(p0, u - s, v - s);
-+  StoreUV(q0, u, v); StoreUV(q1, u + s, v + s);
-+}
-+
-+static void HFilter8_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-+                         int s, int thresh, int ithresh, int hev_thresh) {
-+  u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
-+  Load16x4(u - 4, v - 4, s, &p3, &p2, &p1, &p0);
-+  Load16x4(u, v, s, &q0, &q1, &q2, &q3);
-+  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, m, hev_thresh);
-+  Store16x4(p3, p2, p1, p0, u - 4, v - 4, s);
-+  Store16x4(q0, q1, q2, q3, u, v, s);
-+}
-+
-+static void HFilter8i_VSX(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-+                          int s, int thresh, int ithresh, int hev_thresh) {
-+  u += 4; v += 4;
-+  u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
-+  Load16x4(u - 4, v - 4, s, &p3, &p2, &p1, &p0);
-+  Load16x4(u, v, s, &q0, &q1, &q2, &q3);
-+  const u8x16 m = ComplexMask(p3, p2, p1, p0, q0, q1, q2, q3, thresh, ithresh);
-+  DoFilter4(&p1, &p0, &q0, &q1, m, hev_thresh);
-+  Store16x4(p1, p0, q0, q1, u - 2, v - 2, s);
-+}
-+
-+//------------------------------------------------------------------------------
-+// Intra prediction (16x16 luma, 8x8 chroma). DC top-sums are scalar (the SIMD
-+// win is the block fill); TrueMotion/VE/HE are vectorized.
-+
-+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
-+  const u8x16 x = vec_splats(v);
-+  int j;
-+  for (j = 0; j < 16; ++j) vec_xst(x, 0, dst + j * BPS);
-+}
-+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
-+  const u8x16 x = vec_splats(v);
-+  unsigned char b[16];
-+  int j;
-+  memcpy(b, &x, 16);
-+  for (j = 0; j < 8; ++j) memcpy(dst + j * BPS, b, 8);
-+}
-+
-+static void VE16_VSX(uint8_t* dst) {
-+  const u8x16 top = vec_xl(0, dst - BPS);
-+  int j;
-+  for (j = 0; j < 16; ++j) vec_xst(top, 0, dst + j * BPS);
-+}
-+static void HE16_VSX(uint8_t* dst) {
-+  int j;
-+  for (j = 0; j < 16; ++j) vec_xst(vec_splats(dst[-1 + j * BPS]), 0, dst + j * BPS);
-+}
-+static void DC16_VSX(uint8_t* dst) {
-+  int s = 16, j;
-+  for (j = 0; j < 16; ++j) s += dst[-BPS + j] + dst[-1 + j * BPS];
-+  Put16(s >> 5, dst);
-+}
-+static void DC16NoTop_VSX(uint8_t* dst) {
-+  int s = 8, j;
-+  for (j = 0; j < 16; ++j) s += dst[-1 + j * BPS];
-+  Put16(s >> 4, dst);
-+}
-+static void DC16NoLeft_VSX(uint8_t* dst) {
-+  int s = 8, j;
-+  for (j = 0; j < 16; ++j) s += dst[-BPS + j];
-+  Put16(s >> 4, dst);
-+}
-+static void DC16NoTopLeft_VSX(uint8_t* dst) { Put16(0x80, dst); }
-+static void TM16_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u8x16 t = vec_xl(0, dst - BPS);
-+  const i16x8 tl = (i16x8)vec_mergeh(t, zero), th = (i16x8)vec_mergel(t, zero);
-+  const int c = dst[-BPS - 1];
-+  int y;
-+  for (y = 0; y < 16; ++y) {
-+    const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
-+    vec_xst((u8x16)vec_packsu(vec_add(b, tl), vec_add(b, th)), 0, dst + y * BPS);
-+  }
-+}
-+
-+static void VE8uv_VSX(uint8_t* dst) {
-+  unsigned char t[8];
-+  int j;
-+  memcpy(t, dst - BPS, 8);
-+  for (j = 0; j < 8; ++j) memcpy(dst + j * BPS, t, 8);
-+}
-+static void DC8uv_VSX(uint8_t* dst) {
-+  int s = 8, j;
-+  for (j = 0; j < 8; ++j) s += dst[-BPS + j] + dst[-1 + j * BPS];
-+  Put8x8uv(s >> 4, dst);
-+}
-+static void DC8uvNoTop_VSX(uint8_t* dst) {
-+  int s = 4, j;
-+  for (j = 0; j < 8; ++j) s += dst[-1 + j * BPS];
-+  Put8x8uv(s >> 3, dst);
-+}
-+static void DC8uvNoLeft_VSX(uint8_t* dst) {
-+  int s = 4, j;
-+  for (j = 0; j < 8; ++j) s += dst[-BPS + j];
-+  Put8x8uv(s >> 3, dst);
-+}
-+static void DC8uvNoTopLeft_VSX(uint8_t* dst) { Put8x8uv(0x80, dst); }
-+static void TM8uv_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u8x16 t = vec_xl(0, dst - BPS);
-+  const i16x8 tl = (i16x8)vec_mergeh(t, zero);
-+  const int c = dst[-BPS - 1];
-+  int y;
-+  for (y = 0; y < 8; ++y) {
-+    const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
-+    const u8x16 o = (u8x16)vec_packsu(vec_add(b, tl), vec_splats((short)0));
-+    unsigned char bb[16];
-+    memcpy(bb, &o, 16);
-+    memcpy(dst + y * BPS, bb, 8);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// 4x4 luma intra prediction. Whole-vector byte shifts window the edge samples:
-+//   srli_si128(x,n) == vec_sld(zero, x, 16 - n)
-+//   slli_si128(x,n) == vec_sld(x, zero, n)
-+
-+#define SRLI(x, n) vec_sld(zero, (x), 16 - (n))
-+#define SLLI(x, n) vec_sld((x), zero, (n))
-+#define INS16(v, val, i) ((u8x16)vec_insert((short)(val), (i16x8)(v), (i)))
-+#define AVG3C(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
-+
-+static WEBP_INLINE u8x16 Load64(const uint8_t* WEBP_RESTRICT p) {
-+  unsigned char b[16] = {0};
-+  memcpy(b, p, 8);
-+  return vec_xl(0, b);
-+}
-+static WEBP_INLINE uint32_t GetWord(u8x16 v) {
-+  unsigned char b[16];
-+  uint32_t r;
-+  memcpy(b, &v, 16);
-+  memcpy(&r, b, 4);
-+  return r;
-+}
-+static WEBP_INLINE u8x16 SetWord(uint32_t v) {
-+  unsigned char b[16] = {0};
-+  memcpy(b, &v, 4);
-+  return vec_xl(0, b);
-+}
-+static WEBP_INLINE void StoreWord(uint32_t v, uint8_t* dst) {
-+  memcpy(dst, &v, 4);
-+}
-+
-+static void VE4_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
-+  const u8x16 A = Load64(dst - BPS - 1), B = SRLI(A, 1), C = SRLI(A, 2);
-+  const u8x16 a = vec_avg(A, C), lsb = vec_and(vec_xor(A, C), one);
-+  const u8x16 avg = vec_avg(vec_subs(a, lsb), B);
-+  const uint32_t v = GetWord(avg);
-+  int i;
-+  for (i = 0; i < 4; ++i) StoreWord(v, dst + i * BPS);
-+}
-+static void LD4_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
-+  const u8x16 A = Load64(dst - BPS), B = SRLI(A, 1), C = SRLI(A, 2);
-+  const u8x16 CH = INS16(C, dst[-BPS + 7], 3);
-+  const u8x16 a1 = vec_avg(A, CH), lsb = vec_and(vec_xor(A, CH), one);
-+  const u8x16 r = vec_avg(vec_subs(a1, lsb), B);
-+  StoreWord(GetWord(r), dst + 0 * BPS);
-+  StoreWord(GetWord(SRLI(r, 1)), dst + 1 * BPS);
-+  StoreWord(GetWord(SRLI(r, 2)), dst + 2 * BPS);
-+  StoreWord(GetWord(SRLI(r, 3)), dst + 3 * BPS);
-+}
-+static void VR4_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
-+  const int I = dst[-1 + 0 * BPS], J = dst[-1 + 1 * BPS], K = dst[-1 + 2 * BPS];
-+  const int X = dst[-1 - BPS];
-+  const u8x16 XA = Load64(dst - BPS - 1), A0 = SRLI(XA, 1);
-+  const u8x16 abcd = vec_avg(XA, A0);
-+  const u8x16 IX = INS16(SLLI(XA, 1), (I | (X << 8)), 0);
-+  const u8x16 a1 = vec_avg(IX, A0), lsb = vec_and(vec_xor(IX, A0), one);
-+  const u8x16 efgh = vec_avg(vec_subs(a1, lsb), XA);
-+  StoreWord(GetWord(abcd), dst + 0 * BPS);
-+  StoreWord(GetWord(efgh), dst + 1 * BPS);
-+  StoreWord(GetWord(SLLI(abcd, 1)), dst + 2 * BPS);
-+  StoreWord(GetWord(SLLI(efgh, 1)), dst + 3 * BPS);
-+  dst[0 + 2 * BPS] = AVG3C(J, I, X);
-+  dst[0 + 3 * BPS] = AVG3C(K, J, I);
-+}
-+static void VL4_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
-+  const u8x16 A = Load64(dst - BPS), B = SRLI(A, 1), C = SRLI(A, 2);
-+  const u8x16 a1 = vec_avg(A, B), a2 = vec_avg(C, B), a3 = vec_avg(a1, a2);
-+  const u8x16 lsb1 = vec_and(vec_xor(a1, a2), one);
-+  const u8x16 abbc = vec_or(vec_xor(A, B), vec_xor(C, B));
-+  const u8x16 a4 = vec_subs(a3, vec_and(abbc, lsb1));
-+  const uint32_t extra = GetWord(SRLI(a4, 4));
-+  StoreWord(GetWord(a1), dst + 0 * BPS);
-+  StoreWord(GetWord(a4), dst + 1 * BPS);
-+  StoreWord(GetWord(SRLI(a1, 1)), dst + 2 * BPS);
-+  StoreWord(GetWord(SRLI(a4, 1)), dst + 3 * BPS);
-+  dst[3 + 2 * BPS] = (extra >> 0) & 0xff;
-+  dst[3 + 3 * BPS] = (extra >> 8) & 0xff;
-+}
-+static void RD4_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0), one = vec_splats((unsigned char)1);
-+  const uint32_t I = dst[-1 + 0 * BPS], J = dst[-1 + 1 * BPS];
-+  const uint32_t K = dst[-1 + 2 * BPS], L = dst[-1 + 3 * BPS];
-+  const u8x16 XA = Load64(dst - BPS - 1);
-+  const u8x16 all = vec_or(SetWord((uint32_t)(L | (K << 8) | (J << 16) | (I << 24))),
-+                           SLLI(XA, 4));
-+  const u8x16 k1 = SRLI(all, 1), j2 = SRLI(all, 2);
-+  const u8x16 a1 = vec_avg(j2, all), lsb = vec_and(vec_xor(j2, all), one);
-+  const u8x16 r = vec_avg(vec_subs(a1, lsb), k1);
-+  StoreWord(GetWord(r), dst + 3 * BPS);
-+  StoreWord(GetWord(SRLI(r, 1)), dst + 2 * BPS);
-+  StoreWord(GetWord(SRLI(r, 2)), dst + 1 * BPS);
-+  StoreWord(GetWord(SRLI(r, 3)), dst + 0 * BPS);
-+}
-+static void TM4_VSX(uint8_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u8x16 t = Load64(dst - BPS);
-+  const i16x8 tb = (i16x8)vec_mergeh(t, zero);
-+  const int c = dst[-BPS - 1];
-+  int y;
-+  for (y = 0; y < 4; ++y) {
-+    const i16x8 b = vec_splats((short)(dst[-1 + y * BPS] - c));
-+    const u8x16 o = (u8x16)vec_packsu(vec_add(b, tb), vec_splats((short)0));
-+    StoreWord(GetWord(o), dst + y * BPS);
-+  }
-+}
-+#undef SRLI
-+#undef SLLI
-+#undef INS16
-+#undef AVG3C
-+
-+extern void VP8DspInitVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitVSX(void) {
-+  VP8Transform = Transform_VSX;
-+  VP8SimpleVFilter16 = SimpleVFilter16_VSX;
-+  VP8SimpleVFilter16i = SimpleVFilter16i_VSX;
-+  VP8SimpleHFilter16 = SimpleHFilter16_VSX;
-+  VP8SimpleHFilter16i = SimpleHFilter16i_VSX;
-+  VP8VFilter16 = VFilter16_VSX;
-+  VP8VFilter16i = VFilter16i_VSX;
-+  VP8HFilter16 = HFilter16_VSX;
-+  VP8HFilter16i = HFilter16i_VSX;
-+  VP8VFilter8 = VFilter8_VSX;
-+  VP8VFilter8i = VFilter8i_VSX;
-+  VP8HFilter8 = HFilter8_VSX;
-+  VP8HFilter8i = HFilter8i_VSX;
-+
-+  VP8PredLuma16[0] = DC16_VSX;
-+  VP8PredLuma16[1] = TM16_VSX;
-+  VP8PredLuma16[2] = VE16_VSX;
-+  VP8PredLuma16[3] = HE16_VSX;
-+  VP8PredLuma16[4] = DC16NoTop_VSX;
-+  VP8PredLuma16[5] = DC16NoLeft_VSX;
-+  VP8PredLuma16[6] = DC16NoTopLeft_VSX;
-+  VP8PredChroma8[0] = DC8uv_VSX;
-+  VP8PredChroma8[1] = TM8uv_VSX;
-+  VP8PredChroma8[2] = VE8uv_VSX;
-+  VP8PredChroma8[4] = DC8uvNoTop_VSX;
-+  VP8PredChroma8[5] = DC8uvNoLeft_VSX;
-+  VP8PredChroma8[6] = DC8uvNoTopLeft_VSX;
-+  VP8PredLuma4[1] = TM4_VSX;
-+  VP8PredLuma4[2] = VE4_VSX;
-+  VP8PredLuma4[4] = RD4_VSX;
-+  VP8PredLuma4[5] = VR4_VSX;
-+  VP8PredLuma4[6] = LD4_VSX;
-+  VP8PredLuma4[7] = VL4_VSX;
-+}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(VP8DspInitVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/dsp/filters.c b/media/libwebp/src/dsp/filters.c
-index 38da5252df3a..9962e1287402 100644
---- a/media/libwebp/src/dsp/filters.c
-+++ b/media/libwebp/src/dsp/filters.c
-@@ -217,6 +217,7 @@ extern void VP8FiltersInitMIPSdspR2(void);
- extern void VP8FiltersInitMSA(void);
- extern void VP8FiltersInitNEON(void);
- extern void VP8FiltersInitSSE2(void);
-+extern void VP8FiltersInitVSX(void);
- 
- WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
-   WebPUnfilters[WEBP_FILTER_NONE] = NoneUnfilter_C;
-@@ -248,6 +249,11 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
-     if (VP8GetCPUInfo(kMSA)) {
-       VP8FiltersInitMSA();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      VP8FiltersInitVSX();
-+    }
- #endif
-   }
- 
-diff --git a/media/libwebp/src/dsp/filters_vsx.c b/media/libwebp/src/dsp/filters_vsx.c
-new file mode 100644
-index 000000000000..ae8e57ac685c
---- /dev/null
-+++ b/media/libwebp/src/dsp/filters_vsx.c
-@@ -0,0 +1,162 @@
-+// Copyright 2015 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of filtering functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX)
-+
-+#include <altivec.h>
-+#include <assert.h>
-+#include <string.h>
-+
-+#include "src/dsp/cpu.h"
-+#include "src/webp/types.h"
-+
-+typedef __vector unsigned char u8x16;
-+typedef __vector unsigned short u16x8;
-+typedef __vector signed short i16x8;
-+typedef __vector unsigned long long u64x2;
-+
-+// Byte-wise shifts of the whole 128-bit register, matching the little-endian
-+// semantics of _mm_slli_si128 / _mm_srli_si128. 'n' must be a literal.
-+#define SLLI(x, n) vec_sld((x), zero, (n))
-+#define SRLI(x, n) vec_sld(zero, (x), 16 - (n))
-+
-+// Loads 8 bytes from 'p' into the low half of a vector (high half undefined).
-+static WEBP_INLINE u8x16 Load8(const uint8_t* p) {
-+  uint64_t v;
-+  memcpy(&v, p, 8);
-+  return (u8x16)vec_splats(v);
-+}
-+
-+//------------------------------------------------------------------------------
-+// Horizontal unfilter: out[i] = in[i] + out[i - 1] (a prefix sum).
-+
-+static void HorizontalUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
-+                                   uint8_t* out, int width) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u64x2 sh56 = vec_splats((unsigned long long)56);
-+  u8x16 last;
-+  int i;
-+  out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
-+  if (width <= 1) return;
-+  last = vec_insert(out[0], zero, 0);
-+  for (i = 1; i + 8 <= width; i += 8) {
-+    const u8x16 A0 = Load8(in + i);
-+    const u8x16 A1 = vec_add(A0, last);
-+    const u8x16 A2 = SLLI(A1, 1);
-+    const u8x16 A3 = vec_add(A1, A2);
-+    const u8x16 A4 = SLLI(A3, 2);
-+    const u8x16 A5 = vec_add(A3, A4);
-+    const u8x16 A6 = SLLI(A5, 4);
-+    const u8x16 A7 = vec_add(A5, A6);
-+    memcpy(out + i, &A7, 8);
-+    last = (u8x16)vec_sr((u64x2)A7, sh56);  // broadcast out[i + 7] to byte 0
-+  }
-+  for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
-+}
-+
-+//------------------------------------------------------------------------------
-+// Vertical unfilter: out[i] = in[i] + prev[i].
-+
-+static void VerticalUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
-+                                 uint8_t* out, int width) {
-+  if (prev == NULL) {
-+    HorizontalUnfilter_VSX(NULL, in, out, width);
-+  } else {
-+    int i;
-+    const int max_pos = width & ~31;
-+    for (i = 0; i < max_pos; i += 32) {
-+      const u8x16 A0 = vec_xl(0, (unsigned char*)&in[i + 0]);
-+      const u8x16 A1 = vec_xl(0, (unsigned char*)&in[i + 16]);
-+      const u8x16 B0 = vec_xl(0, (unsigned char*)&prev[i + 0]);
-+      const u8x16 B1 = vec_xl(0, (unsigned char*)&prev[i + 16]);
-+      vec_xst(vec_add(A0, B0), 0, (unsigned char*)&out[i + 0]);
-+      vec_xst(vec_add(A1, B1), 0, (unsigned char*)&out[i + 16]);
-+    }
-+    for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// Gradient unfilter: row[i] = in[i] + clip(row[i-1] + top[i] - top[i-1]).
-+
-+static WEBP_INLINE int GradientPredictor_VSX(uint8_t a, uint8_t b, uint8_t c) {
-+  const int g = a + b - c;
-+  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;
-+}
-+
-+static void GradientPredictInverse_VSX(const uint8_t* in, const uint8_t* top,
-+                                       uint8_t* row, int length) {
-+  if (length > 0) {
-+    int i;
-+    const int max_pos = length & ~7;
-+    const u8x16 zero = vec_splats((unsigned char)0);
-+    u8x16 A = vec_insert((unsigned char)row[-1], zero, 0);  // left sample
-+    for (i = 0; i < max_pos; i += 8) {
-+      const u8x16 t0 = Load8(top + i);
-+      const u8x16 t1 = Load8(top + i - 1);
-+      const u16x8 B = (u16x8)vec_mergeh(t0, zero);
-+      const u16x8 C = (u16x8)vec_mergeh(t1, zero);
-+      const u8x16 D = Load8(in + i);  // base input
-+      const u16x8 E = vec_sub(B, C);  // unclipped gradient basis b - c
-+      u8x16 out = zero;               // accumulator for output
-+      u8x16 mask_hi = vec_insert((unsigned char)0xff, zero, 0);
-+      int k = 8;
-+      while (1) {
-+        const u16x8 tmp3 = vec_add((u16x8)A, E);  // delta = a + b - c
-+        const u8x16 tmp4 = vec_packsu((i16x8)tmp3, (i16x8)zero);  // sat. delta
-+        const u8x16 tmp5 = vec_add(tmp4, D);                      // add to in[]
-+        A = vec_and(tmp5, mask_hi);  // keep new sample
-+        out = vec_or(out, A);        // accumulate output
-+        if (--k == 0) break;
-+        A = SLLI(A, 1);                  // rotate left sample
-+        mask_hi = SLLI(mask_hi, 1);      // rotate mask
-+        A = (u8x16)vec_mergeh(A, zero);  // convert 8b -> 16b
-+      }
-+      A = SRLI(A, 7);  // prepare left sample for next iteration
-+      memcpy(row + i, &out, 8);
-+    }
-+    for (; i < length; ++i) {
-+      const int delta = GradientPredictor_VSX(row[i - 1], top[i], top[i - 1]);
-+      row[i] = (uint8_t)(in[i] + delta);
-+    }
-+  }
-+}
-+
-+static void GradientUnfilter_VSX(const uint8_t* prev, const uint8_t* in,
-+                                 uint8_t* out, int width) {
-+  if (prev == NULL) {
-+    HorizontalUnfilter_VSX(NULL, in, out, width);
-+  } else {
-+    out[0] = (uint8_t)(in[0] + prev[0]);  // predict from above
-+    GradientPredictInverse_VSX(in + 1, prev + 1, out + 1, width - 1);
-+  }
-+}
-+
-+#undef SLLI
-+#undef SRLI
-+
-+//------------------------------------------------------------------------------
-+
-+extern void VP8FiltersInitVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitVSX(void) {
-+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_VSX;
-+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_VSX;
-+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_VSX;
-+}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(VP8FiltersInitVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/dsp/lossless.c b/media/libwebp/src/dsp/lossless.c
-index 1a3d800c3fbc..48b5d4a3aedc 100644
---- a/media/libwebp/src/dsp/lossless.c
-+++ b/media/libwebp/src/dsp/lossless.c
-@@ -606,6 +606,7 @@ extern void VP8LDspInitAVX2(void);
- extern void VP8LDspInitNEON(void);
- extern void VP8LDspInitMIPSdspR2(void);
- extern void VP8LDspInitMSA(void);
-+extern void VP8LDspInitVSX(void);
- 
- #define COPY_PREDICTOR_ARRAY(IN, OUT) do {                \
-   (OUT)[0] = IN##0_C;                                     \
-@@ -673,6 +674,11 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
-     if (VP8GetCPUInfo(kMSA)) {
-       VP8LDspInitMSA();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      VP8LDspInitVSX();
-+    }
- #endif
-   }
- 
-diff --git a/media/libwebp/src/dsp/lossless_vsx.c b/media/libwebp/src/dsp/lossless_vsx.c
-new file mode 100644
-index 000000000000..89da30c9589c
---- /dev/null
-+++ b/media/libwebp/src/dsp/lossless_vsx.c
-@@ -0,0 +1,449 @@
-+// Copyright 2014 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of lossless functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX)
-+
-+#include <altivec.h>
-+#include <string.h>
-+
-+#include "src/dsp/cpu.h"
-+#include "src/dsp/lossless.h"
-+#include "src/dsp/lossless_common.h"
-+#include "src/webp/format_constants.h"
-+#include "src/webp/types.h"
-+
-+typedef __vector unsigned char u8x16;
-+typedef __vector unsigned short u16x8;
-+typedef __vector signed short i16x8;
-+typedef __vector unsigned int u32x4;
-+typedef __vector signed int i32x4;
-+
-+// Signed multiply-high of 16-bit lanes: (a * b) >> 16, matching
-+// _mm_mulhi_epi16.
-+static WEBP_INLINE i16x8 MulHiS16(i16x8 a, i16x8 b) {
-+  const u32x4 sh = vec_splats((unsigned int)16);
-+  const i32x4 e = vec_sra(vec_mule(a, b), sh);
-+  const i32x4 o = vec_sra(vec_mulo(a, b), sh);
-+  return (i16x8)vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
-+}
-+
-+//------------------------------------------------------------------------------
-+// Color transforms.
-+
-+static void AddGreenToBlueAndRed_VSX(const uint32_t* src, int num_pixels,
-+                                     uint32_t* dst) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  // Replicate the green byte (offset 1 of each pixel) into the blue/red slots.
-+  const u8x16 kSpreadGreen = {1, 16, 1, 16, 5,  16, 5,  16,
-+                              9, 16, 9, 16, 13, 16, 13, 16};
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
-+    const u8x16 g = vec_perm(in, zero, kSpreadGreen);  // 0 g 0 g per pixel
-+    vec_xst((u32x4)vec_add(in, g), 0, &dst[i]);
-+  }
-+  if (i != num_pixels) {
-+    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
-+  }
-+}
-+
-+static void TransformColorInverse_VSX(const VP8LMultipliers* const m,
-+                                      const uint32_t* src, int num_pixels,
-+                                      uint32_t* dst) {
-+// sign-extended multiplying constants, pre-shifted by 5 (see lossless_sse2.c).
-+#define CST(X) (((int16_t)((m->X) << 8)) >> 5)
-+  const i16x8 mults_rb =
-+      (i16x8)vec_splats((int)(((uint32_t)(uint16_t)CST(green_to_red) << 16) |
-+                              ((uint16_t)CST(green_to_blue))));
-+  const i16x8 mults_b2 =
-+      (i16x8)vec_splats((int)((uint32_t)(uint16_t)CST(red_to_blue) << 16));
-+#undef CST
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  const u32x4 mask_ag = vec_splats((uint32_t)0xff00ff00);  // alpha/green
-+  const u16x8 sh8_16 = vec_splats((unsigned short)8);
-+  const u32x4 sh8_32 = vec_splats((unsigned int)8);
-+  // Broadcast the green byte (offset 1) into the high byte of both 16-bit
-+  // halves of each pixel: yields g << 8 in each lane.
-+  const u8x16 kGreenHi = {16, 1, 16, 1, 16, 5,  16, 5,
-+                          16, 9, 16, 9, 16, 13, 16, 13};
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
-+    const u8x16 A = (u8x16)vec_and((u32x4)in, mask_ag);   // a 0 g 0
-+    const i16x8 C = (i16x8)vec_perm(A, zero, kGreenHi);   // g0g0 (g << 8)
-+    const u8x16 D = (u8x16)MulHiS16(C, mults_rb);         // x dr x db1
-+    const u8x16 E = vec_add(in, D);                       // x r' x b'
-+    const u16x8 F = vec_sl((u16x8)E, sh8_16);             // r' 0 b' 0
-+    const u8x16 G = (u8x16)MulHiS16((i16x8)F, mults_b2);  // x db2 0 0
-+    const u8x16 H = (u8x16)vec_sr((u32x4)G, sh8_32);      // 0 x db2 0
-+    const u16x8 I = (u16x8)vec_add(H, (u8x16)F);          // r' x b'' 0
-+    const u8x16 J = (u8x16)vec_sr(I, sh8_16);             // 0 r' 0 b''
-+    vec_xst(vec_or((u32x4)J, (u32x4)A), 0, &dst[i]);
-+  }
-+  if (i != num_pixels) {
-+    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// Color-space conversion functions.
-+
-+static void ConvertBGRAToRGBA_VSX(const uint32_t* WEBP_RESTRICT src,
-+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
-+  // Swap the blue (offset 0) and red (offset 2) bytes of each pixel.
-+  const u8x16 kSwapBR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
-+    vec_xst(vec_perm(in, in, kSwapBR), 0, &dst[4 * i]);
-+  }
-+  if (i != num_pixels) {
-+    VP8LConvertBGRAToRGBA_C(src + i, num_pixels - i, dst + 4 * i);
-+  }
-+}
-+
-+static void ConvertBGRAToRGB_VSX(const uint32_t* WEBP_RESTRICT src,
-+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
-+  // BGRA -> RGB: gather R,G,B (offsets 2,1,0) of each pixel, drop alpha.
-+  const u8x16 kToRGB = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0};
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
-+    const u8x16 out = vec_perm(in, in, kToRGB);
-+    memcpy(&dst[3 * i], &out, 12);
-+  }
-+  if (i != num_pixels) {
-+    VP8LConvertBGRAToRGB_C(src + i, num_pixels - i, dst + 3 * i);
-+  }
-+}
-+
-+static void ConvertBGRAToBGR_VSX(const uint32_t* WEBP_RESTRICT src,
-+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
-+  // BGRA -> BGR: gather B,G,R (offsets 0,1,2) of each pixel, drop alpha.
-+  const u8x16 kToBGR = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0, 0, 0, 0};
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 in = (u8x16)vec_xl(0, (uint32_t*)&src[i]);
-+    const u8x16 out = vec_perm(in, in, kToBGR);
-+    memcpy(&dst[3 * i], &out, 12);
-+  }
-+  if (i != num_pixels) {
-+    VP8LConvertBGRAToBGR_C(src + i, num_pixels - i, dst + 3 * i);
-+  }
-+}
-+
-+//------------------------------------------------------------------------------
-+// Predictor transform.
-+
-+// Byte-wise shifts of the whole register (little-endian _mm_s{l,r}li_si128).
-+#define SLLI(x, n) vec_sld((x), kZero, (n))
-+#define SRLI(x, n) vec_sld(kZero, (x), 16 - (n))
-+static const u8x16 kZero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-+
-+// Per-byte floor average (a + b) >> 1, matching the C Average2().
-+static WEBP_INLINE u8x16 Average2_u8(u8x16 a, u8x16 b) {
-+  const u8x16 one = vec_splats((unsigned char)1);
-+  const u8x16 avg1 = vec_avg(a, b);  // (a + b + 1) >> 1
-+  return vec_sub(avg1, vec_and(vec_xor(a, b), one));
-+}
-+
-+static WEBP_INLINE u32x4 Lane0(uint32_t v) {
-+  const u32x4 r = {v, 0, 0, 0};
-+  return r;
-+}
-+
-+// Single-pixel helpers operating on the low 32-bit lane only.
-+static WEBP_INLINE u16x8 Unpack16(uint32_t a) {
-+  return (u16x8)vec_mergeh((u8x16)Lane0(a), kZero);
-+}
-+
-+static WEBP_INLINE uint32_t Average2_VSX(uint32_t a0, uint32_t a1) {
-+  return vec_extract((u32x4)Average2_u8((u8x16)Lane0(a0), (u8x16)Lane0(a1)), 0);
-+}
-+
-+static WEBP_INLINE u16x8 Average2_16(uint32_t a0, uint32_t a1) {
-+  const u16x8 one = vec_splats((unsigned short)1);
-+  return vec_sr(vec_add(Unpack16(a0), Unpack16(a1)), one);
-+}
-+
-+static WEBP_INLINE uint32_t Average3_VSX(uint32_t a0, uint32_t a1,
-+                                         uint32_t a2) {
-+  const u16x8 one = vec_splats((unsigned short)1);
-+  const u16x8 avg1 = Average2_16(a0, a2);
-+  const u16x8 avg2 = vec_sr(vec_add(avg1, Unpack16(a1)), one);
-+  return vec_extract((u32x4)vec_packsu((i16x8)avg2, (i16x8)avg2), 0);
-+}
-+
-+static WEBP_INLINE uint32_t Average4_VSX(uint32_t a0, uint32_t a1, uint32_t a2,
-+                                         uint32_t a3) {
-+  const u16x8 one = vec_splats((unsigned short)1);
-+  const u16x8 avg1 = Average2_16(a0, a1);
-+  const u16x8 avg2 = Average2_16(a2, a3);
-+  const u16x8 avg3 = vec_sr(vec_add(avg1, avg2), one);
-+  return vec_extract((u32x4)vec_packsu((i16x8)avg3, (i16x8)avg3), 0);
-+}
-+
-+static WEBP_INLINE uint32_t ClampedAddSubtractFull_VSX(uint32_t c0, uint32_t c1,
-+                                                       uint32_t c2) {
-+  const i16x8 v =
-+      vec_sub((i16x8)vec_add(Unpack16(c0), Unpack16(c1)), (i16x8)Unpack16(c2));
-+  return vec_extract((u32x4)vec_packsu(v, v), 0);
-+}
-+
-+static WEBP_INLINE uint32_t ClampedAddSubtractHalf_VSX(uint32_t c0, uint32_t c1,
-+                                                       uint32_t c2) {
-+  const u16x8 one = vec_splats((unsigned short)1);
-+  const u16x8 C0 = Unpack16(c0);
-+  const u16x8 C1 = Unpack16(c1);
-+  const u16x8 B0 = Unpack16(c2);
-+  const u16x8 A0 = vec_sr(vec_add(C1, C0), one);  // ave
-+  const i16x8 A1 = vec_sub((i16x8)A0, (i16x8)B0);
-+  const i16x8 BgtA = (i16x8)vec_cmpgt(B0, A0);  // 0 or -1
-+  const i16x8 A2 = vec_sub(A1, BgtA);
-+  const i16x8 A3 = vec_sra(A2, one);
-+  const i16x8 A4 = vec_add((i16x8)A0, A3);
-+  return vec_extract((u32x4)vec_packsu(A4, A4), 0);
-+}
-+
-+static WEBP_INLINE uint32_t Select_VSX(uint32_t a, uint32_t b, uint32_t c) {
-+  const u8x16 A = (u8x16)Lane0(a);
-+  const u8x16 B = (u8x16)Lane0(b);
-+  const u8x16 C = (u8x16)Lane0(c);
-+  const u32x4 sa = vec_sum4s(vec_or(vec_subs(A, C), vec_subs(C, A)),
-+                             vec_splats((unsigned int)0));
-+  const u32x4 sb = vec_sum4s(vec_or(vec_subs(B, C), vec_subs(C, B)),
-+                             vec_splats((unsigned int)0));
-+  return vec_extract((u32x4)vec_cmpgt(sb, sa), 0) ? b : a;
-+}
-+
-+static uint32_t Predictor5_VSX(const uint32_t* const left,
-+                               const uint32_t* const top) {
-+  return Average3_VSX(*left, top[0], top[1]);
-+}
-+static uint32_t Predictor6_VSX(const uint32_t* const left,
-+                               const uint32_t* const top) {
-+  return Average2_VSX(*left, top[-1]);
-+}
-+static uint32_t Predictor7_VSX(const uint32_t* const left,
-+                               const uint32_t* const top) {
-+  return Average2_VSX(*left, top[0]);
-+}
-+static uint32_t Predictor13_VSX(const uint32_t* const left,
-+                                const uint32_t* const top) {
-+  return ClampedAddSubtractHalf_VSX(*left, top[0], top[-1]);
-+}
-+
-+static void PredictorAdd0_VSX(const uint32_t* in, const uint32_t* upper,
-+                              int num_pixels, uint32_t* WEBP_RESTRICT out) {
-+  const u8x16 black = (u8x16)vec_splats((uint32_t)ARGB_BLACK);
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
-+    vec_xst((u32x4)vec_add(src, black), 0, &out[i]);
-+  }
-+  if (i != num_pixels) {
-+    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
-+  }
-+  (void)upper;
-+}
-+
-+static void PredictorAdd1_VSX(const uint32_t* in, const uint32_t* upper,
-+                              int num_pixels, uint32_t* WEBP_RESTRICT out) {
-+  u32x4 prev = vec_splats(out[-1]);
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
-+    const u8x16 sum0 = vec_add(src, SLLI(src, 4));    // a | a+b | b+c | c+d
-+    const u8x16 sum1 = vec_add(sum0, SLLI(sum0, 8));  // running sum
-+    const u8x16 res = vec_add(sum1, (u8x16)prev);
-+    vec_xst((u32x4)res, 0, &out[i]);
-+    prev = vec_splat((u32x4)res, 3);  // replicate last pixel
-+  }
-+  if (i != num_pixels) {
-+    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
-+  }
-+}
-+
-+#define GENERATE_PREDICTOR_1_VSX(X, IN)                                        \
-+  static void PredictorAdd##X##_VSX(const uint32_t* in, const uint32_t* upper, \
-+                                    int num_pixels,                            \
-+                                    uint32_t* WEBP_RESTRICT out) {             \
-+    int i;                                                                     \
-+    for (i = 0; i + 4 <= num_pixels; i += 4) {                                 \
-+      const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);                   \
-+      const u8x16 other = (u8x16)vec_xl(0, (uint32_t*)&(IN));                  \
-+      vec_xst((u32x4)vec_add(src, other), 0, &out[i]);                         \
-+    }                                                                          \
-+    if (i != num_pixels) {                                                     \
-+      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);    \
-+    }                                                                          \
-+  }
-+GENERATE_PREDICTOR_1_VSX(2, upper[i])      // Top.
-+GENERATE_PREDICTOR_1_VSX(3, upper[i + 1])  // Top-right.
-+GENERATE_PREDICTOR_1_VSX(4, upper[i - 1])  // Top-left.
-+#undef GENERATE_PREDICTOR_1_VSX
-+
-+// Predictors 5, 6, 7, 13 use integer averages and cannot be accumulated in
-+// parallel, so use the generic one-pixel-at-a-time batch.
-+GENERATE_PREDICTOR_ADD(Predictor5_VSX, PredictorAdd5_VSX)
-+GENERATE_PREDICTOR_ADD(Predictor6_VSX, PredictorAdd6_VSX)
-+GENERATE_PREDICTOR_ADD(Predictor7_VSX, PredictorAdd7_VSX)
-+GENERATE_PREDICTOR_ADD(Predictor13_VSX, PredictorAdd13_VSX)
-+
-+#define GENERATE_PREDICTOR_2_VSX(X, IN)                                        \
-+  static void PredictorAdd##X##_VSX(const uint32_t* in, const uint32_t* upper, \
-+                                    int num_pixels,                            \
-+                                    uint32_t* WEBP_RESTRICT out) {             \
-+    int i;                                                                     \
-+    for (i = 0; i + 4 <= num_pixels; i += 4) {                                 \
-+      const u8x16 Tother = (u8x16)vec_xl(0, (uint32_t*)&(IN));                 \
-+      const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);                  \
-+      const u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);                   \
-+      vec_xst((u32x4)vec_add(Average2_u8(T, Tother), src), 0, &out[i]);        \
-+    }                                                                          \
-+    if (i != num_pixels) {                                                     \
-+      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);    \
-+    }                                                                          \
-+  }
-+GENERATE_PREDICTOR_2_VSX(8, upper[i - 1])  // Average TL, T.
-+GENERATE_PREDICTOR_2_VSX(9, upper[i + 1])  // Average T, TR.
-+#undef GENERATE_PREDICTOR_2_VSX
-+
-+// Predictor10: average of (average(L, TL), average(T, TR)).
-+static void PredictorAdd10_VSX(const uint32_t* in, const uint32_t* upper,
-+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
-+  u8x16 L = (u8x16)Lane0(out[-1]);
-+  int i, k;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
-+    u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
-+    const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
-+    const u8x16 TR = (u8x16)vec_xl(0, (uint32_t*)&upper[i + 1]);
-+    u8x16 avgTTR = Average2_u8(T, TR);
-+    for (k = 0; k < 4; ++k) {
-+      const u8x16 avg = Average2_u8(avgTTR, Average2_u8(L, TL));
-+      L = vec_add(avg, src);
-+      out[i + k] = vec_extract((u32x4)L, 0);
-+      avgTTR = SRLI(avgTTR, 4);
-+      TL = SRLI(TL, 4);
-+      src = SRLI(src, 4);
-+    }
-+  }
-+  if (i != num_pixels) {
-+    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
-+  }
-+}
-+
-+// Predictor11: select between T and L based on |T-TL| vs |L-TL|.
-+static void PredictorAdd11_VSX(const uint32_t* in, const uint32_t* upper,
-+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
-+  const u32x4 z32 = vec_splats((unsigned int)0);
-+  u8x16 L = (u8x16)Lane0(out[-1]);
-+  int i, k;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
-+    u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
-+    u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
-+    u8x16 pa = (u8x16)vec_sum4s(vec_or(vec_subs(T, TL), vec_subs(TL, T)), z32);
-+    for (k = 0; k < 4; ++k) {
-+      const u32x4 pb = vec_sum4s(vec_or(vec_subs(L, TL), vec_subs(TL, L)), z32);
-+      const u32x4 mask = (u32x4)vec_cmpgt(pb, (u32x4)pa);  // pb > pa ? L : T
-+      const u8x16 pred = vec_sel(T, L, (u8x16)mask);
-+      L = vec_add(src, pred);
-+      out[i + k] = vec_extract((u32x4)L, 0);
-+      T = SRLI(T, 4);
-+      TL = SRLI(TL, 4);
-+      src = SRLI(src, 4);
-+      pa = SRLI(pa, 4);
-+    }
-+  }
-+  if (i != num_pixels) {
-+    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
-+  }
-+}
-+
-+// Predictor12: ClampedAddSubtractFull. 'L' is kept unpacked to 16 bits in the
-+// low 4 lanes; 'diff' (= T - TL) holds two pixels, the active one in lanes 0-3.
-+#define DO_PRED12(DIFF)                                   \
-+  do {                                                    \
-+    const i16x8 all = vec_add((i16x8)L, (DIFF));          \
-+    const u8x16 res = vec_add(src, vec_packsu(all, all)); \
-+    out[i + out_idx++] = vec_extract((u32x4)res, 0);      \
-+    L = (u16x8)vec_mergeh(res, kZero);                    \
-+  } while (0)
-+
-+static void PredictorAdd12_VSX(const uint32_t* in, const uint32_t* upper,
-+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
-+  u16x8 L = Unpack16(out[-1]);
-+  int i;
-+  for (i = 0; i + 4 <= num_pixels; i += 4) {
-+    int out_idx = 0;
-+    u8x16 src = (u8x16)vec_xl(0, (uint32_t*)&in[i]);
-+    const u8x16 T = (u8x16)vec_xl(0, (uint32_t*)&upper[i]);
-+    const u8x16 TL = (u8x16)vec_xl(0, (uint32_t*)&upper[i - 1]);
-+    // 16-bit gradient basis T - TL for the four pixels (low and high halves).
-+    i16x8 diff_lo =
-+        vec_sub((i16x8)vec_mergeh(T, kZero), (i16x8)vec_mergeh(TL, kZero));
-+    i16x8 diff_hi =
-+        vec_sub((i16x8)vec_mergel(T, kZero), (i16x8)vec_mergel(TL, kZero));
-+    DO_PRED12(diff_lo);
-+    diff_lo = (i16x8)SRLI((u8x16)diff_lo, 8);
-+    src = SRLI(src, 4);
-+    DO_PRED12(diff_lo);
-+    src = SRLI(src, 4);
-+    DO_PRED12(diff_hi);
-+    diff_hi = (i16x8)SRLI((u8x16)diff_hi, 8);
-+    src = SRLI(src, 4);
-+    DO_PRED12(diff_hi);
-+  }
-+  if (i != num_pixels) {
-+    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
-+  }
-+}
-+#undef DO_PRED12
-+
-+#undef SLLI
-+#undef SRLI
-+
-+//------------------------------------------------------------------------------
-+
-+extern void VP8LDspInitVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitVSX(void) {
-+  VP8LPredictorsAdd[0] = PredictorAdd0_VSX;
-+  VP8LPredictorsAdd[1] = PredictorAdd1_VSX;
-+  VP8LPredictorsAdd[2] = PredictorAdd2_VSX;
-+  VP8LPredictorsAdd[3] = PredictorAdd3_VSX;
-+  VP8LPredictorsAdd[4] = PredictorAdd4_VSX;
-+  VP8LPredictorsAdd[5] = PredictorAdd5_VSX;
-+  VP8LPredictorsAdd[6] = PredictorAdd6_VSX;
-+  VP8LPredictorsAdd[7] = PredictorAdd7_VSX;
-+  VP8LPredictorsAdd[8] = PredictorAdd8_VSX;
-+  VP8LPredictorsAdd[9] = PredictorAdd9_VSX;
-+  VP8LPredictorsAdd[10] = PredictorAdd10_VSX;
-+  VP8LPredictorsAdd[11] = PredictorAdd11_VSX;
-+  VP8LPredictorsAdd[12] = PredictorAdd12_VSX;
-+  VP8LPredictorsAdd[13] = PredictorAdd13_VSX;
-+
-+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_VSX;
-+  VP8LTransformColorInverse = TransformColorInverse_VSX;
-+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_VSX;
-+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_VSX;
-+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_VSX;
-+}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(VP8LDspInitVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/dsp/moz.build b/media/libwebp/src/dsp/moz.build
-index 8d6f8427c900..f3e9d1273110 100644
---- a/media/libwebp/src/dsp/moz.build
-+++ b/media/libwebp/src/dsp/moz.build
-@@ -118,6 +118,20 @@ elif CONFIG['TARGET_CPU'].startswith('mips'):
-         'yuv_mips32.c',
-         'yuv_mips_dsp_r2.c',
-     ]
-+elif CONFIG['TARGET_CPU'] == 'ppc64':
-+    SOURCES += [
-+        'alpha_processing_vsx.c',
-+        'dec_vsx.c',
-+        'filters_vsx.c',
-+        'lossless_vsx.c',
-+        'rescaler_vsx.c',
-+        'upsampling_vsx.c',
-+        'yuv_vsx.c',
-+    ]
-+    DEFINES['WEBP_HAVE_VSX'] = 1;
-+    for f in SOURCES:
-+      if f.endswith('vsx.c'):
-+        SOURCES[f].flags += ['-mvsx']
- 
- if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
-     CFLAGS += ['-Wno-unreachable-code']
-diff --git a/media/libwebp/src/dsp/rescaler.c b/media/libwebp/src/dsp/rescaler.c
-index eafccd442f25..2c0c8c47a7a3 100644
---- a/media/libwebp/src/dsp/rescaler.c
-+++ b/media/libwebp/src/dsp/rescaler.c
-@@ -207,6 +207,7 @@ extern void WebPRescalerDspInitMIPS32(void);
- extern void WebPRescalerDspInitMIPSdspR2(void);
- extern void WebPRescalerDspInitMSA(void);
- extern void WebPRescalerDspInitNEON(void);
-+extern void WebPRescalerDspInitVSX(void);
- 
- WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
- #if !defined(WEBP_REDUCE_SIZE)
-@@ -238,6 +239,11 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
-     if (VP8GetCPUInfo(kMSA)) {
-       WebPRescalerDspInitMSA();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      WebPRescalerDspInitVSX();
-+    }
- #endif
-   }
- 
-diff --git a/media/libwebp/src/dsp/rescaler_vsx.c b/media/libwebp/src/dsp/rescaler_vsx.c
-new file mode 100644
-index 000000000000..002f232d647a
---- /dev/null
-+++ b/media/libwebp/src/dsp/rescaler_vsx.c
-@@ -0,0 +1,201 @@
-+// Copyright 2015 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of rescaling functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX) && !defined(WEBP_REDUCE_SIZE)
-+
-+#include <altivec.h>
-+#include <assert.h>
-+#include <string.h>
-+
-+#include "src/dsp/cpu.h"
-+#include "src/utils/rescaler_utils.h"
-+#include "src/webp/types.h"
-+
-+typedef __vector unsigned char u8x16;
-+typedef __vector signed short i16x8;
-+typedef __vector unsigned int u32x4;
-+typedef __vector signed int i32x4;
-+typedef __vector unsigned long long u64x2;
-+
-+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
-+#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
-+#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
-+
-+#if (WEBP_RESCALER_RFIX != 32)
-+#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
-+#endif
-+
-+// Returns (x * scale + ROUNDER) >> 32 for each of the four 32-bit lanes.
-+static WEBP_INLINE u32x4 MultFix_VSX(u32x4 x, uint32_t scale) {
-+  const u64x2 rounder = vec_splats((unsigned long long)ROUNDER);
-+  const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
-+  const u32x4 s = vec_splats(scale);
-+  // vec_mule/vec_mulo produce the 32x32->64 products of the even (0, 2) and
-+  // odd (1, 3) lanes respectively.
-+  u64x2 e = vec_add(vec_mule(x, s), rounder);
-+  u64x2 o = vec_add(vec_mulo(x, s), rounder);
-+  e = vec_sr(e, shift);
-+  o = vec_sr(o, shift);
-+  return vec_mergee((u32x4)e, (u32x4)o);
-+}
-+
-+// Returns (x * scale) >> 32 for each lane (no rounding).
-+static WEBP_INLINE u32x4 MultFixFloor_VSX(u32x4 x, uint32_t scale) {
-+  const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
-+  const u32x4 s = vec_splats(scale);
-+  u64x2 e = vec_sr(vec_mule(x, s), shift);
-+  u64x2 o = vec_sr(vec_mulo(x, s), shift);
-+  return vec_mergee((u32x4)e, (u32x4)o);
-+}
-+
-+// Returns (A * frow + B * irow + ROUNDER) >> 32 for each lane.
-+static WEBP_INLINE u32x4 Interpolate_VSX(const rescaler_t* WEBP_RESTRICT frow,
-+                                         const rescaler_t* WEBP_RESTRICT irow,
-+                                         uint32_t A, uint32_t B) {
-+  const u64x2 rounder = vec_splats((unsigned long long)ROUNDER);
-+  const u64x2 shift = vec_splats((unsigned long long)WEBP_RESCALER_RFIX);
-+  const u32x4 f = vec_xl(0, (uint32_t*)frow);
-+  const u32x4 ir = vec_xl(0, (uint32_t*)irow);
-+  const u32x4 va = vec_splats(A);
-+  const u32x4 vb = vec_splats(B);
-+  u64x2 e = vec_add(vec_mule(f, va), vec_mule(ir, vb));
-+  u64x2 o = vec_add(vec_mulo(f, va), vec_mulo(ir, vb));
-+  e = vec_sr(vec_add(e, rounder), shift);
-+  o = vec_sr(vec_add(o, rounder), shift);
-+  return vec_mergee((u32x4)e, (u32x4)o);
-+}
-+
-+// Saturated pack of two 32-bit lane vectors (8 values) into 8 bytes at dst.
-+static WEBP_INLINE void Store8_VSX(u32x4 lo, u32x4 hi, uint8_t* dst) {
-+  const i16x8 s16 = vec_packs((i32x4)lo, (i32x4)hi);
-+  const u8x16 s8 = vec_packsu(s16, s16);
-+  memcpy(dst, &s8, 8);
-+}
-+
-+static void RescalerExportRowExpand_VSX(WebPRescaler* const wrk) {
-+  int x_out;
-+  uint8_t* const dst = wrk->dst;
-+  rescaler_t* const irow = wrk->irow;
-+  const int x_out_max = wrk->dst_width * wrk->num_channels;
-+  const int max_span = x_out_max & ~7;
-+  const rescaler_t* const frow = wrk->frow;
-+  const uint32_t fy_scale = wrk->fy_scale;
-+  assert(!WebPRescalerOutputDone(wrk));
-+  assert(wrk->y_accum <= 0);
-+  assert(wrk->y_expand);
-+  assert(wrk->y_sub != 0);
-+  if (wrk->y_accum == 0) {
-+    for (x_out = 0; x_out < max_span; x_out += 8) {
-+      const u32x4 A0 = vec_xl(0, (uint32_t*)(frow + x_out + 0));
-+      const u32x4 A1 = vec_xl(0, (uint32_t*)(frow + x_out + 4));
-+      const u32x4 B0 = MultFix_VSX(A0, fy_scale);
-+      const u32x4 B1 = MultFix_VSX(A1, fy_scale);
-+      Store8_VSX(B0, B1, dst + x_out);
-+    }
-+    for (; x_out < x_out_max; ++x_out) {
-+      const uint32_t J = frow[x_out];
-+      const int v = (int)MULT_FIX_C(J, fy_scale);
-+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
-+    }
-+  } else {
-+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
-+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
-+    for (x_out = 0; x_out < max_span; x_out += 8) {
-+      const u32x4 C0 =
-+          Interpolate_VSX(frow + x_out + 0, irow + x_out + 0, A, B);
-+      const u32x4 C1 =
-+          Interpolate_VSX(frow + x_out + 4, irow + x_out + 4, A, B);
-+      const u32x4 D0 = MultFix_VSX(C0, fy_scale);
-+      const u32x4 D1 = MultFix_VSX(C1, fy_scale);
-+      Store8_VSX(D0, D1, dst + x_out);
-+    }
-+    for (; x_out < x_out_max; ++x_out) {
-+      const uint64_t I = (uint64_t)A * frow[x_out] + (uint64_t)B * irow[x_out];
-+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
-+      const int v = (int)MULT_FIX_C(J, fy_scale);
-+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
-+    }
-+  }
-+}
-+
-+static void RescalerExportRowShrink_VSX(WebPRescaler* const wrk) {
-+  int x_out;
-+  uint8_t* const dst = wrk->dst;
-+  rescaler_t* const irow = wrk->irow;
-+  const int x_out_max = wrk->dst_width * wrk->num_channels;
-+  const int max_span = x_out_max & ~7;
-+  const rescaler_t* const frow = wrk->frow;
-+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
-+  const uint32_t fxy_scale = wrk->fxy_scale;
-+  assert(!WebPRescalerOutputDone(wrk));
-+  assert(wrk->y_accum <= 0);
-+  assert(!wrk->y_expand);
-+  if (yscale) {
-+    for (x_out = 0; x_out < max_span; x_out += 8) {
-+      const u32x4 in0 = vec_xl(0, (uint32_t*)(frow + x_out + 0));
-+      const u32x4 in1 = vec_xl(0, (uint32_t*)(frow + x_out + 4));
-+      const u32x4 in2 = vec_xl(0, (uint32_t*)(irow + x_out + 0));
-+      const u32x4 in3 = vec_xl(0, (uint32_t*)(irow + x_out + 4));
-+      const u32x4 A0 = MultFixFloor_VSX(in0, yscale);
-+      const u32x4 A1 = MultFixFloor_VSX(in1, yscale);
-+      const u32x4 B0 = vec_sub(in2, A0);
-+      const u32x4 B1 = vec_sub(in3, A1);
-+      const u32x4 C0 = MultFix_VSX(B0, fxy_scale);
-+      const u32x4 C1 = MultFix_VSX(B1, fxy_scale);
-+      Store8_VSX(C0, C1, dst + x_out);
-+      vec_xst(A0, 0, (uint32_t*)(irow + x_out + 0));
-+      vec_xst(A1, 0, (uint32_t*)(irow + x_out + 4));
-+    }
-+    for (; x_out < x_out_max; ++x_out) {
-+      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
-+      const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
-+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
-+      irow[x_out] = frac;  // new fractional start
-+    }
-+  } else {
-+    const u32x4 zero = vec_splats((uint32_t)0);
-+    for (x_out = 0; x_out < max_span; x_out += 8) {
-+      const u32x4 in0 = vec_xl(0, (uint32_t*)(irow + x_out + 0));
-+      const u32x4 in1 = vec_xl(0, (uint32_t*)(irow + x_out + 4));
-+      const u32x4 A0 = MultFix_VSX(in0, fxy_scale);
-+      const u32x4 A1 = MultFix_VSX(in1, fxy_scale);
-+      Store8_VSX(A0, A1, dst + x_out);
-+      vec_xst(zero, 0, (uint32_t*)(irow + x_out + 0));
-+      vec_xst(zero, 0, (uint32_t*)(irow + x_out + 4));
-+    }
-+    for (; x_out < x_out_max; ++x_out) {
-+      const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
-+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
-+      irow[x_out] = 0;
-+    }
-+  }
-+}
-+
-+#undef MULT_FIX_FLOOR_C
-+#undef MULT_FIX_C
-+#undef ROUNDER
-+
-+//------------------------------------------------------------------------------
-+
-+extern void WebPRescalerDspInitVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitVSX(void) {
-+  WebPRescalerExportRowExpand = RescalerExportRowExpand_VSX;
-+  WebPRescalerExportRowShrink = RescalerExportRowShrink_VSX;
-+}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(WebPRescalerDspInitVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/dsp/upsampling.c b/media/libwebp/src/dsp/upsampling.c
-index c57f66c3553f..faecdf277393 100644
---- a/media/libwebp/src/dsp/upsampling.c
-+++ b/media/libwebp/src/dsp/upsampling.c
-@@ -235,6 +235,7 @@ extern VP8CPUInfo VP8GetCPUInfo;
- extern void WebPInitYUV444ConvertersMIPSdspR2(void);
- extern void WebPInitYUV444ConvertersSSE2(void);
- extern void WebPInitYUV444ConvertersSSE41(void);
-+extern void WebPInitYUV444ConvertersVSX(void);
- 
- WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
-   WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
-@@ -264,6 +265,11 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
-     if (VP8GetCPUInfo(kMIPSdspR2)) {
-       WebPInitYUV444ConvertersMIPSdspR2();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      WebPInitYUV444ConvertersVSX();
-+    }
- #endif
-   }
- }
-@@ -276,6 +282,7 @@ extern void WebPInitUpsamplersSSE41(void);
- extern void WebPInitUpsamplersNEON(void);
- extern void WebPInitUpsamplersMIPSdspR2(void);
- extern void WebPInitUpsamplersMSA(void);
-+extern void WebPInitUpsamplersVSX(void);
- 
- WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
- #ifdef FANCY_UPSAMPLING
-@@ -314,6 +321,11 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
-     if (VP8GetCPUInfo(kMSA)) {
-       WebPInitUpsamplersMSA();
-     }
-+#endif
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      WebPInitUpsamplersVSX();
-+    }
- #endif
-   }
- 
-diff --git a/media/libwebp/src/dsp/upsampling_vsx.c b/media/libwebp/src/dsp/upsampling_vsx.c
-new file mode 100644
-index 000000000000..a7191972fc6e
---- /dev/null
-+++ b/media/libwebp/src/dsp/upsampling_vsx.c
-@@ -0,0 +1,151 @@
-+// Copyright 2011 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of YUV to RGB upsampling functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX)
-+
-+#include <altivec.h>
-+#include <assert.h>
-+#include <string.h>
-+
-+#include "src/dsp/cpu.h"
-+#include "src/dsp/yuv.h"
-+#include "src/webp/decode.h"
-+#include "src/webp/types.h"
-+
-+typedef __vector unsigned char u8x16;
-+typedef __vector unsigned short u16x8;
-+
-+// Upsample 16 chroma pairs from rows r1/r2 (17 readable bytes each) into 32
-+// "top" bytes at out[0..31] and 32 "bottom" bytes at out[64..95], matching the
-+// fancy-upsampler diagonal weights (a + 3b + 3c + d) / 8 etc.
-+#define GET_M(ij, in) \
-+  vec_sub(vec_avg(k, (in)), \
-+          vec_and(vec_or(vec_and((ij), st), vec_xor(k, (in))), one))
-+
-+static void Upsample32Pixels(const uint8_t* WEBP_RESTRICT r1,
-+                             const uint8_t* WEBP_RESTRICT r2,
-+                             uint8_t* WEBP_RESTRICT out) {
-+  const u8x16 one = vec_splats((unsigned char)1);
-+  const u8x16 a = vec_xl(0, (const unsigned char*)r1);
-+  const u8x16 b = vec_xl(1, (const unsigned char*)r1);
-+  const u8x16 c = vec_xl(0, (const unsigned char*)r2);
-+  const u8x16 d = vec_xl(1, (const unsigned char*)r2);
-+  const u8x16 s = vec_avg(a, d);
-+  const u8x16 t = vec_avg(b, c);
-+  const u8x16 st = vec_xor(s, t);
-+  const u8x16 t3 =
-+      vec_and(vec_or(vec_or(vec_xor(a, d), vec_xor(b, c)), st), one);
-+  const u8x16 k = vec_sub(vec_avg(s, t), t3);
-+  const u8x16 diag1 = GET_M(vec_xor(b, c), t);
-+  const u8x16 diag2 = GET_M(vec_xor(a, d), s);
-+  const u8x16 ta = vec_avg(a, diag1), tb = vec_avg(b, diag2);
-+  const u8x16 tc = vec_avg(c, diag2), td = vec_avg(d, diag1);
-+  vec_xst(vec_mergeh(ta, tb), 0, out);
-+  vec_xst(vec_mergel(ta, tb), 0, out + 16);
-+  vec_xst(vec_mergeh(tc, td), 0, out + 64);
-+  vec_xst(vec_mergel(tc, td), 0, out + 80);
-+}
-+
-+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, FUNC32)                                 \
-+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
-+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-+                      const uint8_t* WEBP_RESTRICT top_v,                      \
-+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
-+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
-+                      uint8_t* WEBP_RESTRICT top_dst,                          \
-+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
-+  int uv_pos, pos;                                                            \
-+  uint8_t uv_buf[14 * 32 + 15] = {0};                                         \
-+  uint8_t* const r_u = (uint8_t*)(((uintptr_t)(uv_buf + 15)) & ~(uintptr_t)15);\
-+  uint8_t* const r_v = r_u + 32;                                              \
-+  assert(top_y != NULL);                                                      \
-+  {                                                                           \
-+    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                      \
-+    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                      \
-+    FUNC(top_y[0], (top_u[0] + u_diag) >> 1, (top_v[0] + v_diag) >> 1,        \
-+         top_dst);                                                           \
-+    if (bottom_y != NULL) {                                                   \
-+      FUNC(bottom_y[0], (cur_u[0] + u_diag) >> 1, (cur_v[0] + v_diag) >> 1,   \
-+           bottom_dst);                                                      \
-+    }                                                                         \
-+  }                                                                           \
-+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {   \
-+    Upsample32Pixels(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
-+    Upsample32Pixels(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
-+    FUNC32(top_y + pos, r_u, r_v, top_dst + pos * 4);                         \
-+    if (bottom_y != NULL) {                                                   \
-+      FUNC32(bottom_y + pos, r_u + 64, r_v + 64, bottom_dst + pos * 4);       \
-+    }                                                                         \
-+  }                                                                           \
-+  if (len > 1) {                                                              \
-+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                      \
-+    uint8_t* const tmp_top_dst = r_u + 4 * 32;                                \
-+    uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32;                     \
-+    uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32;                         \
-+    uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32;     \
-+    uint8_t r1[17], r2[17];                                                   \
-+    assert(left_over > 0);                                                    \
-+    memcpy(r1, top_u + uv_pos, left_over);                                    \
-+    memcpy(r2, cur_u + uv_pos, left_over);                                    \
-+    memset(r1 + left_over, r1[left_over - 1], 17 - left_over);                \
-+    memset(r2 + left_over, r2[left_over - 1], 17 - left_over);                \
-+    Upsample32Pixels(r1, r2, r_u);                                            \
-+    memcpy(r1, top_v + uv_pos, left_over);                                    \
-+    memcpy(r2, cur_v + uv_pos, left_over);                                    \
-+    memset(r1 + left_over, r1[left_over - 1], 17 - left_over);                \
-+    memset(r2 + left_over, r2[left_over - 1], 17 - left_over);                \
-+    Upsample32Pixels(r1, r2, r_v);                                            \
-+    memcpy(tmp_top, top_y + pos, len - pos);                                  \
-+    if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos);      \
-+    FUNC32(tmp_top, r_u, r_v, tmp_top_dst);                                   \
-+    if (bottom_y != NULL) FUNC32(tmp_bottom, r_u + 64, r_v + 64,             \
-+                                 tmp_bottom_dst);                            \
-+    memcpy(top_dst + pos * 4, tmp_top_dst, (len - pos) * 4);                  \
-+    if (bottom_y != NULL) {                                                   \
-+      memcpy(bottom_dst + pos * 4, tmp_bottom_dst, (len - pos) * 4);          \
-+    }                                                                         \
-+  }                                                                           \
-+}
-+
-+UPSAMPLE_FUNC(UpsampleRgbaLinePair_VSX, VP8YuvToRgba, VP8YuvToRgba32_VSX)
-+UPSAMPLE_FUNC(UpsampleBgraLinePair_VSX, VP8YuvToBgra, VP8YuvToBgra32_VSX)
-+UPSAMPLE_FUNC(UpsampleArgbLinePair_VSX, VP8YuvToArgb, VP8YuvToArgb32_VSX)
-+
-+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-+
-+extern void WebPInitUpsamplersVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersVSX(void) {
-+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_VSX;
-+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_VSX;
-+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_VSX;
-+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_VSX;
-+#if !defined(WEBP_REDUCE_CSP)
-+  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_VSX;
-+  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_VSX;
-+#endif
-+}
-+
-+extern void WebPInitYUV444ConvertersVSX(void);
-+
-+// YUV444 point converters stay on the C path for now.
-+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersVSX(void) {}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersVSX)
-+
-+WEBP_DSP_INIT_STUB(WebPInitUpsamplersVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/dsp/yuv.c b/media/libwebp/src/dsp/yuv.c
-index 62f1ecc1567d..9a95c5de1e23 100644
---- a/media/libwebp/src/dsp/yuv.c
-+++ b/media/libwebp/src/dsp/yuv.c
-@@ -81,6 +81,7 @@ extern void WebPInitSamplersSSE2(void);
- extern void WebPInitSamplersSSE41(void);
- extern void WebPInitSamplersMIPS32(void);
- extern void WebPInitSamplersMIPSdspR2(void);
-+extern void WebPInitSamplersVSX(void);
- 
- WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
-   WebPSamplers[MODE_RGB]       = YuvToRgbRow;
-@@ -117,6 +118,11 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
-       WebPInitSamplersMIPSdspR2();
-     }
- #endif  // WEBP_USE_MIPS_DSP_R2
-+#if defined(WEBP_HAVE_VSX)
-+    if (VP8GetCPUInfo(kVSX)) {
-+      WebPInitSamplersVSX();
-+    }
-+#endif
-   }
- }
- 
-diff --git a/media/libwebp/src/dsp/yuv.h b/media/libwebp/src/dsp/yuv.h
-index 6f218cf7e07f..979891d3232d 100644
---- a/media/libwebp/src/dsp/yuv.h
-+++ b/media/libwebp/src/dsp/yuv.h
-@@ -182,6 +182,27 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
- 
- #endif    // WEBP_USE_SSE2
- 
-+//-----------------------------------------------------------------------------
-+// VSX extra functions (mostly for upsampling_vsx.c)
-+
-+#if defined(WEBP_USE_VSX)
-+
-+// Process 32 pixels and store the 32b-per-pixel result in *dst.
-+void VP8YuvToRgba32_VSX(const uint8_t* WEBP_RESTRICT y,
-+                        const uint8_t* WEBP_RESTRICT u,
-+                        const uint8_t* WEBP_RESTRICT v,
-+                        uint8_t* WEBP_RESTRICT dst);
-+void VP8YuvToBgra32_VSX(const uint8_t* WEBP_RESTRICT y,
-+                        const uint8_t* WEBP_RESTRICT u,
-+                        const uint8_t* WEBP_RESTRICT v,
-+                        uint8_t* WEBP_RESTRICT dst);
-+void VP8YuvToArgb32_VSX(const uint8_t* WEBP_RESTRICT y,
-+                        const uint8_t* WEBP_RESTRICT u,
-+                        const uint8_t* WEBP_RESTRICT v,
-+                        uint8_t* WEBP_RESTRICT dst);
-+
-+#endif    // WEBP_USE_VSX
-+
- //-----------------------------------------------------------------------------
- // SSE41 extra functions (mostly for upsampling_sse41.c)
- 
-diff --git a/media/libwebp/src/dsp/yuv_vsx.c b/media/libwebp/src/dsp/yuv_vsx.c
-new file mode 100644
-index 000000000000..1fdc5c80ba16
---- /dev/null
-+++ b/media/libwebp/src/dsp/yuv_vsx.c
-@@ -0,0 +1,206 @@
-+// Copyright 2014 Google Inc. All Rights Reserved.
-+//
-+// Use of this source code is governed by a BSD-style license
-+// that can be found in the COPYING file in the root of the source
-+// tree. An additional intellectual property rights grant can be found
-+// in the file PATENTS. All contributing project authors may
-+// be found in the AUTHORS file in the root of the source tree.
-+// -----------------------------------------------------------------------------
-+//
-+// VSX (PowerPC) version of YUV->RGB conversion functions.
-+
-+#include "src/dsp/dsp.h"
-+
-+#if defined(WEBP_USE_VSX)
-+
-+#include <altivec.h>
-+#include <string.h>
-+
-+#include "src/dsp/yuv.h"
-+
-+typedef __vector unsigned char  u8x16;
-+typedef __vector unsigned short u16x8;
-+typedef __vector signed   short i16x8;
-+typedef __vector unsigned int   u32x4;
-+
-+// POWER8 has no "multiply-high unsigned halfword", so emulate _mm_mulhi_epu16
-+// via even/odd 16x16->32 products, >>16, then interleave back.
-+static WEBP_INLINE u16x8 MulHi16(u16x8 a, u16x8 b) {
-+  const u32x4 sh = vec_splats((unsigned int)16);
-+  const u32x4 e = vec_sr(vec_mule(a, b), sh);
-+  const u32x4 o = vec_sr(vec_mulo(a, b), sh);
-+  return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
-+}
-+
-+// 14b fixed-point ITU-R BT.601 YUV->RGB, matching the SSE2/scalar path.
-+// Inputs are samples pre-shifted into the high byte (<< 8).
-+static WEBP_INLINE void ConvertYUV444ToRGB(u16x8 Y0, u16x8 U0, u16x8 V0,
-+                                           i16x8* const R, i16x8* const G,
-+                                           u16x8* const B) {
-+  const u16x8 k19077 = vec_splats((unsigned short)19077);
-+  const u16x8 k26149 = vec_splats((unsigned short)26149);
-+  const u16x8 k14234 = vec_splats((unsigned short)14234);
-+  const u16x8 k33050 = vec_splats((unsigned short)33050);
-+  const u16x8 k17685 = vec_splats((unsigned short)17685);
-+  const u16x8 k6419  = vec_splats((unsigned short)6419);
-+  const u16x8 k13320 = vec_splats((unsigned short)13320);
-+  const u16x8 k8708  = vec_splats((unsigned short)8708);
-+  const u16x8 six    = vec_splats((unsigned short)6);
-+
-+  const u16x8 Y1 = MulHi16(Y0, k19077);
-+  const u16x8 R2 = vec_add(vec_sub(Y1, k14234), MulHi16(V0, k26149));
-+  const u16x8 G4 = vec_sub(vec_add(Y1, k8708),
-+                           vec_add(MulHi16(U0, k6419), MulHi16(V0, k13320)));
-+  // 33050 needs unsigned saturating arithmetic; B can exceed 32767.
-+  const u16x8 B2 = vec_subs(vec_adds(MulHi16(U0, k33050), Y1), k17685);
-+
-+  *R = vec_sra((i16x8)R2, six);
-+  *G = vec_sra((i16x8)G4, six);
-+  *B = vec_sr(B2, six);
-+}
-+
-+// Load 8 bytes into the high byte of 8 u16 lanes (i.e. sample << 8).
-+// Use an 8-byte copy (not a 16-byte vector load) to avoid reading past the
-+// end of the source row, matching the SSE2 _mm_loadl_epi64 behavior.
-+static WEBP_INLINE u16x8 LoadHi16(const uint8_t* WEBP_RESTRICT src) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  unsigned char tmp[16] = {0};
-+  memcpy(tmp, src, 8);
-+  return (u16x8)vec_mergeh(zero, vec_xl(0, tmp));
-+}
-+
-+// Load 4 U/V bytes, shift into the high byte, and replicate each sample.
-+static WEBP_INLINE u16x8 LoadUVHi8(const uint8_t* WEBP_RESTRICT src) {
-+  const u8x16 zero = vec_splats((unsigned char)0);
-+  unsigned char tmp[16] = {0};
-+  memcpy(tmp, src, 4);
-+  const u16x8 t = (u16x8)vec_mergeh(zero, vec_xl(0, tmp));
-+  return vec_mergeh(t, t);
-+}
-+
-+static WEBP_INLINE void YUV420ToRGB(const uint8_t* WEBP_RESTRICT y,
-+                                    const uint8_t* WEBP_RESTRICT u,
-+                                    const uint8_t* WEBP_RESTRICT v,
-+                                    i16x8* const R, i16x8* const G,
-+                                    u16x8* const B) {
-+  ConvertYUV444ToRGB(LoadHi16(y), LoadUVHi8(u), LoadUVHi8(v), R, G, B);
-+}
-+
-+// Pack four 8-lane channels into 32 interleaved bytes (c0 c1 c2 c3 per pixel).
-+static WEBP_INLINE void PackAndStore4(i16x8 c0, i16x8 c1, i16x8 c2, i16x8 c3,
-+                                      uint8_t* WEBP_RESTRICT dst) {
-+  const u8x16 c02 = vec_packsu(c0, c2);
-+  const u8x16 c13 = vec_packsu(c1, c3);
-+  const u8x16 lo8 = vec_mergeh(c02, c13);
-+  const u8x16 hi8 = vec_mergel(c02, c13);
-+  vec_xst((u8x16)vec_mergeh((u16x8)lo8, (u16x8)hi8), 0, dst);
-+  vec_xst((u8x16)vec_mergel((u16x8)lo8, (u16x8)hi8), 0, dst + 16);
-+}
-+
-+static const i16x8 kAlpha = {255, 255, 255, 255, 255, 255, 255, 255};
-+
-+static void YuvToRgbaRow_VSX(const uint8_t* WEBP_RESTRICT y,
-+                             const uint8_t* WEBP_RESTRICT u,
-+                             const uint8_t* WEBP_RESTRICT v,
-+                             uint8_t* WEBP_RESTRICT dst, int len) {
-+  int n;
-+  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
-+    i16x8 R, G; u16x8 B;
-+    YUV420ToRGB(y, u, v, &R, &G, &B);
-+    PackAndStore4(R, G, (i16x8)B, kAlpha, dst);
-+    y += 8; u += 4; v += 4;
-+  }
-+  for (; n < len; ++n) {
-+    VP8YuvToRgba(y[0], u[0], v[0], dst);
-+    dst += 4; y += 1; u += (n & 1); v += (n & 1);
-+  }
-+}
-+
-+static void YuvToBgraRow_VSX(const uint8_t* WEBP_RESTRICT y,
-+                             const uint8_t* WEBP_RESTRICT u,
-+                             const uint8_t* WEBP_RESTRICT v,
-+                             uint8_t* WEBP_RESTRICT dst, int len) {
-+  int n;
-+  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
-+    i16x8 R, G; u16x8 B;
-+    YUV420ToRGB(y, u, v, &R, &G, &B);
-+    PackAndStore4((i16x8)B, G, R, kAlpha, dst);
-+    y += 8; u += 4; v += 4;
-+  }
-+  for (; n < len; ++n) {
-+    VP8YuvToBgra(y[0], u[0], v[0], dst);
-+    dst += 4; y += 1; u += (n & 1); v += (n & 1);
-+  }
-+}
-+
-+static void YuvToArgbRow_VSX(const uint8_t* WEBP_RESTRICT y,
-+                             const uint8_t* WEBP_RESTRICT u,
-+                             const uint8_t* WEBP_RESTRICT v,
-+                             uint8_t* WEBP_RESTRICT dst, int len) {
-+  int n;
-+  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
-+    i16x8 R, G; u16x8 B;
-+    YUV420ToRGB(y, u, v, &R, &G, &B);
-+    PackAndStore4(kAlpha, R, G, (i16x8)B, dst);
-+    y += 8; u += 4; v += 4;
-+  }
-+  for (; n < len; ++n) {
-+    VP8YuvToArgb(y[0], u[0], v[0], dst);
-+    dst += 4; y += 1; u += (n & 1); v += (n & 1);
-+  }
-+}
-+
-+// Convert 32 YUV444 pixels and store the 32b-per-pixel result. Used by the
-+// fancy upsampler in upsampling_vsx.c.
-+void VP8YuvToRgba32_VSX(const uint8_t* WEBP_RESTRICT y,
-+                        const uint8_t* WEBP_RESTRICT u,
-+                        const uint8_t* WEBP_RESTRICT v,
-+                        uint8_t* WEBP_RESTRICT dst) {
-+  int n;
-+  for (n = 0; n < 32; n += 8, dst += 32) {
-+    i16x8 R, G; u16x8 B;
-+    ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
-+                       &R, &G, &B);
-+    PackAndStore4(R, G, (i16x8)B, kAlpha, dst);
-+  }
-+}
-+
-+void VP8YuvToBgra32_VSX(const uint8_t* WEBP_RESTRICT y,
-+                        const uint8_t* WEBP_RESTRICT u,
-+                        const uint8_t* WEBP_RESTRICT v,
-+                        uint8_t* WEBP_RESTRICT dst) {
-+  int n;
-+  for (n = 0; n < 32; n += 8, dst += 32) {
-+    i16x8 R, G; u16x8 B;
-+    ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
-+                       &R, &G, &B);
-+    PackAndStore4((i16x8)B, G, R, kAlpha, dst);
-+  }
-+}
-+
-+void VP8YuvToArgb32_VSX(const uint8_t* WEBP_RESTRICT y,
-+                        const uint8_t* WEBP_RESTRICT u,
-+                        const uint8_t* WEBP_RESTRICT v,
-+                        uint8_t* WEBP_RESTRICT dst) {
-+  int n;
-+  for (n = 0; n < 32; n += 8, dst += 32) {
-+    i16x8 R, G; u16x8 B;
-+    ConvertYUV444ToRGB(LoadHi16(y + n), LoadHi16(u + n), LoadHi16(v + n),
-+                       &R, &G, &B);
-+    PackAndStore4(kAlpha, R, G, (i16x8)B, dst);
-+  }
-+}
-+
-+extern void WebPInitSamplersVSX(void);
-+
-+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersVSX(void) {
-+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_VSX;
-+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_VSX;
-+  WebPSamplers[MODE_ARGB] = YuvToArgbRow_VSX;
-+}
-+
-+#else  // !WEBP_USE_VSX
-+
-+WEBP_DSP_INIT_STUB(WebPInitSamplersVSX)
-+
-+#endif  // WEBP_USE_VSX
-diff --git a/media/libwebp/src/moz/cpu.cpp b/media/libwebp/src/moz/cpu.cpp
-index c6633170c923..82986d2f631e 100644
---- a/media/libwebp/src/moz/cpu.cpp
-+++ b/media/libwebp/src/moz/cpu.cpp
-@@ -35,6 +35,10 @@ static int MozCPUInfo(CPUFeature feature)
-     case kMIPSdspR2:
-     case kMSA:
-       return 1;
-+#endif
-+#if defined(WEBP_USE_VSX)
-+    case kVSX:
-+      return 1;
- #endif
-     default:
-       return 0;
--- 
-2.52.0
-

diff --git a/0003-Add-PPC64LE-JIT-backend.patch b/0003-Add-PPC64LE-JIT-backend.patch
deleted file mode 100644
index ee08b33..0000000
--- a/0003-Add-PPC64LE-JIT-backend.patch
+++ /dev/null
@@ -1,38205 +0,0 @@
-From c79926e41764c6aa6ae596812b23bc35b470028c Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Trung=20L=C3=AA?= <8@tle.id.au>
-Date: Fri, 12 Jun 2026 16:02:28 +1000
-Subject: [PATCH 3/3] Add PPC64LE JIT backend
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Based on the work done by Cameron Kaiser and Justin Hibbits
-https://github.com/chmeeedalf/gecko-dev
-
-Co-authored-by: Cameron Kaiser <classilla@floodgap.com>
-Co-authored-by: Justin Hibbits <chmeeedalf@gmail.com>
-Assisted-by: Lance Albertson <lance@osuosl.org>
-Assisted-by: Thushan Fernando <thushan@thushanfernando.com>
-Assisted-by: Timothy Pearson <tpearson@solidsilicon.com>
-Assisted-by: Dan Horák <dan@danny.cz>
-Assisted-by: Hiếu Lê <modology@gmail.com>
-Assisted-by: Claude Fable 5 <noreply@anthropic.com>
----
- config/check_macroassembler_style.py          |    2 +
- js/moz.configure                              |   34 +-
- js/src/builtin/TestingFunctions.cpp           |   18 +
- js/src/irregexp/RegExpAPI.cpp                 |    5 +-
- .../irregexp/RegExpNativeMacroAssembler.cpp   |   28 +
- .../tests/baseline/ppc64-branch8-16-narrow.js |  103 +
- js/src/jit-test/tests/gc/gcparam.js           |    3 +-
- .../tests/ion/mod-constant-pow2-minus-one.js  |   78 +
- .../tests/ion/mod-pow2-negative-dividend.js   |   71 +
- .../tests/math-min-max-corner-cases.js        |   50 +
- js/src/jit-test/tests/wasm/atomicity.js       |    8 +-
- .../jit-test/tests/wasm/excessive-inlining.js |   19 +-
- .../jit-test/tests/wasm/memory-oob-message.js |   10 +-
- .../tests/wasm/ppc64-argon2-tiering.js        |  124 +
- .../tests/wasm/ppc64-compare-select-bench.js  |   70 +
- .../jit-test/tests/wasm/ppc64-extmul-alias.js |  107 +
- .../tests/wasm/ppc64-simd-vr-clobber.js       |  179 +
- js/src/jit-test/tests/wasm/profiling.js       |    7 +
- .../wasm/regress-ppc64-extract-lane-ctz.js    |   49 +
- .../wasm/regress-ppc64-select-condition.js    |   30 +
- .../wasm/regress-ppc64-trap-exit-simd-save.js |   64 +
- .../bug-ppc64-simd-reduce-and-branch.js       |    7 +
- .../bug-ppc64-simd-reduce-and-branch.wasm     |  Bin 0 -> 1148 bytes
- js/src/jit-test/tests/wasm/simd/bug1946618.js |    7 +-
- .../jit-test/tests/wasm/simd/ion-analysis.js  |    7 +-
- js/src/jit/Assembler.h                        |    2 +
- js/src/jit/BaselineIC.cpp                     |    2 +
- js/src/jit/CacheIRCompiler.cpp                |   16 +
- js/src/jit/CodeGenerator.cpp                  |    6 +
- js/src/jit/CodeGenerator.h                    |    2 +
- js/src/jit/EffectiveAddressAnalysis.cpp       |    2 +-
- js/src/jit/ExecutableAllocator.cpp            |   10 +-
- js/src/jit/FlushICache.cpp                    |    3 +-
- js/src/jit/FlushICache.h                      |   11 +-
- js/src/jit/GenerateABIFunctionType.py         |  100 +
- js/src/jit/JitContext.cpp                     |    4 +
- js/src/jit/JitFrames.cpp                      |   10 +
- js/src/jit/JitFrames.h                        |   12 +-
- js/src/jit/LIR.cpp                            |    4 +-
- js/src/jit/LIR.h                              |   10 +-
- js/src/jit/LIROps.yaml                        |   82 +-
- js/src/jit/Label.h                            |    2 +-
- js/src/jit/Lowering.cpp                       |    2 +-
- js/src/jit/Lowering.h                         |    2 +
- js/src/jit/MacroAssembler-inl.h               |    2 +
- js/src/jit/MacroAssembler.cpp                 |   25 +-
- js/src/jit/MacroAssembler.h                   |  647 +-
- js/src/jit/MoveEmitter.h                      |    2 +
- js/src/jit/MoveResolver.cpp                   |   16 +
- js/src/jit/RegisterAllocator.h                |    7 +-
- js/src/jit/Registers.h                        |    2 +
- js/src/jit/Safepoints.cpp                     |   11 +
- js/src/jit/SharedICHelpers-inl.h              |    2 +
- js/src/jit/SharedICHelpers.h                  |    2 +
- js/src/jit/SharedICRegisters.h                |    2 +
- js/src/jit/Simulator.h                        |    2 +
- js/src/jit/moz.build                          |   12 +
- js/src/jit/ppc64/Architecture-ppc64.cpp       |  221 +
- js/src/jit/ppc64/Architecture-ppc64.h         |  581 ++
- js/src/jit/ppc64/Assembler-ppc64.cpp          | 3028 +++++++
- js/src/jit/ppc64/Assembler-ppc64.h            | 2114 +++++
- js/src/jit/ppc64/CodeGenerator-ppc64.cpp      | 3647 ++++++++
- js/src/jit/ppc64/CodeGenerator-ppc64.h        |  101 +
- js/src/jit/ppc64/LIR-ppc64.h                  |  135 +
- js/src/jit/ppc64/Lowering-ppc64.cpp           | 1324 +++
- js/src/jit/ppc64/Lowering-ppc64.h             |  105 +
- js/src/jit/ppc64/MacroAssembler-ppc64-inl.h   | 6142 ++++++++++++++
- js/src/jit/ppc64/MacroAssembler-ppc64.cpp     | 3467 ++++++++
- js/src/jit/ppc64/MacroAssembler-ppc64.h       | 2031 +++++
- js/src/jit/ppc64/MoveEmitter-ppc64.cpp        |  357 +
- js/src/jit/ppc64/MoveEmitter-ppc64.h          |   64 +
- js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h  |   83 +
- js/src/jit/ppc64/SharedICHelpers-ppc64.h      |   97 +
- js/src/jit/ppc64/SharedICRegisters-ppc64.h    |   46 +
- js/src/jit/ppc64/Simulator-ppc64.cpp          | 7296 +++++++++++++++++
- js/src/jit/ppc64/Simulator-ppc64.h            |  556 ++
- js/src/jit/ppc64/Trampoline-ppc64.cpp         |  648 ++
- js/src/jit/shared/Assembler-shared.h          |    5 +-
- .../AtomicOperations-feeling-lucky-gcc.h      |    3 +-
- js/src/jit/shared/CodeGenerator-shared.cpp    |    6 +-
- js/src/jit/shared/Lowering-shared-inl.h       |    2 +-
- js/src/js-config.mozbuild                     |    1 +
- js/src/jsapi-tests/testJitABIcalls.cpp        |    3 +
- js/src/jsapi-tests/testWasmReturnCalls.cpp    |   10 +-
- js/src/jsapi-tests/testsJit.cpp               |   20 +
- js/src/shell/js.cpp                           |   25 +
- js/src/shell/jsshell.h                        |    3 +-
- js/src/tests/shell/os.js                      |    8 +-
- js/src/util/Poison.h                          |    2 +
- js/src/wasm/WasmAnyRef.h                      |    7 +-
- js/src/wasm/WasmBCDefs.h                      |    7 +
- js/src/wasm/WasmBCMemory.cpp                  |   47 +-
- js/src/wasm/WasmBCRegDefs.h                   |   12 +-
- js/src/wasm/WasmBaselineCompile.cpp           |  148 +-
- js/src/wasm/WasmCodegenConstants.h            |    3 +-
- js/src/wasm/WasmCodegenTypes.cpp              |   11 +-
- js/src/wasm/WasmCompile.cpp                   |    6 +-
- js/src/wasm/WasmFrameIter.cpp                 |  118 +
- js/src/wasm/WasmGC.cpp                        |    8 +
- js/src/wasm/WasmGenerator.cpp                 |   18 +-
- js/src/wasm/WasmIonCompile.cpp                |    2 +-
- js/src/wasm/WasmMemory.cpp                    |    4 +-
- js/src/wasm/WasmSignalHandlers.cpp            |   20 +-
- js/src/wasm/WasmStacks.cpp                    |   31 +-
- js/src/wasm/WasmStubs.cpp                     |   43 +-
- js/src/wasm/WasmSummarizeInsn.cpp             |  163 +
- js/src/wasm/WasmValue.cpp                     |    2 +-
- mfbt/Assertions.h                             |    5 +
- 108 files changed, 34442 insertions(+), 438 deletions(-)
- create mode 100644 js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
- create mode 100644 js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
- create mode 100644 js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
- create mode 100644 js/src/jit-test/tests/math-min-max-corner-cases.js
- create mode 100644 js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
- create mode 100644 js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
- create mode 100644 js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
- create mode 100644 js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
- create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
- create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
- create mode 100644 js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
- create mode 100644 js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
- create mode 100644 js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.wasm
- create mode 100644 js/src/jit/ppc64/Architecture-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/Architecture-ppc64.h
- create mode 100644 js/src/jit/ppc64/Assembler-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/Assembler-ppc64.h
- create mode 100644 js/src/jit/ppc64/CodeGenerator-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/CodeGenerator-ppc64.h
- create mode 100644 js/src/jit/ppc64/LIR-ppc64.h
- create mode 100644 js/src/jit/ppc64/Lowering-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/Lowering-ppc64.h
- create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
- create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/MacroAssembler-ppc64.h
- create mode 100644 js/src/jit/ppc64/MoveEmitter-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/MoveEmitter-ppc64.h
- create mode 100644 js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
- create mode 100644 js/src/jit/ppc64/SharedICHelpers-ppc64.h
- create mode 100644 js/src/jit/ppc64/SharedICRegisters-ppc64.h
- create mode 100644 js/src/jit/ppc64/Simulator-ppc64.cpp
- create mode 100644 js/src/jit/ppc64/Simulator-ppc64.h
- create mode 100644 js/src/jit/ppc64/Trampoline-ppc64.cpp
-
-diff --git a/config/check_macroassembler_style.py b/config/check_macroassembler_style.py
-index aa1a54104e26..ba73de388099 100644
---- a/config/check_macroassembler_style.py
-+++ b/config/check_macroassembler_style.py
-@@ -33,6 +33,7 @@ all_architecture_names = set([
-     "arm64",
-     "loong64",
-     "riscv64",
-+    "ppc64",
-     "wasm32",
- ])
- all_shared_architecture_names = set([
-@@ -41,6 +42,7 @@ all_shared_architecture_names = set([
-     "arm64",
-     "loong64",
-     "riscv64",
-+    "ppc64",
-     "wasm32",
- ])
- 
-diff --git a/js/moz.configure b/js/moz.configure
-index 26cc85622654..5310dd08506f 100644
---- a/js/moz.configure
-+++ b/js/moz.configure
-@@ -264,6 +264,7 @@ def jit_default(target, enable_portable_baseline_interp):
-         "aarch64",
-         "mips64",
-         "loongarch64",
-+        "ppc64",
-         "riscv64",
-     ):
-         return True
-@@ -285,7 +286,7 @@ def report_deprecated(value):
- # =======================================================
- option(
-     "--enable-simulator",
--    choices=("arm", "arm64", "mips64", "loong64", "riscv64"),
-+    choices=("arm", "arm64", "mips64", "loong64", "riscv64", "ppc64"),
-     nargs=1,
-     help="Enable a JIT code simulator for the specified architecture",
- )
-@@ -302,7 +303,7 @@ def simulator(jit_enabled, simulator_enabled, target):
-         if target.cpu != "x86":
-             die("The %s simulator only works on x86." % sim_cpu)
- 
--    if sim_cpu in ("arm64", "mips64", "loong64", "riscv64"):
-+    if sim_cpu in ("arm64", "mips64", "loong64", "riscv64", "ppc64"):
-         if target.cpu != "x86_64" and target.cpu != "aarch64":
-             die("The %s simulator only works on x86-64 or arm64." % sim_cpu)
- 
-@@ -315,12 +316,14 @@ set_config("JS_SIMULATOR_ARM64", simulator.arm64)
- set_config("JS_SIMULATOR_MIPS64", simulator.mips64)
- set_config("JS_SIMULATOR_LOONG64", simulator.loong64)
- set_config("JS_SIMULATOR_RISCV64", simulator.riscv64)
-+set_config("JS_SIMULATOR_PPC64", simulator.ppc64)
- set_define("JS_SIMULATOR", depends_if(simulator)(lambda x: True))
- set_define("JS_SIMULATOR_ARM", simulator.arm)
- set_define("JS_SIMULATOR_ARM64", simulator.arm64)
- set_define("JS_SIMULATOR_MIPS64", simulator.mips64)
- set_define("JS_SIMULATOR_LOONG64", simulator.loong64)
- set_define("JS_SIMULATOR_RISCV64", simulator.riscv64)
-+set_define("JS_SIMULATOR_PPC64", simulator.ppc64)
- 
- 
- @depends("--enable-jit", simulator, target)
-@@ -337,6 +340,8 @@ def jit_codegen(jit_enabled, simulator, target):
-         return namespace(x64=True)
-     elif target.cpu == "loongarch64":
-         return namespace(loong64=True)
-+    elif target.cpu == "ppc64":
-+        return namespace(ppc64=True)
-     elif target.cpu == "riscv64":
-         return namespace(riscv64=True)
- 
-@@ -348,6 +353,7 @@ set_config("JS_CODEGEN_ARM", jit_codegen.arm)
- set_config("JS_CODEGEN_ARM64", jit_codegen.arm64)
- set_config("JS_CODEGEN_MIPS64", jit_codegen.mips64)
- set_config("JS_CODEGEN_LOONG64", jit_codegen.loong64)
-+set_config("JS_CODEGEN_PPC64", jit_codegen.ppc64)
- set_config("JS_CODEGEN_RISCV64", jit_codegen.riscv64)
- set_config("JS_CODEGEN_X86", jit_codegen.x86)
- set_config("JS_CODEGEN_X64", jit_codegen.x64)
-@@ -358,6 +364,7 @@ set_define("JS_CODEGEN_ARM", jit_codegen.arm)
- set_define("JS_CODEGEN_ARM64", jit_codegen.arm64)
- set_define("JS_CODEGEN_MIPS64", jit_codegen.mips64)
- set_define("JS_CODEGEN_LOONG64", jit_codegen.loong64)
-+set_define("JS_CODEGEN_PPC64", jit_codegen.ppc64)
- set_define("JS_CODEGEN_RISCV64", jit_codegen.riscv64)
- set_define("JS_CODEGEN_X86", jit_codegen.x86)
- set_define("JS_CODEGEN_X64", jit_codegen.x64)
-@@ -728,7 +735,7 @@ def default_wasm_jspi(
-         return
- 
-     if simulator:
--        return simulator[0] in ("arm64", "arm", "loong64", "mips64", "riscv64")
-+        return simulator[0] in ("arm64", "arm", "loong64", "mips64", "ppc64", "riscv64")
- 
-     if target.cpu in (
-         "x86_64",
-@@ -737,6 +744,7 @@ def default_wasm_jspi(
-         "arm",
-         "loongarch64",
-         "mips64",
-+        "ppc64",
-         "riscv64",
-     ):
-         return True
-@@ -768,10 +776,11 @@ def wasm_jspi(value, jit_enabled, simulator, no_experimental, target):
-         "arm",
-         "loong64",
-         "mips64",
-+        "ppc64",
-         "riscv64",
-     ):
-         die(
--            "--enable-wasm-jspi is only supported for arm64/arm/loong64/mips64/riscv64 simulators"
-+            "--enable-wasm-jspi is only supported for arm64/arm/loong64/mips64/ppc64/riscv64 simulators"
-         )
- 
-     if target.cpu in (
-@@ -781,12 +790,13 @@ def wasm_jspi(value, jit_enabled, simulator, no_experimental, target):
-         "arm",
-         "loongarch64",
-         "mips64",
-+        "ppc64",
-         "riscv64",
-     ):
-         return True
- 
-     die(
--        "--enable-wasm-jspi only possible when targeting the x86_64/x86/arm64/arm/loongarch64/mips64/riscv64 jits"
-+        "--enable-wasm-jspi only possible when targeting the x86_64/x86/arm64/arm/loongarch64/mips64/ppc64/riscv64 jits"
-     )
- 
- 
-@@ -821,10 +831,10 @@ def default_wasm_simd(jit_enabled, simulator, target):
-     if not jit_enabled:
-         return
- 
--    if simulator and (simulator[0] != "arm64"):
-+    if simulator and simulator[0] not in ("arm64", "ppc64"):
-         return
- 
--    if target.cpu in ("x86_64", "x86", "aarch64"):
-+    if target.cpu in ("x86_64", "x86", "aarch64", "ppc64"):
-         return True
- 
- 
-@@ -849,13 +859,15 @@ def wasm_simd(value, jit_enabled, simulator, target, no_experimental):
-     if not jit_enabled:
-         die("--enable-wasm-simd requires --enable-jit")
- 
--    if simulator and (simulator[0] != "arm64"):
--        die("--enable-wasm-simd is not supported for simulators, except arm64")
-+    if simulator and simulator[0] not in ("arm64", "ppc64"):
-+        die(
-+            "--enable-wasm-simd is not supported for simulators, except arm64 and ppc64"
-+        )
- 
--    if target.cpu in ("x86_64", "x86", "aarch64"):
-+    if target.cpu in ("x86_64", "x86", "aarch64", "ppc64"):
-         return True
- 
--    die("--enable-wasm-simd only possible when targeting the x86_64/x86/arm64 jits")
-+    die("--enable-wasm-simd only possible when targeting the x86_64/x86/arm64/ppc64 jits")
- 
- 
- set_config("ENABLE_WASM_SIMD", wasm_simd)
-diff --git a/js/src/builtin/TestingFunctions.cpp b/js/src/builtin/TestingFunctions.cpp
-index be8b3d0e16b6..2291d58dc0a1 100644
---- a/js/src/builtin/TestingFunctions.cpp
-+++ b/js/src/builtin/TestingFunctions.cpp
-@@ -447,6 +447,15 @@ static bool GetBuildConfiguration(JSContext* cx, unsigned argc, Value* vp) {
-     return false;
-   }
- 
-+#ifdef JS_CODEGEN_PPC64
-+  value = BooleanValue(true);
-+#else
-+  value = BooleanValue(false);
-+#endif
-+  if (!JS_SetProperty(cx, info, "ppc64", value)) {
-+    return false;
-+  }
-+
- #ifdef JS_CODEGEN_LOONG64
-   value = BooleanValue(true);
- #else
-@@ -483,6 +492,15 @@ static bool GetBuildConfiguration(JSContext* cx, unsigned argc, Value* vp) {
-     return false;
-   }
- 
-+#ifdef JS_SIMULATOR_PPC64
-+  value = BooleanValue(true);
-+#else
-+  value = BooleanValue(false);
-+#endif
-+  if (!JS_SetProperty(cx, info, "ppc64-simulator", value)) {
-+    return false;
-+  }
-+
- #ifdef MOZ_ASAN
-   value = BooleanValue(true);
- #else
-diff --git a/js/src/irregexp/RegExpAPI.cpp b/js/src/irregexp/RegExpAPI.cpp
-index 310cd85c6a20..377509574f28 100644
---- a/js/src/irregexp/RegExpAPI.cpp
-+++ b/js/src/irregexp/RegExpAPI.cpp
-@@ -495,7 +495,10 @@ class RegExpDepthCheck final : public v8::internal::regexp::Visitor {
- 
-   // This size is picked to be comfortably larger than any
-   // RegExp*::ToNode stack frame.
--#if !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
-+#if defined(__powerpc64__)
-+  // PPC64 ELFv2 has larger minimum stack frames.
-+  static const size_t FRAME_PADDING = 256 * 4;
-+#elif !defined(DEBUG) && !defined(MOZ_CODE_COVERAGE)
-   static const size_t FRAME_PADDING = 256;
- #else
-   // Use a slightly larger padding for debug and code coverage builds.
-diff --git a/js/src/irregexp/RegExpNativeMacroAssembler.cpp b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
-index ae351226797b..a396aeb3c731 100644
---- a/js/src/irregexp/RegExpNativeMacroAssembler.cpp
-+++ b/js/src/irregexp/RegExpNativeMacroAssembler.cpp
-@@ -990,8 +990,21 @@ void SMRegExpMacroAssembler::CheckBacktrackStackLimit() {
-       AbsoluteAddress(isolate()->regexp_stack()->limit_address_address()),
-       backtrack_stack_pointer_, &no_stack_overflow);
- 
-+#ifdef JS_CODEGEN_PPC64
-+  // LR on PowerPC isn't a GPR, so we have to explicitly save it before
-+  // calling or the regexp's return address will be clobbered.
-+  masm_.xs_mflr(temp1_);
-+  masm_.as_stdu(temp1_, masm_.getStackPointer(), -8);
-+#endif
-+
-   masm_.call(&stack_overflow_label_);
- 
-+#ifdef JS_CODEGEN_PPC64
-+  masm_.as_ld(temp1_, masm_.getStackPointer(), 0);
-+  masm_.xs_mtlr(temp1_);
-+  masm_.as_addi(masm_.getStackPointer(), masm_.getStackPointer(), 8);
-+#endif
-+
-   // Exit with an exception if the call failed
-   masm_.branchTest32(Assembler::Zero, temp0_, temp0_,
-                      &exit_with_exception_label_);
-@@ -1080,6 +1093,13 @@ void SMRegExpMacroAssembler::createStackFrame() {
-   masm_.initPseudoStackPtr();
- #endif
- 
-+#ifdef JS_CODEGEN_PPC64
-+  // PPC64's link register is an SPR, not a GPR, so it cannot be included in
-+  // SavedNonVolatileRegisters. Save it explicitly before the frame pointer
-+  // so that abiret()'s blr can return to the caller after we restore it.
-+  masm_.pushReturnAddress();
-+#endif
-+
-   masm_.Push(js::jit::FramePointer);
-   masm_.moveStackPtrTo(js::jit::FramePointer);
- 
-@@ -1308,6 +1328,9 @@ void SMRegExpMacroAssembler::exitHandler() {
-   // Perform a plain Ret(), as abiret() will move SP <- PSP and that is wrong.
-   masm_.Ret(vixl::lr);
- #else
-+#  ifdef JS_CODEGEN_PPC64
-+  masm_.popReturnAddress();
-+#  endif
-   masm_.abiret();
- #endif
- 
-@@ -1351,6 +1374,11 @@ void SMRegExpMacroAssembler::stackOverflowHandler() {
- 
-   // Adjust for the return address on the stack.
-   size_t frameOffset = sizeof(void*);
-+#ifdef JS_CODEGEN_PPC64
-+  // CheckBacktrackStackLimit pushes LR before calling us, so there's a
-+  // second return address on the stack.
-+  frameOffset += sizeof(void*);
-+#endif
- 
-   volatileRegs.takeUnchecked(temp0_);
-   volatileRegs.takeUnchecked(temp1_);
-diff --git a/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js b/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
-new file mode 100644
-index 000000000000..fc1074a9ef8b
---- /dev/null
-+++ b/js/src/jit-test/tests/baseline/ppc64-branch8-16-narrow.js
-@@ -0,0 +1,103 @@
-+// Regression test for PPC64 branch8/branch16 width-narrowing under Equal /
-+// NotEqual / unsigned comparisons. Two prior bugs:
-+//
-+//   1. Sign-extending the load while move32(Imm32) zero-extended the imm
-+//      caused spurious mismatch when the loaded byte/halfword had its high
-+//      bit set (e.g. "ÀÁÂ".startsWith("ÀÁÂ") returned false because byte 0xC0
-+//      sign-extended to 0xFF...C0 but the imm 0xC0 zero-extended to 0x00C0,
-+//      so cmpw on the low 32 bits saw a negative vs positive value).
-+//
-+//   2. Always zero-extending the load broke `byte == Imm32(-1)` because -1
-+//      sign-extends in the imm path: the loaded 0x000000FF didn't match the
-+//      materialized 0xFFFFFFFF.
-+//
-+// Fix: cast the immediate to uint8/uint16 (equality + unsigned) or int8/int16
-+// (signed relational) so both sides have matching bit patterns regardless of
-+// how move32(Imm32) chose to materialize it. Match ARM64/LoongArch64/RISC-V.
-+//
-+// We exercise both byte and halfword branch paths via TypedArray loads and
-+// String.prototype.startsWith with a constant search string (the original
-+// failing site lowered to branch16(NotEqual, addr, Imm32(0xC1C0))).
-+
-+// --- Direct byte/halfword equality through TypedArray ---
-+{
-+  let u8 = new Uint8Array([0, 1, 0x7F, 0x80, 0xC0, 0xC1, 0xFE, 0xFF]);
-+  let i8 = new Int8Array(u8.buffer);
-+  let u16 = new Uint16Array([0x0000, 0x7FFF, 0x8000, 0xC1C0, 0xFFFE, 0xFFFF]);
-+  let i16 = new Int16Array(u16.buffer);
-+
-+  // Force baseline + Ion to specialize the comparisons.
-+  function eqU8(arr, idx, val) {
-+    return arr[idx] === val;
-+  }
-+  function eqI8(arr, idx, val) {
-+    return arr[idx] === val;
-+  }
-+  function eqU16(arr, idx, val) {
-+    return arr[idx] === val;
-+  }
-+  function eqI16(arr, idx, val) {
-+    return arr[idx] === val;
-+  }
-+
-+  for (let i = 0; i < 200; i++) {
-+    // High-bit-set bytes: bit pattern equality must hold both signed and
-+    // unsigned interpretations of the immediate.
-+    assertEq(eqU8(u8, 4, 0xC0), true);   // unsigned compare 0xC0 == 0xC0
-+    assertEq(eqU8(u8, 4, 0xC1), false);
-+    assertEq(eqU8(u8, 7, 0xFF), true);
-+    assertEq(eqU8(u8, 7, -1 & 0xFF), true);   // 0xFF written as -1&0xFF
-+
-+    // Signed Int8 view: 0xFF is -1, 0xC0 is -64.
-+    assertEq(eqI8(i8, 4, -64), true);
-+    assertEq(eqI8(i8, 7, -1), true);
-+    assertEq(eqI8(i8, 4, -63), false);
-+
-+    // Halfword variants: the original startswith failure pattern was
-+    // (Latin-1 char 0xC1C0) — a 16-bit value with bit 15 set.
-+    assertEq(eqU16(u16, 3, 0xC1C0), true);
-+    assertEq(eqU16(u16, 3, 0xC1C1), false);
-+    assertEq(eqU16(u16, 5, 0xFFFF), true);
-+    assertEq(eqU16(u16, 5, -1 & 0xFFFF), true);
-+
-+    assertEq(eqI16(i16, 3, -15936), true);  // 0xC1C0 as i16 = -15936
-+    assertEq(eqI16(i16, 5, -1), true);
-+    assertEq(eqI16(i16, 5, -2), false);
-+  }
-+}
-+
-+// --- String.prototype.startsWith with a Latin-1 constant search ---
-+// This was the original failing site — Ion lowers a constant search string
-+// of length 1..32 into a sequence of byte-wise comparisons.
-+{
-+  let s = "ÀÁÂ";  // Latin-1 length 3, bytes 0xC0 0xC1 0xC2 (all high-bit set)
-+  function check() {
-+    return s.startsWith("ÀÁÂ");
-+  }
-+  for (let i = 0; i < 200; i++) {
-+    assertEq(check(), true);
-+  }
-+
-+  // Mismatch on a single high-bit byte must report not-equal.
-+  let s2 = "ÀÁÃ";  // last byte 0xC3 instead of 0xC2
-+  function check2() {
-+    return s2.startsWith("ÀÁÂ");
-+  }
-+  for (let i = 0; i < 200; i++) {
-+    assertEq(check2(), false);
-+  }
-+}
-+
-+// --- Signed relational comparisons still work (we kept the sign-extend path) ---
-+{
-+  let i8 = new Int8Array([0x7F, -1, -128, 1, 0]);
-+  function ltZero(idx) {
-+    return i8[idx] < 0;
-+  }
-+  for (let i = 0; i < 200; i++) {
-+    assertEq(ltZero(0), false);  // 0x7F = +127
-+    assertEq(ltZero(1), true);   // -1
-+    assertEq(ltZero(2), true);   // -128
-+    assertEq(ltZero(3), false);  // 1
-+  }
-+}
-diff --git a/js/src/jit-test/tests/gc/gcparam.js b/js/src/jit-test/tests/gc/gcparam.js
-index 51d58662193f..48e5a97c135f 100644
---- a/js/src/jit-test/tests/gc/gcparam.js
-+++ b/js/src/jit-test/tests/gc/gcparam.js
-@@ -30,7 +30,8 @@ testGetParam("chunkBytes");
- testGetParam("helperThreadCount");
- 
- testChangeParam("maxBytes");
--testChangeParam("minNurseryBytes", 16 * 1024);
-+var pageSize = gcparam("systemPageSizeKB") * 1024;
-+testChangeParam("minNurseryBytes", pageSize);
- testChangeParam("maxNurseryBytes", 1024 * 1024);
- testChangeParam("incrementalGCEnabled");
- testChangeParam("perZoneGCEnabled");
-diff --git a/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js b/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
-new file mode 100644
-index 000000000000..9028f5587c65
---- /dev/null
-+++ b/js/src/jit-test/tests/ion/mod-constant-pow2-minus-one.js
-@@ -0,0 +1,78 @@
-+// Regression test for a PPC64 Ion miscompile of integer modulo by a
-+// constant of the form 2^n - 1 (e.g. 65535).
-+//
-+// lowerModI routes `x % (2^n - 1)` to LModMaskI, whose codegen
-+// (ma_mod_mask) materialized the mask 2^n - 1 with xs_li(). xs_li takes a
-+// signed int16_t, so a mask of 0xFFFF was truncated to -1, corrupting the
-+// digit-summing reduction. The bug only affected masks that do not fit in a
-+// signed 16-bit immediate, i.e. divisors >= 65535 (n >= 16); smaller
-+// 2^n - 1 divisors such as 255 were unaffected.
-+//
-+// The reference uses a non-constant divisor, which lowers to the
-+// hardware-divide modulo path (LModI) and is therefore independent of the
-+// LModMaskI codegen under test.
-+
-+function refmod(x, d) {
-+  // d is not a constant here -> divide-based modulo, not LModMaskI.
-+  return (x % d) | 0;
-+}
-+
-+// One function per constant divisor so the divisor is a literal and the
-+// LModMaskI path is selected.
-+function mod255(x) { return (x % 255) | 0; }
-+function mod32767(x) { return (x % 32767) | 0; }
-+function mod65535(x) { return (x % 65535) | 0; }
-+function mod131071(x) { return (x % 131071) | 0; }
-+function mod1048575(x) { return (x % 1048575) | 0; }
-+
-+const cases = [
-+  [mod255, 255],
-+  [mod32767, 32767],
-+  [mod65535, 65535],
-+  [mod131071, 131071],
-+  [mod1048575, 1048575],
-+];
-+
-+// Inputs spanning small values, values with bits above the mask width
-+// (so the multi-digit reduction is exercised), and negatives.
-+const inputs = [];
-+for (let i = 0; i < 64; i++) {
-+  inputs.push(Math.imul(i, 2654435761) | 0);
-+  inputs.push((i * 65535 + i) | 0);
-+  inputs.push((i * 131071 - 7) | 0);
-+  inputs.push(-Math.imul(i, 40503) | 0);
-+}
-+inputs.push(0, 1, -1, 65534, 65535, 65536, 0x7fffffff, -0x80000000);
-+
-+// Warm up through the tiers, then assert each constant-divisor result
-+// matches the divide-based reference.
-+for (let iter = 0; iter < 2000; iter++) {
-+  for (const [fn, d] of cases) {
-+    for (const x of inputs) {
-+      assertEq(fn(x), refmod(x, d));
-+    }
-+  }
-+}
-+
-+// Register-pressure variant: mirrors the shape that exposed the bug (many
-+// live locals forcing the mask materialization to interact with spills).
-+function pressure(buf, i) {
-+  let v0 = i, v1 = i + 1, v2 = i + 2, v3 = i + 3, v4 = i + 4, v5 = i + 5;
-+  let v6 = i + 6, v7 = i + 7, v8 = i + 8, v9 = i + 9, v10 = i + 10, v11 = i + 11;
-+  let v12 = i + 12, v13 = i + 13, v14 = i + 14, v15 = i + 15;
-+  const r = (buf[i & 63] % 65535) | 0;
-+  // Keep every local live to the return without altering r.
-+  const live = (v0 ^ v1 ^ v2 ^ v3 ^ v4 ^ v5 ^ v6 ^ v7 ^
-+                v8 ^ v9 ^ v10 ^ v11 ^ v12 ^ v13 ^ v14 ^ v15) & 0;
-+  return r + live;
-+}
-+
-+const buf = new Int32Array(64);
-+for (let i = 0; i < buf.length; i++) {
-+  buf[i] = Math.imul(i, 2654435761) | 0;
-+}
-+for (let iter = 0; iter < 5000; iter++) {
-+  for (let i = 0; i < 64; i++) {
-+    assertEq(pressure(buf, i), refmod(buf[i & 63], 65535));
-+  }
-+}
-diff --git a/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js b/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
-new file mode 100644
-index 000000000000..9905cc4a8f36
---- /dev/null
-+++ b/js/src/jit-test/tests/ion/mod-pow2-negative-dividend.js
-@@ -0,0 +1,71 @@
-+// Regression test for a PPC64 Ion miscompile of integer modulo by a constant
-+// power of two (e.g. 65536) with a negative dividend.
-+//
-+// lowerModI routes `x % 2^n` to LModPowTwoI, whose codegen tested the sign of
-+// the dividend with branchPtr (a 64-bit compare). When the int32 dividend was
-+// held zero-extended in its register, the 64-bit test misclassified a negative
-+// value as non-negative and took the unmasked positive path, returning
-+// `x & (2^n - 1)` instead of the correct (negative) `x % 2^n`. Fixed by using a
-+// 32-bit sign test (branch32).
-+//
-+// The reference uses a non-constant divisor, which lowers to the divide-based
-+// modulo path (LModI), independent of LModPowTwoI.
-+
-+function refmod(x, d) {
-+  return (x % d) | 0;
-+}
-+
-+function mod256(x) { return (x % 256) | 0; }
-+function mod1024(x) { return (x % 1024) | 0; }
-+function mod4096(x) { return (x % 4096) | 0; }
-+function mod65536(x) { return (x % 65536) | 0; }
-+function mod1048576(x) { return (x % 1048576) | 0; }
-+function mod1073741824(x) { return (x % 1073741824) | 0; }
-+
-+const cases = [
-+  [mod256, 256],
-+  [mod1024, 1024],
-+  [mod4096, 4096],
-+  [mod65536, 65536],
-+  [mod1048576, 1048576],
-+  [mod1073741824, 1073741824],
-+];
-+
-+// Heavy on negative dividends (the broken path), plus boundary values.
-+const inputs = [];
-+for (let i = 1; i <= 64; i++) {
-+  inputs.push(-Math.imul(i, 2654435761) | 0);
-+  inputs.push(-(i * 168));
-+  inputs.push(-(i * 70001));
-+  inputs.push(Math.imul(i, 40503) | 0);
-+}
-+inputs.push(0, -1, 1, -168, -65535, -65536, -65537, 168,
-+            0x7fffffff, -0x80000000, -0x7fffffff);
-+
-+for (let iter = 0; iter < 3000; iter++) {
-+  for (const [fn, d] of cases) {
-+    for (const x of inputs) {
-+      assertEq(fn(x), refmod(x, d));
-+    }
-+  }
-+}
-+
-+// Register-pressure variant: a negative dividend produced at runtime
-+// (float->int) with many live locals, mirroring the shape that exposed the bug.
-+function pressure(seed) {
-+  let v0 = seed, v1 = seed + 1, v2 = seed + 2, v3 = seed + 3, v4 = seed + 4;
-+  let v5 = seed + 5, v6 = seed + 6, v7 = seed + 7, v8 = seed + 8, v9 = seed + 9;
-+  let v10 = seed + 10, v11 = seed + 11, v12 = seed + 12, v13 = seed + 13;
-+  let d0 = seed * 0.5, d1 = seed * 1.5, d2 = -seed * 2.5;
-+  const neg = (Math.fround(-(Math.abs(seed) + 0.7)) | 0);
-+  const r = (neg % 65536) | 0;
-+  const live = (v0 ^ v1 ^ v2 ^ v3 ^ v4 ^ v5 ^ v6 ^ v7 ^ v8 ^ v9 ^
-+                v10 ^ v11 ^ v12 ^ v13 ^ (d0 | 0) ^ (d1 | 0) ^ (d2 | 0)) & 0;
-+  return r + live;
-+}
-+for (let iter = 0; iter < 5000; iter++) {
-+  for (let s = 1; s <= 200; s++) {
-+    const expect = ((Math.fround(-(s + 0.7)) | 0) % 65536) | 0;
-+    assertEq(pressure(s), expect);
-+  }
-+}
-diff --git a/js/src/jit-test/tests/math-min-max-corner-cases.js b/js/src/jit-test/tests/math-min-max-corner-cases.js
-new file mode 100644
-index 000000000000..7ac2c59caeff
---- /dev/null
-+++ b/js/src/jit-test/tests/math-min-max-corner-cases.js
-@@ -0,0 +1,50 @@
-+// Math.min / Math.max corner cases. Exercises the POWER9 xsminjdp /
-+// xsmaxjdp J-form fast path on PPC64 (and the fcmpu/branch fallback on
-+// POWER8 forced); other backends already cover this via shared fp tests
-+// but the truth table is small and worth pinning explicitly.
-+//
-+// JS semantics (ECMA-262):
-+//   - Math.max(-0, +0) === +0; Math.min(-0, +0) === -0
-+//   - Math.max(-0, -0) === -0; Math.min(+0, +0) === +0
-+//   - Any NaN operand → NaN
-+//   - ±Inf and ordinary numerics by value
-+
-+function objectIsPositiveZero(v) {
-+  return v === 0 && Object.is(v, 0);
-+}
-+function objectIsNegativeZero(v) {
-+  return v === 0 && Object.is(v, -0);
-+}
-+
-+// Direct calls — these get inlined by Ion as MMinMax intrinsics, which
-+// emit the relevant min/max helper.
-+function check() {
-+  // Max corner cases.
-+  assertEq(objectIsPositiveZero(Math.max(-0, +0)), true);
-+  assertEq(objectIsPositiveZero(Math.max(+0, -0)), true);
-+  assertEq(objectIsNegativeZero(Math.max(-0, -0)), true);
-+  assertEq(objectIsPositiveZero(Math.max(+0, +0)), true);
-+  assertEq(Number.isNaN(Math.max(NaN, 5)), true);
-+  assertEq(Number.isNaN(Math.max(5, NaN)), true);
-+  assertEq(Number.isNaN(Math.max(NaN, NaN)), true);
-+  assertEq(Math.max(-Infinity, 5), 5);
-+  assertEq(Math.max(Infinity, 5), Infinity);
-+  assertEq(Math.max(1, 2), 2);
-+  assertEq(Math.max(-1, -2), -1);
-+  assertEq(Math.max(1.5, 2.5), 2.5);
-+
-+  // Min corner cases.
-+  assertEq(objectIsNegativeZero(Math.min(-0, +0)), true);
-+  assertEq(objectIsNegativeZero(Math.min(+0, -0)), true);
-+  assertEq(objectIsNegativeZero(Math.min(-0, -0)), true);
-+  assertEq(objectIsPositiveZero(Math.min(+0, +0)), true);
-+  assertEq(Number.isNaN(Math.min(NaN, 5)), true);
-+  assertEq(Number.isNaN(Math.min(5, NaN)), true);
-+  assertEq(Math.min(-Infinity, 5), -Infinity);
-+  assertEq(Math.min(Infinity, 5), 5);
-+  assertEq(Math.min(1, 2), 1);
-+}
-+
-+// Run cold (Baseline) and hot (Ion).
-+check();
-+for (let i = 0; i < 50000; i++) check();
-diff --git a/js/src/jit-test/tests/wasm/atomicity.js b/js/src/jit-test/tests/wasm/atomicity.js
-index 34327ec95741..ac1516083325 100644
---- a/js/src/jit-test/tests/wasm/atomicity.js
-+++ b/js/src/jit-test/tests/wasm/atomicity.js
-@@ -8,7 +8,11 @@
- const DEBUG = 0;
- 
- // The longer we run, the better, really, but we don't want to time out.
--const ITERATIONS = 100000;
-+// Real PPC64 hardware retries lwarx/stwcx. reservation loops under
-+// contention, which makes the default count exceed jit-test's 150 s
-+// budget on POWER8 and (less so) POWER9/POWER10. Quarter the count
-+// there to keep coverage while fitting the default budget.
-+const ITERATIONS = getBuildConfiguration("ppc64") ? 25000 : 100000;
- 
- // If you change NUMWORKERS you must also change the tables for INIT, VAL, and
- // RESULT for all the operations, below, by adding or removing bits.
-@@ -39,7 +43,7 @@ if (getCoreCount() < NUMAGENTS) {
- 
- if (getBuildConfiguration("arm-simulator") || getBuildConfiguration("arm64-simulator") ||
-     getBuildConfiguration("mips64-simulator") || getBuildConfiguration("riscv64-simulator") ||
--    getBuildConfiguration("loong64-simulator"))
-+    getBuildConfiguration("loong64-simulator") || getBuildConfiguration("ppc64-simulator"))
- {
-     if (DEBUG > 0)
-         print("Atomicity test disabled on simulator");
-diff --git a/js/src/jit-test/tests/wasm/excessive-inlining.js b/js/src/jit-test/tests/wasm/excessive-inlining.js
-index 91ec710e4e46..a7d3b3211515 100644
---- a/js/src/jit-test/tests/wasm/excessive-inlining.js
-+++ b/js/src/jit-test/tests/wasm/excessive-inlining.js
-@@ -74,23 +74,26 @@ assertEq(tier2codeBytesUsed > 2000, true);
- 
- // But not an excessive amount.  This is the assertion that checks that
- // the inlining-budget cutoff mechanism is working.
--assertEq(tier2codeBytesUsed < 15000, true);
-+// PPC64 generates larger code due to fixed-width 4-byte instructions,
-+// multi-instruction branch stanzas, and longer constant-loading sequences.
-+let tier2limit = getBuildConfiguration("ppc64") ? 25000 : 15000;
-+assertEq(tier2codeBytesUsed < tier2limit, true);
- 
- // The thresholds above are based on the following measurements.
- //
- // tier1codeBytesUsed (baseline size)
- //
--//     x64      x32    arm64    arm32
-+//     x64      x32    arm64    arm32    ppc64
- //
--//    1378     1010     1408     1008    --enable-debug build
--//    1218      866     1248      856    --disable-debug build
-+//    1378     1010     1408     1008     2736    --enable-debug build
-+//    1218      866     1248      856            --disable-debug build
- //
- // tier2codeBytesUsed (optimized size), with inline-size budgeting enabled
- //
--//     x64      x32    arm64    arm32
-+//     x64      x32    arm64    arm32    ppc64
- //
--//    5186     6994     7136     5472    --enable-debug build
--//    3698     3730     5472     3888    --disable-debug build
-+//    5186     6994     7136     5472    17408    --enable-debug build
-+//    3698     3730     5472     3888            --disable-debug build
- //
- // tier2codeBytesUsed (optimized size), with inline-size budgeting disabled
- //
-@@ -108,7 +111,7 @@ assertEq(tier2codeBytesUsed < 15000, true);
- // (2) the optimized size will be at least 2000 bytes
- //
- // (3) if the inline-budget mechanism is working as intended, the optimized
--//     size will be less than 15000 bytes
-+//     size will be less than 15000 bytes (25000 on PPC64)
- //
- //
- // Note (for future testing): inline-size budgeting was disabled by changing
-diff --git a/js/src/jit-test/tests/wasm/memory-oob-message.js b/js/src/jit-test/tests/wasm/memory-oob-message.js
-index 75248c6e6a56..c08e49bcc6e4 100644
---- a/js/src/jit-test/tests/wasm/memory-oob-message.js
-+++ b/js/src/jit-test/tests/wasm/memory-oob-message.js
-@@ -8,8 +8,16 @@ const hasOffsetMessage = wasmHugeMemoryEnabled();
- 
- function oobPattern(memIdx, byteOffset) {
-     if (hasOffsetMessage) {
-+        // The reported address is whatever the kernel returned in
-+        // siginfo.si_addr for the faulting instruction. Most backends emit
-+        // the wasm access directly so si_addr equals byteOffset. PPC64 emits
-+        // a 1-byte probing load at byteOffset + (size - 1) before each
-+        // multi-byte access (to enforce wasm-spec atomicity on POWER ISA),
-+        // so si_addr there can be up to 15 bytes past byteOffset.
-+        const offsets = [];
-+        for (let i = 0; i < 16; ++i) offsets.push(`${byteOffset + i}`);
-         return new RegExp(
--            `out of bounds: memory ${memIdx} access at memory address ${byteOffset}`
-+            `out of bounds: memory ${memIdx} access at memory address (?:${offsets.join('|')})`
-         );
-     }
-     return /index out of bounds/;
-diff --git a/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js b/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
-new file mode 100644
-index 000000000000..04dad9240539
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/ppc64-argon2-tiering.js
-@@ -0,0 +1,124 @@
-+// Test for wasm tiering correctness with argon2-style SIMD computation.
-+// The argon2 fBlaMka function uses i64x2.extmul_low_i32x4_u, i64x2.shl,
-+// i64x2.add, v128.xor, v128.or, i64x2.shr_u, and i8x16.shuffle.
-+// A tiering bug can cause hash and verify to produce different results
-+// when tier-up happens between them.
-+//
-+// This test runs the computation under both baseline and optimizing
-+// compilers and verifies they produce identical results.
-+
-+var mod = new WebAssembly.Module(wasmTextToBinary(`
-+  (module
-+    (memory (export "mem") 10)
-+    ;; Argon2 fBlaMka: a + b + 2 * trunc32(a) * trunc32(b)
-+    ;; then rotations by 32, 24, 16, 63
-+    (func $G_round (param i32)
-+      (local v128 v128 v128 v128 v128 v128 v128 v128 v128)
-+      (local.set 1 (v128.load (i32.add (local.get 0) (i32.const 0))))
-+      (local.set 2 (v128.load (i32.add (local.get 0) (i32.const 16))))
-+      (local.set 3 (v128.load (i32.add (local.get 0) (i32.const 32))))
-+      (local.set 4 (v128.load (i32.add (local.get 0) (i32.const 48))))
-+      (local.set 5 (v128.load (i32.add (local.get 0) (i32.const 64))))
-+      (local.set 6 (v128.load (i32.add (local.get 0) (i32.const 80))))
-+      (local.set 7 (v128.load (i32.add (local.get 0) (i32.const 96))))
-+      (local.set 8 (v128.load (i32.add (local.get 0) (i32.const 112))))
-+
-+      ;; fBlaMka(v0, v2) + rotr32
-+      (local.set 1 (i64x2.add (i64x2.add (local.get 1) (local.get 3))
-+        (i64x2.shl (i64x2.extmul_low_i32x4_u
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 1) (local.get 1))
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 3) (local.get 3)))
-+          (i32.const 1))))
-+      (local.set 9 (v128.xor (local.get 7) (local.get 1)))
-+      (local.set 7 (v128.or (i64x2.shl (local.get 9) (i32.const 32)) (i64x2.shr_u (local.get 9) (i32.const 32))))
-+
-+      ;; fBlaMka(v4, v6) + rotr24
-+      (local.set 5 (i64x2.add (i64x2.add (local.get 5) (local.get 7))
-+        (i64x2.shl (i64x2.extmul_low_i32x4_u
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 5) (local.get 5))
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 7) (local.get 7)))
-+          (i32.const 1))))
-+      (local.set 9 (v128.xor (local.get 3) (local.get 5)))
-+      (local.set 3 (v128.or (i64x2.shl (local.get 9) (i32.const 40)) (i64x2.shr_u (local.get 9) (i32.const 24))))
-+
-+      ;; fBlaMka(v0, v2) + rotr16
-+      (local.set 1 (i64x2.add (i64x2.add (local.get 1) (local.get 3))
-+        (i64x2.shl (i64x2.extmul_low_i32x4_u
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 1) (local.get 1))
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 3) (local.get 3)))
-+          (i32.const 1))))
-+      (local.set 9 (v128.xor (local.get 7) (local.get 1)))
-+      (local.set 7 (v128.or (i64x2.shl (local.get 9) (i32.const 48)) (i64x2.shr_u (local.get 9) (i32.const 16))))
-+
-+      ;; fBlaMka(v4, v6) + rotr63
-+      (local.set 5 (i64x2.add (i64x2.add (local.get 5) (local.get 7))
-+        (i64x2.shl (i64x2.extmul_low_i32x4_u
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 5) (local.get 5))
-+          (i8x16.shuffle 0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 (local.get 7) (local.get 7)))
-+          (i32.const 1))))
-+      (local.set 9 (v128.xor (local.get 3) (local.get 5)))
-+      (local.set 3 (v128.or (i64x2.shl (local.get 9) (i32.const 1)) (i64x2.shr_u (local.get 9) (i32.const 63))))
-+
-+      (v128.store (i32.add (local.get 0) (i32.const 0)) (local.get 1))
-+      (v128.store (i32.add (local.get 0) (i32.const 16)) (local.get 2))
-+      (v128.store (i32.add (local.get 0) (i32.const 32)) (local.get 3))
-+      (v128.store (i32.add (local.get 0) (i32.const 48)) (local.get 4))
-+      (v128.store (i32.add (local.get 0) (i32.const 64)) (local.get 5))
-+      (v128.store (i32.add (local.get 0) (i32.const 80)) (local.get 6))
-+      (v128.store (i32.add (local.get 0) (i32.const 96)) (local.get 7))
-+      (v128.store (i32.add (local.get 0) (i32.const 112)) (local.get 8)))
-+
-+    (func (export "hash") (param i32) (result i64)
-+      (local i32)
-+      ;; Init with Blake2b IV
-+      (v128.store (i32.const 0) (v128.const i64x2 0x6a09e667f3bcc908 0xbb67ae8584caa73b))
-+      (v128.store (i32.const 16) (v128.const i64x2 0x3c6ef372fe94f82b 0xa54ff53a5f1d36f1))
-+      (v128.store (i32.const 32) (v128.const i64x2 0x510e527fade682d1 0x9b05688c2b3e6c1f))
-+      (v128.store (i32.const 48) (v128.const i64x2 0x1f83d9abfb41bd6b 0x5be0cd19137e2179))
-+      (v128.store (i32.const 64) (v128.const i64x2 0x0123456789abcdef 0xfedcba9876543210))
-+      (v128.store (i32.const 80) (v128.const i64x2 0xdeadbeefcafebabe 0x1122334455667788))
-+      (v128.store (i32.const 96) (v128.const i64x2 0xaabbccdd11223344 0x5566778899aabbcc))
-+      (v128.store (i32.const 112) (v128.const i64x2 0xddeeff0011223344 0x5566778899aabbcc))
-+      (local.set 1 (i32.const 0))
-+      (block (loop
-+        (call $G_round (i32.const 0))
-+        (local.set 1 (i32.add (local.get 1) (i32.const 1)))
-+        (br_if 1 (i32.ge_u (local.get 1) (local.get 0)))
-+        (br 0)))
-+      (i64.xor (i64.load (i32.const 0))
-+        (i64.xor (i64.load (i32.const 8))
-+          (i64.xor (i64.load (i32.const 16))
-+            (i64.xor (i64.load (i32.const 24))
-+              (i64.xor (i64.load (i32.const 32))
-+                (i64.xor (i64.load (i32.const 40))
-+                  (i64.xor (i64.load (i32.const 48))
-+                    (i64.xor (i64.load (i32.const 56))
-+                      (i64.xor (i64.load (i32.const 64))
-+                        (i64.xor (i64.load (i32.const 72))
-+                          (i64.xor (i64.load (i32.const 80))
-+                            (i64.xor (i64.load (i32.const 88))
-+                              (i64.xor (i64.load (i32.const 96))
-+                                (i64.xor (i64.load (i32.const 104))
-+                                  (i64.xor (i64.load (i32.const 112))
-+                                    (i64.load (i32.const 120))))))))))))))))))
-+  )
-+`));
-+
-+var inst = new WebAssembly.Instance(mod);
-+
-+// Get a reference result from the first call.
-+var reference = inst.exports.hash(100);
-+
-+// Run many times to trigger tier-up, then verify result stays the same.
-+var pass = true;
-+for (var i = 0; i < 1000; i++) {
-+    var r = inst.exports.hash(100);
-+    if (r !== reference) {
-+        pass = false;
-+        throw new Error("Tiering mismatch at iteration " + i +
-+            ": got 0x" + BigInt.asUintN(64, r).toString(16) +
-+            ", expected 0x" + BigInt.asUintN(64, reference).toString(16));
-+    }
-+}
-+
-+assertEq(pass, true);
-diff --git a/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js b/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
-new file mode 100644
-index 000000000000..c11ce713f514
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
-@@ -0,0 +1,70 @@
-+// |jit-test| skip-if: true
-+//
-+// Benchmark only, not a correctness test. Invoke manually as shown below.
-+//
-+// Microbenchmark for wasm compare+select fusion on PPC64.
-+//
-+// Run with:
-+//   $JS --wasm-compiler=optimizing \
-+//       js/src/jit-test/tests/wasm/ppc64-compare-select-bench.js
-+//
-+// Prints timings for four variants (i32, i64, f32, f64) that exercise a
-+// tight loop of N select-on-compare operations. Used to decide whether
-+// specializing lowerWasmCompareAndSelect beyond Int32 is worth the code.
-+//
-+// The kernel is a 10-stage select chain so the per-op overhead dominates
-+// the loop frame. Each iteration touches 10 compare+select ops plus
-+// ~trivial address math.
-+
-+const N_ITERS = 1_000_000;
-+
-+function buildModule(kind) {
-+  const types = {i32: ['i32', 'i32', 'i32.lt_s'],
-+                 u32: ['i32', 'i32', 'i32.lt_u'],
-+                 i64: ['i64', 'i64', 'i64.lt_s'],
-+                 f32: ['f32', 'i32', 'f32.lt'],
-+                 f64: ['f64', 'i32', 'f64.lt']}[kind];
-+  const [ty, iterTy, cmpOp] = types;
-+  // Load a, b; compute chain of (b < a ? b : a) 10 times per iter.
-+  const stage = `
-+    (local.set $a
-+      (select (result ${ty})
-+        (local.get $b) (local.get $a)
-+        (${cmpOp} (local.get $b) (local.get $a))))`;
-+  const body = Array(10).fill(stage).join('\n');
-+  const text = `
-+    (module
-+      (func (export "run") (param $n i32) (result ${ty})
-+        (local $i i32) (local $a ${ty}) (local $b ${ty})
-+        (local.set $a (${ty}.const ${kind === 'f32' || kind === 'f64' ? '3.14' : '12345'}))
-+        (local.set $b (${ty}.const ${kind === 'f32' || kind === 'f64' ? '2.71' : '67890'}))
-+        (loop $L
-+          ${body}
-+          (local.set $i (i32.add (local.get $i) (i32.const 1)))
-+          (br_if $L (i32.lt_s (local.get $i) (local.get $n))))
-+        (local.get $a)))`;
-+  return new WebAssembly.Module(wasmTextToBinary(text));
-+}
-+
-+function bench(kind) {
-+  const inst = new WebAssembly.Instance(buildModule(kind));
-+  // Warmup — ensure Ion compiles.
-+  for (let i = 0; i < 3; i++) inst.exports.run(N_ITERS);
-+  const t0 = dateNow();
-+  const res = inst.exports.run(N_ITERS);
-+  const t1 = dateNow();
-+  return {ms: t1 - t0, result: res};
-+}
-+
-+const kinds = ['i32', 'u32', 'i64', 'f32', 'f64'];
-+const runs = 5;
-+print(`\nwasm compare+select microbench (${N_ITERS.toLocaleString()} iters, 10 ops/iter):`);
-+print(`  Each timing is the best of ${runs} runs.\n`);
-+for (const kind of kinds) {
-+  const samples = [];
-+  for (let i = 0; i < runs; i++) samples.push(bench(kind).ms);
-+  samples.sort((a, b) => a - b);
-+  const best = samples[0];
-+  const median = samples[(runs / 2) | 0];
-+  print(`  ${kind.padEnd(4)} best=${best.toFixed(1)}ms  median=${median.toFixed(1)}ms  (samples: ${samples.map(s => s.toFixed(0)).join(',')})`);
-+}
-diff --git a/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js b/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
-new file mode 100644
-index 000000000000..2aa9507751b6
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/ppc64-extmul-alias.js
-@@ -0,0 +1,107 @@
-+// Regression test for PPC64 i64x2.extmul_{low,high}_i32x4_{s,u} when the
-+// Ion register allocator picks dest == rhs.
-+//
-+// On PPC64 LE, the old implementation extracted lanes via mtvsrd/mfvsrd and
-+// wrote the low-lane product to dest before reading rhs for the high lane.
-+// `mtvsrd XT, RA` leaves DW1 of XT undefined (POWER9 zeros it), so when
-+// dest aliased rhs the high-lane extract from rhs read garbage, producing
-+// zero in the high i64 lane. On POWER8 the ExtractLaneToGPR fallback
-+// additionally clobbered ScratchSimd128Reg between the two extracts.
-+//
-+// The loop below, discovered via wasm-reduce from argon2.wasm, reliably
-+// reproduced the miscompile: the result's high i64 lane went to 0 on
-+// POWER9 Ion / garbage on POWER8 Ion, while baseline kept the correct
-+// value (lane1 = 48*48 = 2304 in the final iteration).
-+
-+var mod = new WebAssembly.Module(wasmTextToBinary(`
-+  (module
-+    (memory (export "mem") 1)
-+    (func (export "run_u") (param $out i32)
-+      (local $i i32) (local $v4 v128) (local $v5 v128) (local $v9 v128)
-+      (loop
-+        (local.set $v9
-+          (i64x2.add
-+            (v128.const i32x4 1 0 0 0)
-+            (i64x2.extmul_low_i32x4_u (local.get $v5) (local.get $v9))))
-+        (local.set $v4 (local.get $v9))
-+        (local.set $v5 (local.get $v4))
-+        (v128.store (i32.const 0) (local.get $v5))
-+        (local.set $i (i32.add (local.get $i) (i32.const 1)))
-+        (br_if 0 (i32.ne (local.get $i) (i32.const 8))))
-+      (v128.store (local.get $out) (local.get $v9)))
-+
-+    (func (export "run_s") (param $out i32)
-+      (local $i i32) (local $v v128)
-+      (local.set $v (v128.const i32x4 2 3 5 7))
-+      (loop
-+        ;; Force dest==rhs aliasing: v = extmul_low_i32x4_s(const, v).
-+        (local.set $v
-+          (i64x2.extmul_low_i32x4_s
-+            (v128.const i32x4 2 3 5 7)
-+            (local.get $v)))
-+        (local.set $i (i32.add (local.get $i) (i32.const 1)))
-+        (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
-+      (v128.store (local.get $out) (local.get $v)))
-+
-+    (func (export "run_high_u") (param $out i32)
-+      (local $i i32) (local $v v128)
-+      (local.set $v (v128.const i32x4 0 0 2 3))
-+      (loop
-+        (local.set $v
-+          (i64x2.extmul_high_i32x4_u
-+            (v128.const i32x4 0 0 2 3)
-+            (local.get $v)))
-+        (local.set $i (i32.add (local.get $i) (i32.const 1)))
-+        (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
-+      (v128.store (local.get $out) (local.get $v)))
-+
-+    (func (export "run_high_s") (param $out i32)
-+      (local $i i32) (local $v v128)
-+      (local.set $v (v128.const i32x4 0 0 2 3))
-+      (loop
-+        (local.set $v
-+          (i64x2.extmul_high_i32x4_s
-+            (v128.const i32x4 0 0 2 3)
-+            (local.get $v)))
-+        (local.set $i (i32.add (local.get $i) (i32.const 1)))
-+        (br_if 0 (i32.ne (local.get $i) (i32.const 2))))
-+      (v128.store (local.get $out) (local.get $v))))
-+`));
-+
-+function runAndCheck(inst) {
-+  inst.exports.run_u(0);
-+  // After 8 iterations, the value in memory should have lane1 == 2304 = 0x900.
-+  // Bytes 8-15 (i64 lane 1, little-endian) = 0x0000000000000900.
-+  var buf = new Uint8Array(inst.exports.mem.buffer, 0, 16);
-+  var hex = Array.from(buf).map(b => b.toString(16).padStart(2,'0')).join('');
-+  // Expect bytes 8-9 = "00 09" and bytes 10-15 = "00 00 00 00 00 00".
-+  assertEq(hex.slice(16, 32), "0009000000000000");
-+
-+  inst.exports.run_s(16);
-+  // After 2 iterations of v = extmul_low_s(const(2,3,5,7), v) starting v=(2,3,5,7):
-+  //   iter 1: i64x2 lane0 = 2*2 = 4, lane1 = 3*3 = 9.
-+  //           v becomes i32x4 [4, 0, 9, 0] (each i64 lane occupies two i32 lanes).
-+  //   iter 2: extmul_low_s reads i32 lanes 0, 1 of v = (4, 0).
-+  //           i64 lane0 = 2*4 = 8; i64 lane1 = 3*0 = 0.
-+  var buf2 = new Uint8Array(inst.exports.mem.buffer, 16, 16);
-+  var hex2 = Array.from(buf2).map(b => b.toString(16).padStart(2,'0')).join('');
-+  assertEq(hex2, "08000000000000000000000000000000");
-+
-+  inst.exports.run_high_u(32);
-+  // v = (0, 0, 2, 3). extmul_high picks lanes 2 and 3.
-+  //   iter 1: lane2_prod = 2*2 = 4; lane3_prod = 3*3 = 9. Result stored at bytes 0-7 (lane2_prod) and 8-15 (lane3_prod).
-+  //   iter 2: v now has i64x2 lane0 = 4, lane1 = 9, i.e. i32x4 lanes [4, 0, 9, 0].
-+  //           extmul_high_u(const(0,0,2,3), v) reads lanes 2, 3 of both:
-+  //           const lane2 = 2, lane3 = 3; v lane2 = 9, lane3 = 0.
-+  //           result: lane2_prod = 2*9 = 18 at bytes 0-7; lane3_prod = 3*0 = 0 at bytes 8-15.
-+  var buf3 = new Uint8Array(inst.exports.mem.buffer, 32, 16);
-+  var hex3 = Array.from(buf3).map(b => b.toString(16).padStart(2,'0')).join('');
-+  assertEq(hex3, "12000000000000000000000000000000");
-+
-+  inst.exports.run_high_s(48);
-+  var buf4 = new Uint8Array(inst.exports.mem.buffer, 48, 16);
-+  var hex4 = Array.from(buf4).map(b => b.toString(16).padStart(2,'0')).join('');
-+  assertEq(hex4, "12000000000000000000000000000000");
-+}
-+
-+runAndCheck(new WebAssembly.Instance(mod));
-diff --git a/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js b/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
-new file mode 100644
-index 000000000000..d5f79a1840a6
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/ppc64-simd-vr-clobber.js
-@@ -0,0 +1,179 @@
-+// |jit-test| skip-if: !wasmSimdEnabled()
-+//
-+// Regression tests for PPC64 SIMD helpers that use VR1..VR5 as undeclared
-+// scratch and silently corrupt live wasm v128 values the register allocator
-+// has placed in those VRs.
-+//
-+// Background: PPC64 Simd128 lives in VR0..VR31. VR0 is non-allocatable
-+// (= ScratchSimd128Reg); VR1..VR31 are allocatable. The helpers below
-+// historically used VR1..VR5 as undeclared scratch:
-+//
-+//   negInt8x16, negInt16x8                    : clobber VR1 (all CPUs)
-+//   negInt32x4, negInt64x2 (POWER8 fallback)  : clobber VR1 (POWER8 only)
-+//   extAddPairwiseInt8x16  (signed/unsigned)  : clobber VR1, VR2, VR3
-+//   extAddPairwiseInt16x8  (signed/unsigned)  : clobber VR1, VR2, VR3
-+//   unsignedWidenHighInt32x4                  : clobber VR1
-+//
-+// Each test:
-+//   - loads `nLive` "preserve" v128 values from memory at offsets 16..16+16*nLive
-+//   - loads ONE additional "input" v128 = repeat(0x18) at offset 144
-+//   - applies the suspect helper to the input
-+//   - stores the nLive preserved values back to memory at offsets 0..16*nLive
-+//   - stores the helper result at offset 16*nLive
-+//
-+// Without the fix, one of the preserved locals (whichever the allocator
-+// placed in the clobbered VR) reads back as the staged input value (0x18)
-+// instead of its original. With the fix (the helper using ScratchSimd128Scope
-+// or proper VR-namespace emit), all preserved locals retain their values.
-+
-+const PRESERVE_PATTERNS = [0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x29];
-+const INPUT_BYTE = 0x18;
-+
-+function init(mem) {
-+  // Slots at offset 16, 32, ..., 16+16*7 hold the preserve patterns.
-+  for (let slot = 0; slot < PRESERVE_PATTERNS.length; slot++) {
-+    for (let i = 0; i < 16; i++) {
-+      mem[16 + slot * 16 + i] = PRESERVE_PATTERNS[slot];
-+    }
-+  }
-+  // The helper input lives at offset 144 (= 16 + 16*8), immediately past the
-+  // eight preserve slots written above (16..143). With nLive at most 7, the
-+  // slots read back are 16..127, so slot 7 (128..143) acts as a 16-byte gap.
-+  const INPUT_OFFSET = 144;
-+  for (let i = 0; i < 16; i++) mem[INPUT_OFFSET + i] = INPUT_BYTE;
-+}
-+
-+function repeat(byte) {
-+  const a = new Array(16);
-+  for (let i = 0; i < 16; i++) a[i] = byte;
-+  return a;
-+}
-+
-+// Verify nLive preserved slots match PRESERVE_PATTERNS at output offsets
-+// 0..16*nLive, and that the result slot at 16*nLive matches `expectedResult`.
-+function check(opName, mem, nLive, expectedResult) {
-+  for (let slot = 0; slot < nLive; slot++) {
-+    for (let i = 0; i < 16; i++) {
-+      const got = mem[slot * 16 + i];
-+      const want = PRESERVE_PATTERNS[slot];
-+      assertEq(got, want,
-+               `${opName}: live slot ${slot} byte ${i}: got 0x${got.toString(16)}, expected 0x${want.toString(16)} (allocator-clobbered VR?)`);
-+    }
-+  }
-+  for (let i = 0; i < 16; i++) {
-+    const got = mem[nLive * 16 + i];
-+    const want = expectedResult[i];
-+    assertEq(got, want,
-+             `${opName}: result byte ${i}: got 0x${got.toString(16)}, expected 0x${want.toString(16)}`);
-+  }
-+}
-+
-+// Build a wasm module that:
-+//  - loads `nLive` preserve v128 locals from memory at offsets 16..16*nLive
-+//  - loads ONE input v128 from offset 144
-+//  - applies `op` to the input
-+//  - stores all `nLive + 1` v128 values back to memory at offsets 0..16*nLive
-+function buildModule(op, nLive) {
-+  const localDecls = [];
-+  const initLoads = [];
-+  const finalStores = [];
-+  for (let i = 0; i < nLive; i++) {
-+    localDecls.push(`(local $v${i} v128)`);
-+    initLoads.push(`(local.set $v${i} (v128.load (i32.const ${16 + i * 16})))`);
-+    finalStores.push(`(v128.store (i32.const ${i * 16}) (local.get $v${i}))`);
-+  }
-+  // The helper input + result.
-+  localDecls.push(`(local $input v128)`);
-+  initLoads.push(`(local.set $input (v128.load (i32.const 144)))`);
-+  finalStores.push(`(v128.store (i32.const ${nLive * 16}) (local.get $input))`);
-+
-+  const text = `
-+    (module
-+      (memory (export "mem") 1)
-+      (func (export "run")
-+        ${localDecls.join('\n        ')}
-+        ${initLoads.join('\n        ')}
-+        (local.set $input (${op} (local.get $input)))
-+        ${finalStores.join('\n        ')}
-+      )
-+    )`;
-+  return new WebAssembly.Module(wasmTextToBinary(text));
-+}
-+
-+function runOne(opName, op, nLive, expectedResult) {
-+  const mod = buildModule(op, nLive);
-+  const inst = new WebAssembly.Instance(mod);
-+  const mem = new Uint8Array(inst.exports.mem.buffer);
-+  // Run many times so Baseline + Ion both see it.
-+  for (let warm = 0; warm < 50; warm++) {
-+    init(mem);
-+    inst.exports.run();
-+    check(opName, mem, nLive, expectedResult);
-+  }
-+}
-+
-+// ---- Negate helpers ----
-+//
-+// Input lane = 0x18 = 24. neg(24) = -24.
-+// i8x16.neg : -24 mod 256 = 232 = 0xE8 per byte.
-+// i16x8.neg : lane = 0x1818 = 6168, neg = -6168 mod 65536 = 0xE7E8.
-+//             Memory LE: per i16 lane bytes 0xE8 0xE7.
-+// i32x4.neg : lane = 0x18181818 = 404232216, neg = 0xE7E7E7E8.
-+//             Memory LE: per i32 lane bytes 0xE8 0xE7 0xE7 0xE7.
-+// i64x2.neg : lane = 0x1818181818181818, neg = 0xE7E7E7E7E7E7E7E8.
-+//             Memory LE: per i64 lane bytes 0xE8 0xE7 0xE7 0xE7 0xE7 0xE7 0xE7 0xE7.
-+
-+runOne("i8x16.neg", "i8x16.neg", 4, repeat(0xE8));
-+runOne("i16x8.neg", "i16x8.neg", 4,
-+       [0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7,
-+        0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7, 0xE8,0xE7]);
-+runOne("i32x4.neg", "i32x4.neg", 4,
-+       [0xE8,0xE7,0xE7,0xE7, 0xE8,0xE7,0xE7,0xE7,
-+        0xE8,0xE7,0xE7,0xE7, 0xE8,0xE7,0xE7,0xE7]);
-+runOne("i64x2.neg", "i64x2.neg", 4,
-+       [0xE8,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,
-+        0xE8,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7,0xE7]);
-+
-+// ---- extAddPairwise helpers ----
-+//
-+// extadd_pairwise reads adjacent pairs and widens-then-sums them.
-+// Input = repeat(0x18) = 24.
-+// i16x8.extadd_pairwise_i8x16_s : 24 + 24 = 48 = 0x0030 per i16 lane.
-+//                                  Memory LE: 0x30 0x00 per lane × 8 lanes.
-+// i16x8.extadd_pairwise_i8x16_u : same since input is positive.
-+// i32x4.extadd_pairwise_i16x8_s : i16 lane = 0x1818 = 6168, sum = 12336 = 0x00003030.
-+//                                  Memory LE: 0x30 0x30 0x00 0x00 per lane × 4 lanes.
-+// i32x4.extadd_pairwise_i16x8_u : same since input is positive.
-+
-+runOne("i16x8.extadd_pairwise_i8x16_s",
-+       "i16x8.extadd_pairwise_i8x16_s", 4,
-+       [0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00,
-+        0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00]);
-+
-+runOne("i16x8.extadd_pairwise_i8x16_u",
-+       "i16x8.extadd_pairwise_i8x16_u", 4,
-+       [0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00,
-+        0x30,0x00, 0x30,0x00, 0x30,0x00, 0x30,0x00]);
-+
-+runOne("i32x4.extadd_pairwise_i16x8_s",
-+       "i32x4.extadd_pairwise_i16x8_s", 4,
-+       [0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00,
-+        0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00]);
-+
-+runOne("i32x4.extadd_pairwise_i16x8_u",
-+       "i32x4.extadd_pairwise_i16x8_u", 4,
-+       [0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00,
-+        0x30,0x30,0x00,0x00, 0x30,0x30,0x00,0x00]);
-+
-+// ---- unsignedWidenHighInt32x4 ----
-+//
-+// i64x2.extend_high_i32x4_u: take the high two i32 lanes (lanes 2 and 3) of
-+// the input, zero-extend each to i64, lay them out as i64x2.
-+// Input lane = 0x18181818 (positive, =404232216).
-+// Result: two i64 lanes, each = 0x0000000018181818.
-+// Memory LE: per i64 lane bytes 0x18 0x18 0x18 0x18 0x00 0x00 0x00 0x00.
-+
-+runOne("i64x2.extend_high_i32x4_u",
-+       "i64x2.extend_high_i32x4_u", 4,
-+       [0x18,0x18,0x18,0x18,0x00,0x00,0x00,0x00,
-+        0x18,0x18,0x18,0x18,0x00,0x00,0x00,0x00]);
-diff --git a/js/src/jit-test/tests/wasm/profiling.js b/js/src/jit-test/tests/wasm/profiling.js
-index f4872b07cde8..ccd9690a262f 100644
---- a/js/src/jit-test/tests/wasm/profiling.js
-+++ b/js/src/jit-test/tests/wasm/profiling.js
-@@ -117,6 +117,13 @@ for (let type of ['f32', 'f64']) {
-         if (getBuildConfiguration("arm64")) {
-             continue;
-         }
-+        // PPC64 inlines ceil/floor/trunc as frip/frim/friz (see
-+        // Assembler-ppc64.h HasRoundInstruction), so no builtin thunk
-+        // frames exist to profile. `nearest` still goes through the
-+        // thunk because PPC64's frin is not IEEE round-to-even.
-+        if (getBuildConfiguration("ppc64") && func !== 'nearest') {
-+            continue;
-+        }
-         test(`(module
-             (func (export "") (param ${type}) (result ${type})
-                 local.get 0
-diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js b/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
-new file mode 100644
-index 000000000000..e2cf5def541e
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/regress-ppc64-extract-lane-ctz.js
-@@ -0,0 +1,49 @@
-+// |jit-test| --wasm-compiler=optimizing; skip-if: !wasmSimdEnabled()
-+//
-+// Regression test for a PPC64 i32x4.extract_lane canonicalization bug.
-+//
-+// ExtractLaneToGPR leaves the adjacent lane in the high 32 bits of the GPR for
-+// the unshifted lanes (0 and 2), so extractLaneInt32x4 must sign-extend its i32
-+// result (as the i8x16/i16x8 extracts do). Without that, a consumer that reads
-+// the full 64-bit register sees garbage in the high half. The POWER8 i32.ctz
-+// emulation is such a consumer: its 64-bit neg/and. zero-check disagrees with
-+// its 32-bit cntlzw, so ctz of a zero lane sitting next to a nonzero neighbour
-+// returned -1 instead of 32.
-+//
-+// The vector comes from memory (runtime, not constant-foldable) and is passed
-+// through a SIMD op so the extract is a genuine vector-register extract. Run
-+// under MOZ_PPC64_FORCE_POWER8=1 to exercise the emulated ctz path; in every
-+// other mode this is simply a correctness check.
-+
-+const ins = wasmEvalText(`(module
-+  (memory (export "mem") 1)
-+  (func $v (result v128)
-+    ;; identity AND keeps the value in a vector register and forces a real
-+    ;; extractLaneInt32x4 rather than an extract-of-load fold.
-+    (v128.and (v128.load (i32.const 0)) (v128.const i32x4 -1 -1 -1 -1)))
-+  (func (export "ctz0") (result i32) (i32.ctz (i32x4.extract_lane 0 (call $v))))
-+  (func (export "ctz1") (result i32) (i32.ctz (i32x4.extract_lane 1 (call $v))))
-+  (func (export "ctz2") (result i32) (i32.ctz (i32x4.extract_lane 2 (call $v))))
-+  (func (export "ctz3") (result i32) (i32.ctz (i32x4.extract_lane 3 (call $v))))
-+  (func (export "sext0") (result i64) (i64.extend_i32_s (i32x4.extract_lane 0 (call $v))))
-+  (func (export "sext2") (result i64) (i64.extend_i32_s (i32x4.extract_lane 2 (call $v))))
-+)`).exports;
-+
-+const mem = new Int32Array(ins.mem.buffer);
-+function setLanes(a, b, c, d) { mem[0] = a; mem[1] = b; mem[2] = c; mem[3] = d; }
-+
-+// Each lane = 0 surrounded by nonzero neighbours: ctz must be 32, never -1.
-+setLanes(0, -1, -1, -1); assertEq(ins.ctz0(), 32);
-+setLanes(-1, 0, -1, -1); assertEq(ins.ctz1(), 32);
-+setLanes(-1, -1, 0, -1); assertEq(ins.ctz2(), 32);
-+setLanes(-1, -1, -1, 0); assertEq(ins.ctz3(), 32);
-+
-+// Nonzero lanes: ctz of the lane value, regardless of neighbours.
-+setLanes(0x10, -1, 0x100000, -1);
-+assertEq(ins.ctz0(), 4);
-+assertEq(ins.ctz2(), 20);
-+
-+// A negative lane must sign-extend correctly (the canonicalization is extsw).
-+setLanes(-2, 7, -3, 7);
-+assertEq(ins.sext0(), -2n);
-+assertEq(ins.sext2(), -3n);
-diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js b/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
-new file mode 100644
-index 000000000000..c38975dce859
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/regress-ppc64-select-condition.js
-@@ -0,0 +1,30 @@
-+// |jit-test| --wasm-compiler=optimizing; skip-if: !wasmSimdEnabled()
-+//
-+// Regression test for a PPC64 wasm Ion miscompile of `select` with a 32-bit
-+// condition. visitWasmSelect tested the i32 condition with a 64-bit compare
-+// (cmpdi / branchTestPtr). When the condition was zero in its low 32 bits but
-+// had garbage in the high 32 bits (as can happen under register pressure), the
-+// 64-bit test read it as non-zero and select returned the wrong operand.
-+//
-+// Here the condition `$x3` is 0; `select($x8, -952809828, $x3)` must therefore
-+// return -952809828. The surrounding SIMD shuffle/bitselect/swizzle chain
-+// supplies the v128 register pressure that exposed the bug.
-+
-+const wat = `(module (func (export "f") (result i64)
-+  (local $x3 i32)(local $x7 i32)(local $x8 i32)
-+  (local $w0 v128)(local $w1 v128)(local $w2 v128)(local $w3 v128)
-+  (local $w4 v128)(local $w5 v128)(local $w6 v128)(local $w7 v128)
-+  (local.set $w0 (v128.const i32x4 1708443454 1532218695 2107423610 -1265775005))
-+  (local.set $w2 (v128.const i32x4 -752312355 -625530572 -844666500 832036408))
-+  (local.set $w7 (v128.const i32x4 115003496 -970441117 -43225935 1874128204))
-+  (local.set $w4 (i8x16.shuffle 15 18 13 2 6 22 20 8 19 10 12 8 11 5 6 28 (local.get $w7) (local.get $w3)))
-+  (local.set $w6 (v128.bitselect (local.get $w4) (local.get $w0) (local.get $w7)))
-+  (local.set $w1 (v128.const i32x4 -1635025264 -629784132 1517869852 1651771825))
-+  (local.set $w7 (v128.bitselect (local.get $w6) (local.get $w2) (local.get $w2)))
-+  (local.set $w6 (i8x16.swizzle (local.get $w1) (local.get $w7)))
-+  (local.set $x3 (i32x4.extract_lane 2 (local.get $w6)))
-+  (local.set $x7 (select (local.get $x8) (i32.const -952809828) (local.get $x3)))
-+  (i64.extend_i32_s (local.get $x7))))`;
-+
-+const ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(wat)));
-+assertEq(ins.exports.f(), -952809828n);
-diff --git a/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js b/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
-new file mode 100644
-index 000000000000..4887f8df119c
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/regress-ppc64-trap-exit-simd-save.js
-@@ -0,0 +1,64 @@
-+// |jit-test| exitstatus: 0; skip-if: !wasmSimdEnabled()
-+//
-+// Regression test for the PPC64 wasm trap exit losing live v128 state.
-+//
-+// On PPC64, doubles live in the FPRs (VSR0-31) while wasm v128 values live in
-+// the VRs (VSR32-63) -- disjoint physical pools. The trap exit's
-+// RegsToPreserve used AllDoubleMask only, so a trap firing while a v128 was
-+// live resumed with whatever the C++ interrupt path's libc left in the VRs
-+// (glibc's misaligned vector memcpy leaves lvsl alignment-control byte
-+// patterns there). Interrupt checks fire via traps at loop back-edges, where
-+// a loop-carried v128 accumulator is exactly what is live.
-+//
-+// The loop below keeps an i32x4 accumulator live across every back-edge while
-+// interrupts fire repeatedly; the callback does large misaligned copies to
-+// pull libc's vector memcpy through the VRs. On an unfixed build (real
-+// silicon; the simulator's VRs are insulated from native libc) the
-+// accumulator comes back holding garbage and the final lane values are wrong.
-+
-+const ins = wasmEvalText(`(module
-+  (func (export "run") (param $n i32) (result i32)
-+    (local $acc v128)
-+    (block $done
-+      (loop $top
-+        (br_if $done (i32.eqz (local.get $n)))
-+        (local.set $acc (i32x4.add (local.get $acc) (v128.const i32x4 1 2 3 4)))
-+        (local.set $n (i32.sub (local.get $n) (i32.const 1)))
-+        (br $top)))
-+    ;; Fold the four lanes so any lane corruption shows up.
-+    (i32.xor
-+      (i32.xor (i32x4.extract_lane 0 (local.get $acc))
-+               (i32.rotl (i32x4.extract_lane 1 (local.get $acc)) (i32.const 8)))
-+      (i32.xor (i32.rotl (i32x4.extract_lane 2 (local.get $acc)) (i32.const 16))
-+               (i32.rotl (i32x4.extract_lane 3 (local.get $acc)) (i32.const 24)))))
-+)`).exports;
-+
-+// Misaligned big copies drive glibc's lvsl/vperm memcpy path on PPC.
-+const big = new Uint8Array(1 << 20);
-+const src = big.subarray(1, (1 << 19) + 1);
-+const dst = new Uint8Array(1 << 19);
-+
-+let fires = 0;
-+function onInterrupt() {
-+  fires++;
-+  for (let i = 0; i < 4; i++) {
-+    dst.set(src);
-+  }
-+  if (fires < 25) {
-+    timeout(0.02, onInterrupt);
-+  }
-+  return true;
-+}
-+
-+function expected(n) {
-+  const r = (x, k) => ((x << k) | (x >>> (32 - k))) | 0;
-+  const l = [n | 0, (2 * n) | 0, (3 * n) | 0, (4 * n) | 0];
-+  return ((l[0] ^ r(l[1], 8)) ^ (r(l[2], 16) ^ r(l[3], 24))) | 0;
-+}
-+
-+const N = 1 << 26;
-+timeout(0.02, onInterrupt);
-+const got = ins.run(N);
-+// Cancel any pending watchdog before finishing.
-+timeout(-1);
-+assertEq(got, expected(N));
-diff --git a/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js b/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
-new file mode 100644
-index 000000000000..b7ec0d9548bb
---- /dev/null
-+++ b/js/src/jit-test/tests/wasm/regress/bug-ppc64-simd-reduce-and-branch.js
-@@ -0,0 +1,7 @@
-+// Regression test for a PPC64-specific wasm Ion crash in
-+// CodeGenerator::visitWasmReduceAndBranchSimd128 — it called
-+// LBlock::label() directly on the branch targets without going through
-+// skipTrivialBlocks(), so a trivial goto-only successor tripped
-+// LBlock::label()'s !isTrivial() assertion. Reduced from grantkot.com/poly
-+// with wasm-reduce. Triggers the bug under --wasm-compiler=optimizing.
-+new WebAssembly.Module(os.file.readFile(scriptdir + "/bug-ppc64-simd-reduce-and-branch.wasm", "binary"));
-diff --git a/js/src/jit-test/tests/wasm/simd/bug1946618.js b/js/src/jit-test/tests/wasm/simd/bug1946618.js
-index cc02d0d8dfd7..fcf3a2a35e82 100644
---- a/js/src/jit-test/tests/wasm/simd/bug1946618.js
-+++ b/js/src/jit-test/tests/wasm/simd/bug1946618.js
-@@ -48,7 +48,12 @@ for (let op of ["f32x4.relaxed_min", "f32x4.relaxed_max",
-     // baseline.
-     let result1 = i.exports.variant1();
-     let result2 = i.exports.variant2();
--    if (getBuildConfiguration("arm64")) {
-+    if (getBuildConfiguration("ppc64")) {
-+      // PPC64: xvminsp/xvmaxsp always returns the non-NaN operand,
-+      // regardless of operand order. Both variants give zero (non-NaN).
-+      assertEq(result1, 0);
-+      assertEq(result2, 0);
-+    } else if (getBuildConfiguration("arm64")) {
-       // The relaxed_min/max operation appears to propagate NaNs symmetrically
-       // from either arg
-       assertEq(result1, 65535);
-diff --git a/js/src/jit-test/tests/wasm/simd/ion-analysis.js b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
-index d12af6e6fbc9..335f831ff6a9 100644
---- a/js/src/jit-test/tests/wasm/simd/ion-analysis.js
-+++ b/js/src/jit-test/tests/wasm/simd/ion-analysis.js
-@@ -12,6 +12,7 @@
- // generates the expected result.
- 
- var isArm64 = getBuildConfiguration("arm64");
-+var isPPC64 = getBuildConfiguration("ppc64");
- 
- // 32-bit permutation that is not a rotation.
- let perm32x4_pattern = [4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3];
-@@ -846,7 +847,7 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
-     let ops = { all_true: allTrue, any_true: anyTrue, bitmask };
- 
-     for ( let op of ['any_true', 'all_true', 'bitmask'] ) {
--        let folded = op != 'bitmask' || (size == 2 && !isArm64);
-+        let folded = op != 'bitmask' || (size == 2 && !isArm64 && !isPPC64);
-         let operation = op == 'any_true' ? 'v128.any_true' : `${ty128}.${op}`;
-         let positive =
-             wasmCompile(
-@@ -898,12 +899,12 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
- 
- // Bitselect with constant mask folded into shuffle operation
- 
--if (!isArm64) {
-+if (!isArm64 && !isPPC64) {
-   wasmCompile(`
-   (module (func (param v128) (param v128) (result v128)
-     (v128.bitselect (local.get 0) (local.get 1) (v128.const i8x16 0 -1 -1 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 0))))
-   `);
--      assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");  
-+      assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
- }
- 
- // Library
-diff --git a/js/src/jit/Assembler.h b/js/src/jit/Assembler.h
-index 97c2e337625b..cb7244776605 100644
---- a/js/src/jit/Assembler.h
-+++ b/js/src/jit/Assembler.h
-@@ -19,6 +19,8 @@
- #  include "jit/loong64/Assembler-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/Assembler-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/Assembler-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/Assembler-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/BaselineIC.cpp b/js/src/jit/BaselineIC.cpp
-index c356538a024e..5ab631838f0e 100644
---- a/js/src/jit/BaselineIC.cpp
-+++ b/js/src/jit/BaselineIC.cpp
-@@ -120,6 +120,8 @@ AllocatableGeneralRegisterSet BaselineICAvailableGeneralRegs(size_t numInputs) {
-   MOZ_ASSERT(!regs.has(PseudoStackPointer));
-   MOZ_ASSERT(!regs.has(RealStackPointer));
-   MOZ_ASSERT(!regs.has(ICTailCallReg));
-+#elif defined(JS_CODEGEN_PPC64)
-+  regs.take(ICTailCallReg);
- #endif
-   regs.take(ICStubReg);
- 
-diff --git a/js/src/jit/CacheIRCompiler.cpp b/js/src/jit/CacheIRCompiler.cpp
-index 4eb952e497e3..ee4888495103 100644
---- a/js/src/jit/CacheIRCompiler.cpp
-+++ b/js/src/jit/CacheIRCompiler.cpp
-@@ -10302,6 +10302,14 @@ bool CacheIRCompiler::emitConcatStringsResult(StringOperandId lhsId,
-     liveRegs.add(ICTailCallReg);
- #endif
-     liveRegs.takeUnchecked(output.valueReg());
-+
-+#ifdef JS_CODEGEN_PPC64
-+    // On PPC64, LR is an SPR, not a GPR, so ICTailCallReg is a regular
-+    // GPR that does not shadow LR. The inner bctrl will clobber LR, so
-+    // save/restore it explicitly.
-+    masm.xs_mflr(r0);
-+    masm.push(r0);
-+#endif
-     masm.PushRegsInMask(liveRegs);
- 
-     // The stub expects lhs in CallTempReg0 and rhs in CallTempReg1.
-@@ -10322,11 +10330,19 @@ bool CacheIRCompiler::emitConcatStringsResult(StringOperandId lhsId,
-     masm.branchTestPtr(Assembler::Zero, CallTempReg5, CallTempReg5, &vmCall);
-     masm.tagValue(JSVAL_TYPE_STRING, CallTempReg5, output.valueReg());
-     masm.PopRegsInMask(liveRegs);
-+#ifdef JS_CODEGEN_PPC64
-+    masm.pop(r0);
-+    masm.xs_mtlr(r0);
-+#endif
-     masm.jump(&done);
- 
-     masm.bind(&vmCall);
-     masm.setFramePushed(framePushed);
-     masm.PopRegsInMask(liveRegs);
-+#ifdef JS_CODEGEN_PPC64
-+    masm.pop(r0);
-+    masm.xs_mtlr(r0);
-+#endif
-   }
- 
-   {
-diff --git a/js/src/jit/CodeGenerator.cpp b/js/src/jit/CodeGenerator.cpp
-index a1c01409e9f7..2a2c6007aec0 100644
---- a/js/src/jit/CodeGenerator.cpp
-+++ b/js/src/jit/CodeGenerator.cpp
-@@ -2519,6 +2519,12 @@ static bool PrepareAndExecuteRegExp(MacroAssembler& masm, Register regexp,
-   masm.computeEffectiveAddress(Address(FramePointer, ioOffset), temp2);
-   masm.PushRegsInMask(volatileRegs);
-   masm.setupUnalignedABICall(temp3);
-+#if defined(JS_CODEGEN_PPC64)
-+  // temp1 aliases argregs on this platform, so we need to reuse temp3
-+  // or we'll stomp on the code pointer when we pass the first ABI argument.
-+  masm.movePtr(codePointer, temp3);
-+  codePointer = temp3;
-+#endif
-   masm.passABIArg(temp2);
-   masm.callWithABI(codePointer);
-   masm.storeCallInt32Result(temp1);
-diff --git a/js/src/jit/CodeGenerator.h b/js/src/jit/CodeGenerator.h
-index 58c047dea41b..3781b9595dfd 100644
---- a/js/src/jit/CodeGenerator.h
-+++ b/js/src/jit/CodeGenerator.h
-@@ -23,6 +23,8 @@
- #  include "jit/loong64/CodeGenerator-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/CodeGenerator-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/CodeGenerator-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/CodeGenerator-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/EffectiveAddressAnalysis.cpp b/js/src/jit/EffectiveAddressAnalysis.cpp
-index e1bd1bd045ef..88697c06907c 100644
---- a/js/src/jit/EffectiveAddressAnalysis.cpp
-+++ b/js/src/jit/EffectiveAddressAnalysis.cpp
-@@ -60,7 +60,7 @@ static bool OffsetIsSmallEnough(int32_t imm) {
-   // `movn #imm`.  arm32 is similar.
-   return imm >= -0xFFFF && imm <= 0xFFFF;
- #elif defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_MIPS64)
-+    defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_PPC64)
-   return imm >= -0xFFF && imm <= 0xFFF;
- #elif defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_NONE)
-   return true;
-diff --git a/js/src/jit/ExecutableAllocator.cpp b/js/src/jit/ExecutableAllocator.cpp
-index 340a63964b52..c9336fe8ec4e 100644
---- a/js/src/jit/ExecutableAllocator.cpp
-+++ b/js/src/jit/ExecutableAllocator.cpp
-@@ -306,13 +306,19 @@ void ExecutableAllocator::poisonCode(JSRuntime* rt,
-     }
-   }
- 
--  // Make the pools executable again and drop references. We don't flush the
--  // ICache here to not add extra overhead.
-+  // Make the pools executable again and drop references. On architectures with
-+  // incoherent ICache (PPC64), we must flush to prevent stale instruction
-+  // execution when code regions are reused after sweeping.
-   for (size_t i = 0; i < ranges.length(); i++) {
-     ExecutablePool* pool = ranges[i].pool;
-     if (pool->isMarked()) {
-+#ifdef JS_CODEGEN_PPC64
-+      reprotectPool(rt, pool, ProtectionSetting::Executable,
-+                    MustFlushICache::Yes);
-+#else
-       reprotectPool(rt, pool, ProtectionSetting::Executable,
-                     MustFlushICache::No);
-+#endif
-       pool->unmark();
-     }
-     pool->release();
-diff --git a/js/src/jit/FlushICache.cpp b/js/src/jit/FlushICache.cpp
-index d3b1657a6be2..9590687c9803 100644
---- a/js/src/jit/FlushICache.cpp
-+++ b/js/src/jit/FlushICache.cpp
-@@ -13,7 +13,8 @@
- #  include "jit/arm64/vixl/Simulator-vixl.h"
- #endif
- 
--#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
-+#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
-+    defined(JS_CODEGEN_PPC64)
- 
- #  ifdef __linux__
- #    include <linux/version.h>
-diff --git a/js/src/jit/FlushICache.h b/js/src/jit/FlushICache.h
-index af79da356ee5..58396f62ae0d 100644
---- a/js/src/jit/FlushICache.h
-+++ b/js/src/jit/FlushICache.h
-@@ -21,7 +21,7 @@ inline void FlushICache(void* code, size_t size) {
- }
- #elif (defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)) ||  \
-     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
- 
- // Invalidate the given code range from the icache. This will also flush the
- // execution context for this core. If this code is to be executed on another
-@@ -37,7 +37,7 @@ inline void FlushICache(void* code, size_t size) { MOZ_CRASH(); }
- #  error "Unknown architecture!"
- #endif
- 
--#if (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)) ||      \
-+#if (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)) ||    \
-     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-     defined(JS_CODEGEN_RISCV64)
- 
-@@ -55,10 +55,11 @@ inline void FlushExecutionContext() { MOZ_CRASH(); }
- inline bool CanFlushExecutionContextForAllThreads() { MOZ_CRASH(); }
- inline void FlushExecutionContextForAllThreads() { MOZ_CRASH(); }
- 
--#elif defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
-+#elif defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) || \
-+    defined(JS_CODEGEN_PPC64)
- 
--// ARM and ARM64 must flush the instruction pipeline of the current core
--// before executing newly JIT'ed code. This will remove any stale data from
-+// ARM, ARM64, and PPC64 must flush the instruction pipeline of the current
-+// core before executing newly JIT'ed code. This will remove any stale data from
- // the pipeline that may have referenced invalidated instructions.
- //
- // `FlushICache` will perform this for the thread that compiles the code, but
-diff --git a/js/src/jit/GenerateABIFunctionType.py b/js/src/jit/GenerateABIFunctionType.py
-index 04be10d1de2a..815427ec6771 100644
---- a/js/src/jit/GenerateABIFunctionType.py
-+++ b/js/src/jit/GenerateABIFunctionType.py
-@@ -538,6 +538,102 @@ def riscv64_simulator_dispatch(func_types):
-     return contents
- 
- 
-+# PPC64 ELFv2 ABI: 8 int arg regs (r3-r10), 13 FP arg regs (f1-f13).
-+# Each floating-point argument consumes BOTH a float-arg slot AND a
-+# general-purpose-register shadow slot (capped at 8 GPR slots), matching
-+# what GCC and the JIT's ABIArgGenerator do for ELFv2 PPC64LE. Without
-+# the shadow, integer args following a float go to the wrong register
-+# at the call boundary, producing a use-after-free / wrong-pointer crash
-+# in the C callee. (Verified empirically by disassembling
-+# NumberBigIntCompare(double, BigInt*) on real PPC64: BigInt* is read
-+# from r4, not r3.)
-+def ppc64_args(func_type):
-+    contents = ""
-+    numIntArgRegs = 8
-+    numFloatArgRegs = 13
-+    intRegIndex = 0
-+    floatRegIndex = 0
-+    stackOffset = 0
-+    for i, arg in enumerate(func_type["args"]):
-+        if i != 0:
-+            contents += ", "
-+
-+        if arg == "General":
-+            if intRegIndex == numIntArgRegs:
-+                contents += f"sp_[{stackOffset}]"
-+                stackOffset += 1
-+            else:
-+                contents += f"a{intRegIndex}_"
-+                intRegIndex += 1
-+        elif arg == "Int32":
-+            if intRegIndex == numIntArgRegs:
-+                contents += f"I32(sp_[{stackOffset}])"
-+                stackOffset += 1
-+            else:
-+                contents += f"I32(a{intRegIndex}_)"
-+                intRegIndex += 1
-+        elif arg == "Int64":
-+            if intRegIndex == numIntArgRegs:
-+                contents += f"sp_[{stackOffset}]"
-+                stackOffset += 1
-+            else:
-+                contents += f"a{intRegIndex}_"
-+                intRegIndex += 1
-+        elif arg == "Float32":
-+            if floatRegIndex == numFloatArgRegs:
-+                contents += f"*mozilla::BitwiseCast<float*>(sp_[{stackOffset}])"
-+                stackOffset += 1
-+            else:
-+                contents += f"f{floatRegIndex}_s"
-+                floatRegIndex += 1
-+            # ELFv2: FP arg also consumes a GPR shadow slot.
-+            if intRegIndex < numIntArgRegs:
-+                intRegIndex += 1
-+        elif arg == "Float64":
-+            if floatRegIndex == numFloatArgRegs:
-+                contents += f"mozilla::BitwiseCast<double>(sp_[{stackOffset}])"
-+                stackOffset += 1
-+            else:
-+                contents += f"f{floatRegIndex}_d"
-+                floatRegIndex += 1
-+            # ELFv2: FP arg also consumes a GPR shadow slot.
-+            if intRegIndex < numIntArgRegs:
-+                intRegIndex += 1
-+    assert intRegIndex <= numIntArgRegs
-+    assert floatRegIndex <= numFloatArgRegs
-+    return contents
-+
-+
-+def ppc64_simulator_dispatch(func_types):
-+    contents = ""
-+    for func_type in func_types:
-+        args = ppc64_args(func_type)
-+        contents += f"case js::jit::Args_{func_type_name(func_type)}: {{\\\n"
-+        contents += f"  auto target = reinterpret_cast<Prototype_{func_type_name(func_type)}>(nativeFn);\\\n"
-+        ret = func_type["ret"]
-+        if ret == "Void":
-+            contents += f"  target({args});\\\n"
-+        else:
-+            contents += f"  auto ret = target({args});\\\n"
-+        if ret == "Void":
-+            pass
-+        elif ret == "General":
-+            contents += "  setCallResult(ret);\\\n"
-+        elif ret == "Int32":
-+            contents += "  setCallResult(I64(ret));\\\n"
-+        elif ret == "Int64":
-+            contents += "  setCallResult(ret);\\\n"
-+        elif ret == "Float32":
-+            contents += "  setCallResultFloat(ret);\\\n"
-+        elif ret == "Float64":
-+            contents += "  setCallResultDouble(ret);\\\n"
-+        else:
-+            raise ValueError(f"Unknown ret type: {ret}")
-+        contents += "  break;\\\n"
-+        contents += "}\\\n"
-+    return contents
-+
-+
- def main(c_out, yaml_path):
-     func_types = load_yaml(yaml_path)
- 
-@@ -581,4 +677,8 @@ def main(c_out, yaml_path):
-     contents += riscv64_simulator_dispatch(func_types)
-     contents += "\n"
- 
-+    contents += "#define ABI_FUNCTION_TYPE_PPC64_SIM_DISPATCH \\\n"
-+    contents += ppc64_simulator_dispatch(func_types)
-+    contents += "\n"
-+
-     generate_header(c_out, "jit_ABIFunctionTypeGenerated_h", contents)
-diff --git a/js/src/jit/JitContext.cpp b/js/src/jit/JitContext.cpp
-index 79b22d9f249f..d399ddd36fd4 100644
---- a/js/src/jit/JitContext.cpp
-+++ b/js/src/jit/JitContext.cpp
-@@ -121,6 +121,10 @@ bool jit::InitializeJit() {
-   RVFlags::Init();
- #endif
- 
-+#ifdef JS_CODEGEN_PPC64
-+  PPC64Flags::Init();
-+#endif
-+
- #ifndef JS_CODEGEN_NONE
-   MOZ_ASSERT(js::jit::CPUFlagsHaveBeenComputed());
- #endif
-diff --git a/js/src/jit/JitFrames.cpp b/js/src/jit/JitFrames.cpp
-index 3653af3a21f4..bbd1376dec69 100644
---- a/js/src/jit/JitFrames.cpp
-+++ b/js/src/jit/JitFrames.cpp
-@@ -1824,7 +1824,12 @@ Value SnapshotIterator::allocationValue(const RValueAllocation& alloc,
-       return DoubleValue(fromRegister<double>(alloc.fpuReg()));
- 
-     case RValueAllocation::FLOAT32_REG:
-+#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
-+      return Float32Value(
-+          float(fromRegister<double>(alloc.fpuReg().asDouble())));
-+#else
-       return Float32Value(fromRegister<float>(alloc.fpuReg()));
-+#endif
- 
-     case RValueAllocation::FLOAT32_STACK:
-       return Float32Value(ReadFrameFloat32Slot(fp_, alloc.stackOffset()));
-@@ -2625,7 +2630,12 @@ uintptr_t MachineState::read(Register reg) const {
- 
- template <typename T>
- T MachineState::read(FloatRegister reg) const {
-+#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
-+  // PPC64/RISCV64 always store FloatRegisters as 64-bit doubles.
-+  MOZ_ASSERT(reg.size() >= sizeof(T));
-+#else
-   MOZ_ASSERT(reg.size() == sizeof(T));
-+#endif
- 
- #if !defined(JS_CODEGEN_NONE) && !defined(JS_CODEGEN_WASM32)
-   if (state_.is<BailoutState>()) {
-diff --git a/js/src/jit/JitFrames.h b/js/src/jit/JitFrames.h
-index ac7005a5fcfc..490834e62fec 100644
---- a/js/src/jit/JitFrames.h
-+++ b/js/src/jit/JitFrames.h
-@@ -322,6 +322,16 @@ enum class ExceptionResumeKind : int32_t {
- 
- // Data needed to recover from an exception.
- struct ResumeFromException {
-+#if defined(JS_CODEGEN_PPC64)
-+  // This struct is built on the stack as part of exception returns. Because
-+  // it goes right on top of the stack, an ABI-compliant routine can wreck
-+  // it, so we implement a minimum Power ISA linkage area (four doublewords).
-+  void* _ppc_sp_;
-+  void* _ppc_cr_;
-+  void* _ppc_lr_;
-+  void* _ppc_toc_;
-+#endif
-+
-   uint8_t* framePointer;
-   uint8_t* stackPointer;
-   uint8_t* target;
-@@ -373,7 +383,7 @@ struct ResumeFromException {
-   }
- };
- 
--#if defined(JS_CODEGEN_ARM64)
-+#if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
- static_assert(sizeof(ResumeFromException) % 16 == 0,
-               "ResumeFromException should be aligned");
- #endif
-diff --git a/js/src/jit/LIR.cpp b/js/src/jit/LIR.cpp
-index 2f89fb407349..a9f634b7fcc1 100644
---- a/js/src/jit/LIR.cpp
-+++ b/js/src/jit/LIR.cpp
-@@ -779,8 +779,8 @@ bool LMoveGroup::add(LAllocation from, LAllocation to, LDefinition::Type type) {
-     // CodeGeneratorShared::CodeGeneratorShared and in general everywhere
-     // SimdMemoryAignment is used.  Likely, alignment requirements will return.
- #   if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
--       defined(JS_CODEGEN_ARM64)
--      // No need for any check on x86/x64/arm64.
-+       defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
-+      // No need for any check on x86/x64/arm64/ppc64.
- #   else
- #     error "Need to consider SIMD alignment on this target."
-       // The following code may be of use if we need alignment checks on
-diff --git a/js/src/jit/LIR.h b/js/src/jit/LIR.h
-index 3f4efeda7955..3354cb96b0cb 100644
---- a/js/src/jit/LIR.h
-+++ b/js/src/jit/LIR.h
-@@ -200,7 +200,7 @@ class LUse : public LAllocation {
-   static const uint32_t POLICY_BITS = 3;
-   static const uint32_t POLICY_SHIFT = 0;
-   static const uint32_t POLICY_MASK = (1 << POLICY_BITS) - 1;
--#ifdef JS_CODEGEN_ARM64
-+#if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
-   static const uint32_t REG_BITS = 7;
- #else
-   static const uint32_t REG_BITS = 6;
-@@ -619,12 +619,18 @@ class LDefinition {
-   Type type() const { return (Type)((bits_ >> TYPE_SHIFT) & TYPE_MASK); }
- 
-   static bool isFloatRegCompatible(Type type, FloatRegister reg) {
-+#if defined(JS_CODEGEN_PPC64) || defined(JS_CODEGEN_RISCV64)
-+    if (type == FLOAT32 || type == DOUBLE) {
-+      return reg.isSingle() || reg.isDouble();
-+    }
-+#else
-     if (type == FLOAT32) {
-       return reg.isSingle();
-     }
-     if (type == DOUBLE) {
-       return reg.isDouble();
-     }
-+#endif
-     MOZ_ASSERT(type == SIMD128);
-     return reg.isSimd128();
-   }
-@@ -2292,6 +2298,8 @@ AnyRegister LAllocation::toAnyRegister() const {
- #  include "jit/loong64/LIR-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/LIR-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/LIR-ppc64.h"
- #elif defined(JS_CODEGEN_MIPS64)
- #  include "jit/mips-shared/LIR-mips-shared.h"
- #  include "jit/mips64/LIR-mips64.h"
-diff --git a/js/src/jit/LIROps.yaml b/js/src/jit/LIROps.yaml
-index 315ff5fd5348..7fbea9e2ebec 100644
---- a/js/src/jit/LIROps.yaml
-+++ b/js/src/jit/LIROps.yaml
-@@ -2210,7 +2210,7 @@
-     oldval: WordSized
-     newval: WordSized
-   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
--#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 4
- #else
-   num_temps: 1
-@@ -2224,7 +2224,7 @@
-     index: WordSized
-     value: WordSized
-   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
--#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 4
- #else
-   num_temps: 1
-@@ -2238,7 +2238,7 @@
-     index: WordSized
-     value: WordSized
-   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
--#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 4
- #else
-   num_temps: 2
-@@ -2255,7 +2255,7 @@
-   # Needs additional temps on LL/SC platforms to extract/insert bits of word.
- #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
-   num_temps: 1
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 3
- #endif
-   mir_op: AtomicTypedArrayElementBinop
-@@ -3066,7 +3066,7 @@
-   operands:
-     ptr: WordSized
-     memoryBase: WordSized
--#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 1
- #endif
-   mir_op: true
-@@ -3078,7 +3078,7 @@
-     memoryBase: WordSized
- #ifdef JS_CODEGEN_ARM
-   num_temps: 2
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 1
- #endif
-   mir_op: WasmLoad
-@@ -3088,7 +3088,7 @@
-     ptr: WordSized
-     value: WordSized
-     memoryBase: WordSized
--#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 1
- #endif
-   mir_op: true
-@@ -3098,7 +3098,7 @@
-     ptr: WordSized
-     value: Int64
-     memoryBase: WordSized
--#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   num_temps: 1
- #endif
-   mir_op: WasmStore
-@@ -3128,7 +3128,7 @@
-     memoryBase: WordSized
- #ifdef JS_CODEGEN_X86
-   num_temps: 1
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
-   num_temps: 3
- #endif
-@@ -3142,7 +3142,7 @@
-     memoryBase: WordSized
- #ifdef JS_CODEGEN_X86
-   num_temps: 1
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
-   num_temps: 3
- #endif
-@@ -3154,7 +3154,7 @@
-     ptr: WordSized
-     value: WordSized
-     memoryBase: WordSized
--#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
-   num_temps: 3
- #elifdef JS_CODEGEN_X86
-@@ -3171,7 +3171,7 @@
-     ptr: WordSized
-     value: WordSized
-     memoryBase: WordSized
--#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   # Temp that may be used on LL/SC platforms for extract/insert bits of word.
-   num_temps: 3
- #elif defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
-@@ -4424,6 +4424,64 @@
-   mir_op: WasmAtomicExchangeHeap
- #endif
- 
-+#ifdef JS_CODEGEN_PPC64
-+- name: DivOrModI64
-+  gen_boilerplate: false
-+
-+- name: UDivOrMod
-+  gen_boilerplate: false
-+
-+- name: UDivOrModI64
-+  gen_boilerplate: false
-+
-+- name: ModMaskI
-+  result_type: WordSized
-+  operands:
-+    input: WordSized
-+  arguments:
-+    shift: int32_t
-+  num_temps: 2
-+  mir_op: Mod
-+
-+- name: WasmTruncateToInt64
-+  result_type: Int64
-+  operands:
-+    input: WordSized
-+  mir_op: true
-+
-+- name: Int64ToFloatingPoint
-+  result_type: WordSized
-+  operands:
-+    input: Int64
-+  mir_op: true
-+
-+- name: WasmCompareExchangeI64
-+  result_type: Int64
-+  operands:
-+    ptr: WordSized
-+    oldValue: Int64
-+    newValue: Int64
-+    memoryBase: WordSized
-+  mir_op: WasmCompareExchangeHeap
-+
-+- name: WasmAtomicBinopI64
-+  result_type: Int64
-+  operands:
-+    ptr: WordSized
-+    value: Int64
-+    memoryBase: WordSized
-+  num_temps64: 1
-+  mir_op: WasmAtomicBinopHeap
-+
-+- name: WasmAtomicExchangeI64
-+  result_type: Int64
-+  operands:
-+    ptr: WordSized
-+    value: Int64
-+    memoryBase: WordSized
-+  mir_op: WasmAtomicExchangeHeap
-+#endif
-+
- #ifdef JS_CODEGEN_RISCV64
- - name: UDiv
-   result_type: WordSized
-diff --git a/js/src/jit/Label.h b/js/src/jit/Label.h
-index 061bf978d26f..2a49ded9c967 100644
---- a/js/src/jit/Label.h
-+++ b/js/src/jit/Label.h
-@@ -23,7 +23,7 @@ struct LabelBase {
-   uint32_t offset_ : 31;
- 
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-  public:
- #endif
-   static const uint32_t INVALID_OFFSET = 0x7fffffff;  // UINT31_MAX.
-diff --git a/js/src/jit/Lowering.cpp b/js/src/jit/Lowering.cpp
-index 9c1c4b0df491..e3fe71ea9c83 100644
---- a/js/src/jit/Lowering.cpp
-+++ b/js/src/jit/Lowering.cpp
-@@ -1174,7 +1174,7 @@ void LIRGenerator::visitTest(MTest* test) {
- 
- #if defined(ENABLE_WASM_SIMD) &&                           \
-     (defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
--     defined(JS_CODEGEN_ARM64))
-+     defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64))
-   // Check if the operand for this test is an any_true/all_true SIMD operation.
-   // If it is, we want to emit an LWasmReduceAndBranchSimd128 node to avoid
-   // generating an intermediate boolean result.
-diff --git a/js/src/jit/Lowering.h b/js/src/jit/Lowering.h
-index b4f133758eb6..d973a68989b5 100644
---- a/js/src/jit/Lowering.h
-+++ b/js/src/jit/Lowering.h
-@@ -23,6 +23,8 @@
- #  include "jit/loong64/Lowering-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/Lowering-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/Lowering-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/Lowering-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/MacroAssembler-inl.h b/js/src/jit/MacroAssembler-inl.h
-index 4747a22e171b..d7385df895d5 100644
---- a/js/src/jit/MacroAssembler-inl.h
-+++ b/js/src/jit/MacroAssembler-inl.h
-@@ -39,6 +39,8 @@
- #  include "jit/loong64/MacroAssembler-loong64-inl.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/MacroAssembler-riscv64-inl.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/MacroAssembler-ppc64-inl.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/MacroAssembler-wasm32-inl.h"
- #elif !defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/MacroAssembler.cpp b/js/src/jit/MacroAssembler.cpp
-index eb95d6c9e2c4..5b28e811c88d 100644
---- a/js/src/jit/MacroAssembler.cpp
-+++ b/js/src/jit/MacroAssembler.cpp
-@@ -6128,7 +6128,7 @@ static void MoveDataBlock(MacroAssembler& masm, Register base, int32_t from,
-   static constexpr Register scratch = ABINonArgReg0;
-   masm.push(scratch);
- #elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   UseScratchRegisterScope temps(masm);
-   Register scratch = temps.Acquire();
- #elif !defined(JS_CODEGEN_NONE)
-@@ -6315,6 +6315,12 @@ static void CollapseWasmFrameFast(MacroAssembler& masm,
- 
- #ifdef JS_USE_LINK_REGISTER
-   // RA is already in its place, just move stack.
-+#  ifdef JS_CODEGEN_PPC64
-+  // PPC64's LR is not a GPR, so WasmTailCallRAScratchReg is a normal GPR
-+  // (r14). We must explicitly move it to LR so the callee's prologue
-+  // (pushReturnAddress) saves the correct return address.
-+  masm.xs_mtlr(tempForRA);
-+#  endif
-   masm.addToStackPtr(Imm32(framePushedAtStart + newArgDest));
- #else
-   // Push RA to new frame: store RA, restore temp, and move stack.
-@@ -6463,6 +6469,12 @@ static void CollapseWasmFrameSlow(MacroAssembler& masm,
- #ifdef JS_USE_LINK_REGISTER
-   masm.freeStack(reserved);
-   // RA is already in its place, just move stack.
-+#  ifdef JS_CODEGEN_PPC64
-+  // PPC64's LR is not a GPR, so WasmTailCallRAScratchReg is a normal GPR
-+  // (r14). We must explicitly move the trampoline address to LR so the
-+  // callee returns to the trampoline.
-+  masm.xs_mtlr(tempForRA);
-+#  endif
-   masm.addToStackPtr(Imm32(framePushedAtStart + newArgDest));
- #else
-   // Push RA to new frame: store RA, restore temp, and move stack.
-@@ -8527,7 +8539,7 @@ void MacroAssembler::debugAssertCanonicalInt32(Register r) {
-     breakpoint();
-     bind(&ok);
- #    elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--        defined(JS_CODEGEN_RISCV64)
-+        defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-     Label ok;
-     UseScratchRegisterScope temps(*this);
-     Register scratch = temps.Acquire();
-@@ -10567,6 +10579,15 @@ void MacroAssembler::orderedHashTableLookup(Register setOrMapObj,
-   unboxInt32(Address(setOrMapObj, TableObject::offsetOfLiveCount()), temp1);
-   branchTest32(Assembler::Zero, temp1, temp1, &notFound);
- 
-+#if defined(JS_CODEGEN_PPC64)
-+  // If this was preceded by a MoveGroup instruction, the hash may have been
-+  // loaded algebraically since it's an Int32 (and thus sign-extended); the
-+  // operation doesn't know to keep the upper bits clear, failing the assert.
-+  if (isBigInt == IsBigInt::No) {
-+    as_rldicl(hash, hash, 0, 32);
-+  }
-+#endif
-+
- #ifdef DEBUG
-   PushRegsInMask(LiveRegisterSet(RegisterSet::Volatile()));
- 
-diff --git a/js/src/jit/MacroAssembler.h b/js/src/jit/MacroAssembler.h
-index 6c08bb554ca8..754e8642bb57 100644
---- a/js/src/jit/MacroAssembler.h
-+++ b/js/src/jit/MacroAssembler.h
-@@ -23,6 +23,8 @@
- #  include "jit/loong64/MacroAssembler-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/MacroAssembler-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/MacroAssembler-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/MacroAssembler-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-@@ -93,8 +95,9 @@
- //   }
- //   ////}}} check_macroassembler_style
- 
--#define ALL_ARCH mips64, arm, arm64, x86, x64, loong64, riscv64, wasm32
--#define ALL_SHARED_ARCH arm, arm64, loong64, mips64, riscv64, x86_shared, wasm32
-+#define ALL_ARCH mips64, arm, arm64, x86, x64, loong64, riscv64, ppc64, wasm32
-+#define ALL_SHARED_ARCH \
-+  arm, arm64, loong64, mips64, riscv64, ppc64, x86_shared, wasm32
- 
- // * How this macro works:
- //
-@@ -140,6 +143,7 @@
- #define DEFINED_ON_mips64
- #define DEFINED_ON_loong64
- #define DEFINED_ON_riscv64
-+#define DEFINED_ON_ppc64
- #define DEFINED_ON_wasm32
- #define DEFINED_ON_none
- 
-@@ -169,6 +173,9 @@
- #elif defined(JS_CODEGEN_RISCV64)
- #  undef DEFINED_ON_riscv64
- #  define DEFINED_ON_riscv64 define
-+#elif defined(JS_CODEGEN_PPC64)
-+#  undef DEFINED_ON_ppc64
-+#  define DEFINED_ON_ppc64 define
- #elif defined(JS_CODEGEN_WASM32)
- #  undef DEFINED_ON_wasm32
- #  define DEFINED_ON_wasm32 define
-@@ -562,7 +569,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void Pop(const Register64 reg);
-   void PopFlags() DEFINED_ON(x86_shared);
-   void PopStackPtr()
--      DEFINED_ON(arm, mips64, x86_shared, loong64, riscv64, wasm32);
-+      DEFINED_ON(arm, mips64, x86_shared, loong64, riscv64, ppc64, wasm32);
- 
-   // Move the stack pointer based on the requested amount.
-   void adjustStack(int amount);
-@@ -620,9 +627,9 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   // These do not adjust framePushed().
-   void pushReturnAddress()
--      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
-+      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, ppc64, wasm32);
-   void popReturnAddress()
--      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
-+      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, ppc64, wasm32);
- 
-   // Useful for dealing with two-valued returns.
-   void moveRegPair(Register src0, Register src1, Register dst0, Register dst1,
-@@ -641,7 +648,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   CodeOffset farJumpWithPatch() PER_SHARED_ARCH;
-   void patchFarJump(CodeOffset farJump, uint32_t targetOffset) PER_SHARED_ARCH;
-   static void patchFarJump(uint8_t* farJump, uint8_t* target)
--      DEFINED_ON(arm, arm64, x86_shared, loong64, mips64, riscv64);
-+      DEFINED_ON(arm, arm64, x86_shared, loong64, mips64, riscv64, ppc64);
- 
-   // Emit a nop that can be patched to and from a nop and a call with int32
-   // relative displacement.
-@@ -667,9 +674,9 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // target behaviour is only provided for `n` in the range 0 .. 2^31-1
-   // inclusive.
-   CodeOffset move32WithPatch(Register dest)
--      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64);
-+      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64, ppc64);
-   void patchMove32(CodeOffset offset, Imm32 n)
--      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64);
-+      DEFINED_ON(x86_shared, arm, arm64, loong64, mips64, riscv64, ppc64);
- 
-  public:
-   // ===============================================================
-@@ -1174,13 +1181,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   inline void mulPtr(ImmWord rhs, Register srcDest) PER_ARCH;
- 
-   inline void mul64(const Register64& rhs, const Register64& srcDest)
--      DEFINED_ON(x64, arm64, mips64, loong64, riscv64);
-+      DEFINED_ON(x64, arm64, mips64, loong64, riscv64, ppc64);
-   inline void mul64(const Operand& src, const Register64& dest) DEFINED_ON(x64);
-   inline void mul64(const Operand& src, const Register64& dest,
-                     const Register temp) DEFINED_ON(x64);
-   inline void mul64(Imm64 imm, const Register64& dest) PER_ARCH;
-   inline void mul64(Imm64 imm, const Register64& dest, const Register temp)
--      DEFINED_ON(x86, x64, arm, mips64, loong64, riscv64);
-+      DEFINED_ON(x86, x64, arm, mips64, loong64, riscv64, ppc64);
-   inline void mul64(const Register64& src, const Register64& dest,
-                     const Register temp) PER_ARCH;
-   inline void mul64(const Register64& src1, const Register64& src2,
-@@ -1202,11 +1209,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On ARM, the chip must have hardware division instructions.
-   inline void quotient32(Register lhs, Register rhs, Register dest,
-                          bool isUnsigned)
--      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
-+      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32, ppc64);
- 
-   inline void quotient64(Register lhs, Register rhs, Register dest,
-                          bool isUnsigned)
--      DEFINED_ON(arm64, loong64, mips64, riscv64);
-+      DEFINED_ON(arm64, loong64, mips64, riscv64, ppc64);
- 
-   // As above, but lhs and dest must be eax and tempEdx must be edx.
-   inline void quotient32(Register lhs, Register rhs, Register dest,
-@@ -1219,11 +1226,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On ARM, the chip must have hardware division instructions.
-   inline void remainder32(Register lhs, Register rhs, Register dest,
-                           bool isUnsigned)
--      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32);
-+      DEFINED_ON(mips64, arm, arm64, loong64, riscv64, wasm32, ppc64);
- 
-   inline void remainder64(Register lhs, Register rhs, Register dest,
-                           bool isUnsigned)
--      DEFINED_ON(arm64, loong64, mips64, riscv64);
-+      DEFINED_ON(arm64, loong64, mips64, riscv64, ppc64);
- 
-   // As above, but lhs and dest must be eax and tempEdx must be edx.
-   inline void remainder32(Register lhs, Register rhs, Register dest,
-@@ -2080,7 +2087,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   template <typename T>
-   void branchValueIsNurseryCellImpl(Condition cond, const T& value,
-                                     Register temp, Label* label)
--      DEFINED_ON(arm64, x64, mips64, loong64, riscv64);
-+      DEFINED_ON(arm64, x64, mips64, loong64, riscv64, ppc64);
- 
-   template <typename T>
-   inline void branchTestUndefinedImpl(Condition cond, const T& t, Label* label)
-@@ -2245,7 +2252,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // from all the other registers, on all supported targets.
-   inline void wasmAddSubI128HI64(Register lhsLo, Register lhsHi, Register rhsLo,
-                                  Register rhsHi, Register output, bool isAdd)
--      DEFINED_ON(x64, arm64, riscv64, loong64, mips64);
-+      DEFINED_ON(x64, arm64, riscv64, loong64, mips64, ppc64);
- 
-   // Produces the top 64 bits of the 128-bit value `RAX *widen rhs`.  The result
-   // will be in RAX.  RDX is trashed.  `rhs` may not be RAX or RDX.  Callers
-@@ -2256,7 +2263,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // what the registers may be.
-   inline void wasmMulI64WideHI64(Register lhs, Register rhs, Register output,
-                                  bool isSigned)
--      DEFINED_ON(arm64, riscv64, loong64, mips64);
-+      DEFINED_ON(arm64, riscv64, loong64, mips64, ppc64);
- 
-   // ========================================================================
-   // Canonicalization primitives.
-@@ -2355,68 +2362,68 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Moves
- 
-   inline void moveSimd128(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Constants
- 
-   inline void loadConstantSimd128(const SimdConstant& v, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Splat
- 
-   inline void splatX16(Register src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void splatX16(uint32_t srcLane, FloatRegister src, FloatRegister dest)
-       DEFINED_ON(arm64);
- 
-   inline void splatX8(Register src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void splatX8(uint32_t srcLane, FloatRegister src, FloatRegister dest)
-       DEFINED_ON(arm64);
- 
-   inline void splatX4(Register src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void splatX4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void splatX2(Register64 src, FloatRegister dest)
--      DEFINED_ON(x86, x64, arm64);
-+      DEFINED_ON(x86, x64, arm64, ppc64);
- 
-   inline void splatX2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Extract lane as scalar.  Float extraction does not canonicalize the value.
- 
-   inline void extractLaneInt8x16(uint32_t lane, FloatRegister src,
--                                 Register dest) DEFINED_ON(x86_shared, arm64);
-+                                 Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtractLaneInt8x16(uint32_t lane, FloatRegister src,
-                                          Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extractLaneInt16x8(uint32_t lane, FloatRegister src,
--                                 Register dest) DEFINED_ON(x86_shared, arm64);
-+                                 Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtractLaneInt16x8(uint32_t lane, FloatRegister src,
-                                          Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extractLaneInt32x4(uint32_t lane, FloatRegister src,
--                                 Register dest) DEFINED_ON(x86_shared, arm64);
-+                                 Register dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extractLaneInt64x2(uint32_t lane, FloatRegister src,
--                                 Register64 dest) DEFINED_ON(x86, x64, arm64);
-+                                 Register64 dest) DEFINED_ON(x86, x64, arm64, ppc64);
- 
-   inline void extractLaneFloat32x4(uint32_t lane, FloatRegister src,
-                                    FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extractLaneFloat64x2(uint32_t lane, FloatRegister src,
-                                    FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Replace lane value
- 
-@@ -2425,21 +2432,21 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void replaceLaneInt8x16(unsigned lane, Register rhs,
-                                  FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void replaceLaneInt16x8(unsigned lane, FloatRegister lhs, Register rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void replaceLaneInt16x8(unsigned lane, Register rhs,
-                                  FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void replaceLaneInt32x4(unsigned lane, FloatRegister lhs, Register rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void replaceLaneInt32x4(unsigned lane, Register rhs,
-                                  FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void replaceLaneInt64x2(unsigned lane, FloatRegister lhs,
-                                  Register64 rhs, FloatRegister dest)
-@@ -2447,7 +2454,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void replaceLaneInt64x2(unsigned lane, Register64 rhs,
-                                  FloatRegister lhsDest)
--      DEFINED_ON(x86, x64, arm64);
-+      DEFINED_ON(x86, x64, arm64, ppc64);
- 
-   inline void replaceLaneFloat32x4(unsigned lane, FloatRegister lhs,
-                                    FloatRegister rhs, FloatRegister dest)
-@@ -2455,7 +2462,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
-                                    FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void replaceLaneFloat64x2(unsigned lane, FloatRegister lhs,
-                                    FloatRegister rhs, FloatRegister dest)
-@@ -2463,7 +2470,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
-                                    FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Shuffle - blend and permute with immediate indices, and its many
-   // specializations.  Lane values other than those mentioned are illegal.
-@@ -2471,11 +2478,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // lane values 0..31
-   inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
-                              FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister lhs,
-                              FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Lane values must be 0 (select from lhs) or FF (select from rhs).
-   // The behavior is undefined for lane values that are neither 0 nor FF.
-@@ -2502,39 +2509,39 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // The implementation works effectively for I8x16, I16x8, I32x4, and I64x2.
-   inline void laneSelectSimd128(FloatRegister mask, FloatRegister lhs,
-                                 FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveHighInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveHighInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveHighInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveHighInt64x2(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveLowInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                    FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveLowInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                    FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveLowInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                    FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void interleaveLowInt64x2(FloatRegister lhs, FloatRegister rhs,
-                                    FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Permute - permute with immediate indices.
- 
-@@ -2544,7 +2551,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   // lane values 0..7
-   inline void permuteInt16x8(const uint16_t lanes[8], FloatRegister src,
--                             FloatRegister dest) DEFINED_ON(arm64);
-+                             FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   // lane values 0..3 [sic].
-   inline void permuteHighInt16x8(const uint16_t lanes[4], FloatRegister src,
-@@ -2562,80 +2569,80 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   //   low_16_bytes_of((lhs ++ rhs) >> shift*8), shift must be < 16
-   inline void concatAndRightShiftSimd128(FloatRegister lhs, FloatRegister rhs,
-                                          FloatRegister dest, uint32_t shift)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Rotate right by immediate count:
-   //   low_16_bytes_of((src ++ src) >> shift*8), shift must be < 16
-   inline void rotateRightSimd128(FloatRegister src, FloatRegister dest,
--                                 uint32_t shift) DEFINED_ON(arm64);
-+                                 uint32_t shift) DEFINED_ON(arm64, ppc64);
- 
-   // Shift bytes with immediate count, shifting in zeroes.  Shift count 0..15.
- 
-   inline void leftShiftSimd128(Imm32 count, FloatRegister src,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void rightShiftSimd128(Imm32 count, FloatRegister src,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Zero extend int values.
- 
-   inline void zeroExtend8x16To16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
-   inline void zeroExtend8x16To32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
-   inline void zeroExtend8x16To64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
-   inline void zeroExtend16x8To32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
-   inline void zeroExtend16x8To64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
-   inline void zeroExtend32x4To64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Reverse bytes in lanes.
- 
-   inline void reverseInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void reverseInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void reverseInt64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Swizzle - permute with variable indices.  `rhs` holds the lanes parameter.
- 
-   inline void swizzleInt8x16(FloatRegister lhs, FloatRegister rhs,
--                             FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                             FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void swizzleInt8x16Relaxed(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Integer Add
- 
-   inline void addInt8x16(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void addInt16x8(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void addInt32x4(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void addInt64x2(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addInt64x2(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2643,13 +2650,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Integer Subtract
- 
-   inline void subInt8x16(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void subInt16x8(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2658,24 +2665,24 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void subInt32x4(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subInt64x2(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void subInt64x2(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Integer Multiply
- 
-   inline void mulInt16x8(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void mulInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void mulInt32x4(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void mulInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2691,100 +2698,100 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void mulInt64x2(FloatRegister lhs, FloatRegister rhs,
-                          FloatRegister dest, FloatRegister temp1,
--                         FloatRegister temp2) DEFINED_ON(arm64);
-+                         FloatRegister temp2) DEFINED_ON(arm64, ppc64);
- 
-   // Note for the extMul opcodes, the NxM designation is for the input lanes;
-   // the output lanes are twice as wide.
-   inline void extMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                        FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                        FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                        FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void q15MulrSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Integer Negate
- 
-   inline void negInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void negInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void negInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void negInt64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Saturating integer add
- 
-   inline void addSatInt8x16(FloatRegister lhs, FloatRegister rhs,
--                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                             FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedAddSatInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedAddSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                                     FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void addSatInt16x8(FloatRegister lhs, FloatRegister rhs,
--                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                             FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedAddSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedAddSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                                     FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2792,27 +2799,27 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Saturating integer subtract
- 
-   inline void subSatInt8x16(FloatRegister lhs, FloatRegister rhs,
--                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                             FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedSubSatInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedSubSatInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                                     FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void subSatInt16x8(FloatRegister lhs, FloatRegister rhs,
--                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                             FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedSubSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedSubSatInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                                     FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2820,40 +2827,40 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Lane-wise integer minimum
- 
-   inline void minInt8x16(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedMinInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedMinInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void minInt16x8(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedMinInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedMinInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void minInt32x4(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedMinInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedMinInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2861,40 +2868,40 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Lane-wise integer maximum
- 
-   inline void maxInt8x16(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedMaxInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedMaxInt8x16(FloatRegister lhs, const SimdConstant& rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void maxInt16x8(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedMaxInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedMaxInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void maxInt32x4(FloatRegister lhs, FloatRegister rhs,
--                         FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                         FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                          FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedMaxInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedMaxInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                                  FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -2903,25 +2910,25 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void unsignedAverageInt8x16(FloatRegister lhs, FloatRegister rhs,
-                                      FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedAverageInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                      FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Lane-wise integer absolute value
- 
-   inline void absInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void absInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void absInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void absInt64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Left shift by scalar. Immediates and variable shifts must have been
-   // masked; shifts of zero will work but may or may not generate code.
-@@ -2930,41 +2937,41 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                                FloatRegister temp) DEFINED_ON(x86_shared);
- 
-   inline void leftShiftInt8x16(FloatRegister lhs, Register rhs,
--                               FloatRegister dest) DEFINED_ON(arm64);
-+                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void leftShiftInt8x16(Imm32 count, FloatRegister src,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void leftShiftInt16x8(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void leftShiftInt16x8(FloatRegister lhs, Register rhs,
--                               FloatRegister dest) DEFINED_ON(arm64);
-+                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void leftShiftInt16x8(Imm32 count, FloatRegister src,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void leftShiftInt32x4(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void leftShiftInt32x4(FloatRegister lhs, Register rhs,
--                               FloatRegister dest) DEFINED_ON(arm64);
-+                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void leftShiftInt32x4(Imm32 count, FloatRegister src,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void leftShiftInt64x2(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void leftShiftInt64x2(FloatRegister lhs, Register rhs,
--                               FloatRegister dest) DEFINED_ON(arm64);
-+                               FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void leftShiftInt64x2(Imm32 count, FloatRegister src,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Right shift by scalar. Immediates and variable shifts must have been
-   // masked; shifts of zero will work but may or may not generate code.
-@@ -2973,82 +2980,82 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                                 FloatRegister temp) DEFINED_ON(x86_shared);
- 
-   inline void rightShiftInt8x16(FloatRegister lhs, Register rhs,
--                                FloatRegister dest) DEFINED_ON(arm64);
-+                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void rightShiftInt8x16(Imm32 count, FloatRegister src,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedRightShiftInt8x16(Register rhs, FloatRegister lhsDest,
-                                         FloatRegister temp)
-       DEFINED_ON(x86_shared);
- 
-   inline void unsignedRightShiftInt8x16(FloatRegister lhs, Register rhs,
--                                        FloatRegister dest) DEFINED_ON(arm64);
-+                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void rightShiftInt16x8(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void rightShiftInt16x8(FloatRegister lhs, Register rhs,
--                                FloatRegister dest) DEFINED_ON(arm64);
-+                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void rightShiftInt16x8(Imm32 count, FloatRegister src,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedRightShiftInt16x8(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void unsignedRightShiftInt16x8(FloatRegister lhs, Register rhs,
--                                        FloatRegister dest) DEFINED_ON(arm64);
-+                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void rightShiftInt32x4(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void rightShiftInt32x4(FloatRegister lhs, Register rhs,
--                                FloatRegister dest) DEFINED_ON(arm64);
-+                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void rightShiftInt32x4(Imm32 count, FloatRegister src,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedRightShiftInt32x4(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void unsignedRightShiftInt32x4(FloatRegister lhs, Register rhs,
--                                        FloatRegister dest) DEFINED_ON(arm64);
-+                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void rightShiftInt64x2(Register rhs, FloatRegister lhsDest,
-                                 FloatRegister temp) DEFINED_ON(x86_shared);
- 
-   inline void rightShiftInt64x2(Imm32 count, FloatRegister src,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void rightShiftInt64x2(FloatRegister lhs, Register rhs,
--                                FloatRegister dest) DEFINED_ON(arm64);
-+                                FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void unsignedRightShiftInt64x2(Register rhs, FloatRegister lhsDest)
-       DEFINED_ON(x86_shared);
- 
-   inline void unsignedRightShiftInt64x2(FloatRegister lhs, Register rhs,
--                                        FloatRegister dest) DEFINED_ON(arm64);
-+                                        FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
-                                         FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Sign replication operation
- 
-@@ -3067,47 +3074,47 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Bitwise and, or, xor, not
- 
-   inline void bitwiseAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseAndSimd128(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseAndSimd128(FloatRegister lhs, const SimdConstant& rhs,
-                                 FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void bitwiseOrSimd128(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseOrSimd128(FloatRegister lhs, FloatRegister rhs,
-                                FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseOrSimd128(FloatRegister lhs, const SimdConstant& rhs,
-                                FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void bitwiseXorSimd128(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseXorSimd128(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseXorSimd128(FloatRegister lhs, const SimdConstant& rhs,
-                                 FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void bitwiseNotSimd128(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Bitwise AND with compliment: dest = lhs & ~rhs, note only arm64 can do it.
-   inline void bitwiseAndNotSimd128(FloatRegister lhs, FloatRegister rhs,
--                                   FloatRegister lhsDest) DEFINED_ON(arm64);
-+                                   FloatRegister lhsDest) DEFINED_ON(arm64, ppc64);
- 
-   // Bitwise AND with complement: dest = ~lhs & rhs, note this is not what Wasm
-   // wants but what the x86 hardware offers.  Hence the name.
- 
-   inline void bitwiseNotAndSimd128(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void bitwiseNotAndSimd128(FloatRegister lhs, FloatRegister rhs,
-                                    FloatRegister lhsDest)
-@@ -3120,34 +3127,34 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                                    FloatRegister temp) DEFINED_ON(x86_shared);
- 
-   inline void bitwiseSelectSimd128(FloatRegister onTrue, FloatRegister onFalse,
--                                   FloatRegister maskDest) DEFINED_ON(arm64);
-+                                   FloatRegister maskDest) DEFINED_ON(arm64, ppc64);
- 
-   // Population count
- 
-   inline void popcntInt8x16(FloatRegister src, FloatRegister dest,
--                            FloatRegister temp) DEFINED_ON(x86_shared);
-+                            FloatRegister temp) DEFINED_ON(x86_shared, ppc64);
- 
-   inline void popcntInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   // Any lane true, ie, any bit set
- 
-   inline void anyTrueSimd128(FloatRegister src, Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // All lanes true
- 
-   inline void allTrueInt8x16(FloatRegister src, Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void allTrueInt16x8(FloatRegister src, Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void allTrueInt32x4(FloatRegister src, Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void allTrueInt64x2(FloatRegister src, Register dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Bitmask, ie extract and compress high bits of all lanes
- 
-@@ -3155,31 +3162,31 @@ class MacroAssembler : public MacroAssemblerSpecific {
-       DEFINED_ON(x86_shared);
- 
-   inline void bitmaskInt8x16(FloatRegister src, Register dest,
--                             FloatRegister temp) DEFINED_ON(arm64);
-+                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
- 
-   inline void bitmaskInt16x8(FloatRegister src, Register dest)
-       DEFINED_ON(x86_shared);
- 
-   inline void bitmaskInt16x8(FloatRegister src, Register dest,
--                             FloatRegister temp) DEFINED_ON(arm64);
-+                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
- 
-   inline void bitmaskInt32x4(FloatRegister src, Register dest)
-       DEFINED_ON(x86_shared);
- 
-   inline void bitmaskInt32x4(FloatRegister src, Register dest,
--                             FloatRegister temp) DEFINED_ON(arm64);
-+                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
- 
-   inline void bitmaskInt64x2(FloatRegister src, Register dest)
-       DEFINED_ON(x86_shared);
- 
-   inline void bitmaskInt64x2(FloatRegister src, Register dest,
--                             FloatRegister temp) DEFINED_ON(arm64);
-+                             FloatRegister temp) DEFINED_ON(arm64, ppc64);
- 
-   // Comparisons (integer and floating-point)
- 
-   inline void compareInt8x16(Assembler::Condition cond, FloatRegister rhs,
-                              FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // On x86_shared, limited to !=, ==, <=, >
-   inline void compareInt8x16(Assembler::Condition cond, FloatRegister lhs,
-@@ -3189,15 +3196,15 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On arm64, use any integer comparison condition.
-   inline void compareInt8x16(Assembler::Condition cond, FloatRegister lhs,
-                              FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void compareInt16x8(Assembler::Condition cond, FloatRegister rhs,
-                              FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void compareInt16x8(Assembler::Condition cond, FloatRegister lhs,
-                              FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // On x86_shared, limited to !=, ==, <=, >
-   inline void compareInt16x8(Assembler::Condition cond, FloatRegister lhs,
-@@ -3207,7 +3214,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On x86_shared, limited to !=, ==, <=, >
-   inline void compareInt32x4(Assembler::Condition cond, FloatRegister rhs,
-                              FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void compareInt32x4(Assembler::Condition cond, FloatRegister lhs,
-                              const SimdConstant& rhs, FloatRegister dest)
-@@ -3216,7 +3223,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On arm64, use any integer comparison condition.
-   inline void compareInt32x4(Assembler::Condition cond, FloatRegister lhs,
-                              FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void compareForEqualityInt64x2(Assembler::Condition cond,
-                                         FloatRegister lhs, FloatRegister rhs,
-@@ -3230,15 +3237,15 @@ class MacroAssembler : public MacroAssemblerSpecific {
-       DEFINED_ON(x86_shared);
- 
-   inline void compareInt64x2(Assembler::Condition cond, FloatRegister rhs,
--                             FloatRegister lhsDest) DEFINED_ON(arm64);
-+                             FloatRegister lhsDest) DEFINED_ON(arm64, ppc64);
- 
-   inline void compareInt64x2(Assembler::Condition cond, FloatRegister lhs,
-                              FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   inline void compareFloat32x4(Assembler::Condition cond, FloatRegister rhs,
-                                FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // On x86_shared, limited to ==, !=, <, <=
-   inline void compareFloat32x4(Assembler::Condition cond, FloatRegister lhs,
-@@ -3249,11 +3256,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On arm64, use any float-point comparison condition.
-   inline void compareFloat32x4(Assembler::Condition cond, FloatRegister lhs,
-                                FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void compareFloat64x2(Assembler::Condition cond, FloatRegister rhs,
-                                FloatRegister lhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // On x86_shared, limited to ==, !=, <, <=
-   inline void compareFloat64x2(Assembler::Condition cond, FloatRegister lhs,
-@@ -3264,7 +3271,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // On arm64, use any float-point comparison condition.
-   inline void compareFloat64x2(Assembler::Condition cond, FloatRegister lhs,
-                                FloatRegister rhs, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Load
- 
-@@ -3273,92 +3280,92 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline FaultingCodeOffset loadUnalignedSimd128(const Address& src,
-                                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline FaultingCodeOffset loadUnalignedSimd128(const BaseIndex& src,
-                                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Store
- 
-   inline FaultingCodeOffset storeUnalignedSimd128(FloatRegister src,
-                                                   const Address& dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline FaultingCodeOffset storeUnalignedSimd128(FloatRegister src,
-                                                   const BaseIndex& dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating point negation
- 
-   inline void negFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void negFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating point absolute value
- 
-   inline void absFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void absFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // NaN-propagating minimum
- 
-   inline void minFloat32x4(FloatRegister lhs, FloatRegister rhs,
-                            FloatRegister dest, FloatRegister temp1,
--                           FloatRegister temp2) DEFINED_ON(x86_shared);
-+                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
- 
-   inline void minFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   inline void minFloat32x4(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(arm64);
-+                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void minFloat64x2(FloatRegister lhs, FloatRegister rhs,
-                            FloatRegister dest, FloatRegister temp1,
--                           FloatRegister temp2) DEFINED_ON(x86_shared);
-+                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
- 
-   inline void minFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   inline void minFloat64x2(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(arm64);
-+                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   // NaN-propagating maximum
- 
-   inline void maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
-                            FloatRegister dest, FloatRegister temp1,
--                           FloatRegister temp2) DEFINED_ON(x86_shared);
-+                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
- 
-   inline void maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   inline void maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(arm64);
-+                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   inline void maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
-                            FloatRegister dest, FloatRegister temp1,
--                           FloatRegister temp2) DEFINED_ON(x86_shared);
-+                           FloatRegister temp2) DEFINED_ON(x86_shared, ppc64);
- 
-   inline void maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   inline void maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(arm64);
-+                           FloatRegister dest) DEFINED_ON(arm64, ppc64);
- 
-   // Floating add
- 
-   inline void addFloat32x4(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void addFloat64x2(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void addFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -3366,13 +3373,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Floating subtract
- 
-   inline void subFloat32x4(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void subFloat64x2(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void subFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -3380,13 +3387,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Floating division
- 
-   inline void divFloat32x4(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void divFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void divFloat64x2(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void divFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -3394,13 +3401,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Floating Multiply
- 
-   inline void mulFloat32x4(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void mulFloat32x4(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void mulFloat64x2(FloatRegister lhs, FloatRegister rhs,
--                           FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void mulFloat64x2(FloatRegister lhs, const SimdConstant& rhs,
-                            FloatRegister dest) DEFINED_ON(x86_shared);
-@@ -3408,91 +3415,91 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Pairwise add
- 
-   inline void extAddPairwiseInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtAddPairwiseInt8x16(FloatRegister src,
-                                             FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void extAddPairwiseInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedExtAddPairwiseInt16x8(FloatRegister src,
-                                             FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating square root
- 
-   inline void sqrtFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void sqrtFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Integer to floating point with rounding
- 
-   inline void convertInt32x4ToFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedConvertInt32x4ToFloat32x4(FloatRegister src,
-                                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void convertInt32x4ToFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedConvertInt32x4ToFloat64x2(FloatRegister src,
-                                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating point to integer with saturation
- 
-   inline void truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
-                                                  FloatRegister dest,
-                                                  FloatRegister temp)
--      DEFINED_ON(x86_shared);
-+      DEFINED_ON(x86_shared, ppc64);
- 
-   inline void unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
-                                                  FloatRegister dest)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   inline void truncSatFloat64x2ToInt32x4(FloatRegister src, FloatRegister dest,
-                                          FloatRegister temp)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedTruncSatFloat64x2ToInt32x4(FloatRegister src,
-                                                  FloatRegister dest,
-                                                  FloatRegister temp)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void truncFloat32x4ToInt32x4Relaxed(FloatRegister src,
-                                              FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedTruncFloat32x4ToInt32x4Relaxed(FloatRegister src,
-                                                      FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void truncFloat64x2ToInt32x4Relaxed(FloatRegister src,
-                                              FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedTruncFloat64x2ToInt32x4Relaxed(FloatRegister src,
-                                                      FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating point narrowing
- 
-   inline void convertFloat64x2ToFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating point widening
- 
-   inline void convertFloat32x4ToFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Integer to integer narrowing
- 
-@@ -3500,65 +3507,65 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                             FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void narrowInt16x8(FloatRegister lhs, FloatRegister rhs,
--                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedNarrowInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                                     FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedNarrowInt16x8(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void narrowInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                             FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void narrowInt32x4(FloatRegister lhs, FloatRegister rhs,
--                            FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                            FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedNarrowInt32x4(FloatRegister lhs, const SimdConstant& rhs,
-                                     FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void unsignedNarrowInt32x4(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Integer to integer widening
- 
-   inline void widenLowInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void widenHighInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedWidenLowInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedWidenHighInt8x16(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void widenLowInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void widenHighInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedWidenLowInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedWidenHighInt16x8(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void widenLowInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedWidenLowInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void widenHighInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void unsignedWidenHighInt32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Compare-based minimum/maximum
-   //
-@@ -3570,47 +3577,47 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
-                                  FloatRegister lhsOrLhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMinFloat32x4(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
-                                  FloatRegister lhsOrLhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMinFloat64x2(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
-                                  FloatRegister lhsOrLhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMaxFloat32x4(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
-                                  FloatRegister lhsOrLhsDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void pseudoMaxFloat64x2(FloatRegister lhs, FloatRegister rhs,
-                                  FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Widening/pairwise integer dot product
- 
-   inline void widenDotInt16x8(FloatRegister lhs, FloatRegister rhs,
--                              FloatRegister dest) DEFINED_ON(x86_shared, arm64);
-+                              FloatRegister dest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void widenDotInt16x8(FloatRegister lhs, const SimdConstant& rhs,
-                               FloatRegister dest) DEFINED_ON(x86_shared);
- 
-   inline void dotInt8x16Int7x16(FloatRegister lhs, FloatRegister rhs,
-                                 FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void dotInt8x16Int7x16ThenAdd(FloatRegister lhs, FloatRegister rhs,
-                                        FloatRegister dest)
-@@ -3618,81 +3625,81 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   inline void dotInt8x16Int7x16ThenAdd(FloatRegister lhs, FloatRegister rhs,
-                                        FloatRegister dest, FloatRegister temp)
--      DEFINED_ON(arm64);
-+      DEFINED_ON(arm64, ppc64);
- 
-   // Floating point rounding
- 
-   inline void ceilFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void ceilFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void floorFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void floorFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void truncFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void truncFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void nearestFloat32x4(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   // Floating multiply-accumulate: srcDest [+-]= src1 * src2
- 
-   inline void fmaFloat32x4(FloatRegister src1, FloatRegister src2,
--                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void fnmaFloat32x4(FloatRegister src1, FloatRegister src2,
-                             FloatRegister srcDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void fmaFloat64x2(FloatRegister src1, FloatRegister src2,
--                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
-+                           FloatRegister srcDest) DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void fnmaFloat64x2(FloatRegister src1, FloatRegister src2,
-                             FloatRegister srcDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minFloat32x4Relaxed(FloatRegister src, FloatRegister srcDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
-                                   FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxFloat32x4Relaxed(FloatRegister src, FloatRegister srcDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
-                                   FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minFloat64x2Relaxed(FloatRegister src, FloatRegister srcDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void minFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
-                                   FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxFloat64x2Relaxed(FloatRegister src, FloatRegister srcDest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void maxFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
-                                   FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-   inline void q15MulrInt16x8Relaxed(FloatRegister lhs, FloatRegister rhs,
-                                     FloatRegister dest)
--      DEFINED_ON(x86_shared, arm64);
-+      DEFINED_ON(x86_shared, arm64, ppc64);
- 
-  public:
-   // ========================================================================
-@@ -3717,10 +3724,10 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   // temp required on x86 and x64; must be undefined on mips64 and loong64.
-   void convertUInt64ToFloat32(Register64 src, FloatRegister dest, Register temp)
--      DEFINED_ON(arm64, mips64, loong64, riscv64, wasm32, x64, x86);
-+      DEFINED_ON(arm64, mips64, loong64, ppc64, riscv64, wasm32, x64, x86);
- 
-   void convertInt64ToFloat32(Register64 src, FloatRegister dest)
--      DEFINED_ON(arm64, mips64, loong64, riscv64, wasm32, x64, x86);
-+      DEFINED_ON(arm64, mips64, loong64, ppc64, riscv64, wasm32, x64, x86);
- 
-   bool convertUInt64ToDoubleNeedsTemp() PER_ARCH;
- 
-@@ -3801,16 +3808,16 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // Scalar::Int64.
-   void wasmLoad(const wasm::MemoryAccessDesc& access, Register memoryBase,
-                 Register ptr, Register ptrScratch, AnyRegister output)
--      DEFINED_ON(arm, loong64, riscv64, mips64);
-+      DEFINED_ON(arm, loong64, riscv64, mips64, ppc64);
-   void wasmLoadI64(const wasm::MemoryAccessDesc& access, Register memoryBase,
-                    Register ptr, Register ptrScratch, Register64 output)
--      DEFINED_ON(arm, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, mips64, loong64, riscv64, ppc64);
-   void wasmStore(const wasm::MemoryAccessDesc& access, AnyRegister value,
-                  Register memoryBase, Register ptr, Register ptrScratch)
--      DEFINED_ON(arm, loong64, riscv64, mips64);
-+      DEFINED_ON(arm, loong64, riscv64, mips64, ppc64);
-   void wasmStoreI64(const wasm::MemoryAccessDesc& access, Register64 value,
-                     Register memoryBase, Register ptr, Register ptrScratch)
--      DEFINED_ON(arm, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, mips64, loong64, riscv64, ppc64);
- 
-   // These accept general memoryBase + ptr + offset (in `access`); the offset is
-   // always smaller than the guard region.  They will insert an additional add
-@@ -3889,11 +3896,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void wasmTruncateDoubleToInt64(FloatRegister input, Register64 output,
-                                  bool isSaturating, Label* oolEntry,
-                                  Label* oolRejoin, FloatRegister tempDouble)
--      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
-+      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
-   void wasmTruncateDoubleToUInt64(FloatRegister input, Register64 output,
-                                   bool isSaturating, Label* oolEntry,
-                                   Label* oolRejoin, FloatRegister tempDouble)
--      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
-+      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
-   void oolWasmTruncateCheckF64ToI64(FloatRegister input, Register64 output,
-                                     TruncFlags flags,
-                                     const wasm::TrapSiteDesc& trapSiteDesc,
-@@ -3902,11 +3909,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void wasmTruncateFloat32ToInt64(FloatRegister input, Register64 output,
-                                   bool isSaturating, Label* oolEntry,
-                                   Label* oolRejoin, FloatRegister tempDouble)
--      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
-+      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
-   void wasmTruncateFloat32ToUInt64(FloatRegister input, Register64 output,
-                                    bool isSaturating, Label* oolEntry,
-                                    Label* oolRejoin, FloatRegister tempDouble)
--      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32);
-+      DEFINED_ON(arm64, x86, x64, mips64, loong64, riscv64, wasm32, ppc64);
-   void oolWasmTruncateCheckF32ToI64(FloatRegister input, Register64 output,
-                                     TruncFlags flags,
-                                     const wasm::TrapSiteDesc& trapSiteDesc,
-@@ -4220,7 +4227,8 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   // convention, which requires predictable high bits.  In practice, this means
-   // that the 32-bit value will be zero-extended or sign-extended to 64 bits as
-   // appropriate for the platform.
--  void widenInt32(Register r) DEFINED_ON(arm64, x64, mips64, loong64, riscv64);
-+  void widenInt32(Register r)
-+      DEFINED_ON(arm64, x64, mips64, loong64, riscv64, ppc64);
- 
-   // As enterFakeExitFrame(), but using register conventions appropriate for
-   // wasm stubs.
-@@ -4287,13 +4295,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                        const Address& mem, Register expected,
-                        Register replacement, Register valueTemp,
-                        Register offsetTemp, Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void compareExchange(Scalar::Type type, Synchronization sync,
-                        const BaseIndex& mem, Register expected,
-                        Register replacement, Register valueTemp,
-                        Register offsetTemp, Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   // x86: `expected` and `output` must be edx:eax; `replacement` is ecx:ebx.
-   // x64: `output` must be rax.
-@@ -4303,12 +4311,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void compareExchange64(Synchronization sync, const Address& mem,
-                          Register64 expected, Register64 replacement,
-                          Register64 output)
--      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
- 
-   void compareExchange64(Synchronization sync, const BaseIndex& mem,
-                          Register64 expected, Register64 replacement,
-                          Register64 output)
--      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
- 
-   // Exchange with memory.  Return the value initially in memory.
-   // MIPS: `valueTemp`, `offsetTemp` and `maskTemp` must be defined for 8-bit
-@@ -4325,12 +4333,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void atomicExchange(Scalar::Type type, Synchronization sync,
-                       const Address& mem, Register value, Register valueTemp,
-                       Register offsetTemp, Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicExchange(Scalar::Type type, Synchronization sync,
-                       const BaseIndex& mem, Register value, Register valueTemp,
-                       Register offsetTemp, Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   // x86: `value` must be ecx:ebx; `output` must be edx:eax.
-   // ARM: `value` and `output` must be distinct and (even,odd) pairs.
-@@ -4338,11 +4346,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   void atomicExchange64(Synchronization sync, const Address& mem,
-                         Register64 value, Register64 output)
--      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
- 
-   void atomicExchange64(Synchronization sync, const BaseIndex& mem,
-                         Register64 value, Register64 output)
--      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, x64, x86, mips64, loong64, riscv64, ppc64);
- 
-   // Read-modify-write with memory.  Return the value in memory before the
-   // operation.
-@@ -4376,12 +4384,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void atomicFetchOp(Scalar::Type type, Synchronization sync, AtomicOp op,
-                      Register value, const Address& mem, Register valueTemp,
-                      Register offsetTemp, Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicFetchOp(Scalar::Type type, Synchronization sync, AtomicOp op,
-                      Register value, const BaseIndex& mem, Register valueTemp,
-                      Register offsetTemp, Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   // x86:
-   //   `temp` must be ecx:ebx; `output` must be edx:eax.
-@@ -4395,7 +4403,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   void atomicFetchOp64(Synchronization sync, AtomicOp op, Register64 value,
-                        const Address& mem, Register64 temp, Register64 output)
--      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64, ppc64);
- 
-   void atomicFetchOp64(Synchronization sync, AtomicOp op, const Address& value,
-                        const Address& mem, Register64 temp, Register64 output)
-@@ -4403,7 +4411,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   void atomicFetchOp64(Synchronization sync, AtomicOp op, Register64 value,
-                        const BaseIndex& mem, Register64 temp, Register64 output)
--      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, x64, mips64, loong64, riscv64, ppc64);
- 
-   void atomicFetchOp64(Synchronization sync, AtomicOp op, const Address& value,
-                        const BaseIndex& mem, Register64 temp, Register64 output)
-@@ -4421,14 +4429,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
- 
-   void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
-                         const Address& mem, Register64 temp)
--      DEFINED_ON(arm, arm64, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64);
- 
-   void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
-                         const BaseIndex& mem) DEFINED_ON(x64);
- 
-   void atomicEffectOp64(Synchronization sync, AtomicOp op, Register64 value,
-                         const BaseIndex& mem, Register64 temp)
--      DEFINED_ON(arm, arm64, mips64, loong64, riscv64);
-+      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64);
- 
-   // 64-bit atomic load. On 64-bit systems, use regular load with
-   // Synchronization::Load, not this method.
-@@ -4481,14 +4489,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                            Register replacement, Register valueTemp,
-                            Register offsetTemp, Register maskTemp,
-                            Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void wasmCompareExchange(const wasm::MemoryAccessDesc& access,
-                            const BaseIndex& mem, Register expected,
-                            Register replacement, Register valueTemp,
-                            Register offsetTemp, Register maskTemp,
-                            Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
-                           const Address& mem, Register value, Register output)
-@@ -4502,13 +4510,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                           const Address& mem, Register value,
-                           Register valueTemp, Register offsetTemp,
-                           Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
-                           const BaseIndex& mem, Register value,
-                           Register valueTemp, Register offsetTemp,
-                           Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                          Register value, const Address& mem, Register temp,
-@@ -4529,13 +4537,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                          Register value, const Address& mem, Register valueTemp,
-                          Register offsetTemp, Register maskTemp,
--                         Register output) DEFINED_ON(mips64, loong64, riscv64);
-+                         Register output)
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                          Register value, const BaseIndex& mem,
-                          Register valueTemp, Register offsetTemp,
-                          Register maskTemp, Register output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   // Read-modify-write with memory.  Return no value.
-   //
-@@ -4562,13 +4571,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                           Register value, const Address& mem,
-                           Register valueTemp, Register offsetTemp,
-                           Register maskTemp)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                           Register value, const BaseIndex& mem,
-                           Register valueTemp, Register offsetTemp,
-                           Register maskTemp)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   // 64-bit wide operations.
- 
-@@ -4626,12 +4635,12 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                            Register64 value, const Address& mem,
-                            Register64 temp, Register64 output)
--      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, x64);
-+      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64, x64);
- 
-   void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                            Register64 value, const BaseIndex& mem,
-                            Register64 temp, Register64 output)
--      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, x64);
-+      DEFINED_ON(arm, arm64, mips64, loong64, riscv64, ppc64, x64);
- 
-   void wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access, AtomicOp op,
-                            const Address& value, const Address& mem,
-@@ -4684,14 +4693,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                          Register replacement, Register valueTemp,
-                          Register offsetTemp, Register maskTemp, Register temp,
-                          AnyRegister output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void compareExchangeJS(Scalar::Type arrayType, Synchronization sync,
-                          const BaseIndex& mem, Register expected,
-                          Register replacement, Register valueTemp,
-                          Register offsetTemp, Register maskTemp, Register temp,
-                          AnyRegister output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicExchangeJS(Scalar::Type arrayType, Synchronization sync,
-                         const Address& mem, Register value, Register temp,
-@@ -4705,13 +4714,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                         const Address& mem, Register value, Register valueTemp,
-                         Register offsetTemp, Register maskTemp, Register temp,
-                         AnyRegister output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicExchangeJS(Scalar::Type arrayType, Synchronization sync,
-                         const BaseIndex& mem, Register value,
-                         Register valueTemp, Register offsetTemp,
-                         Register maskTemp, Register temp, AnyRegister output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicFetchOpJS(Scalar::Type arrayType, Synchronization sync,
-                        AtomicOp op, Register value, const Address& mem,
-@@ -4737,13 +4746,13 @@ class MacroAssembler : public MacroAssemblerSpecific {
-                        AtomicOp op, Register value, const Address& mem,
-                        Register valueTemp, Register offsetTemp,
-                        Register maskTemp, Register temp, AnyRegister output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicFetchOpJS(Scalar::Type arrayType, Synchronization sync,
-                        AtomicOp op, Register value, const BaseIndex& mem,
-                        Register valueTemp, Register offsetTemp,
-                        Register maskTemp, Register temp, AnyRegister output)
--      DEFINED_ON(mips64, loong64, riscv64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
-                         AtomicOp op, Register value, const Address& mem,
-@@ -4764,12 +4773,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
-                         AtomicOp op, Register value, const Address& mem,
-                         Register valueTemp, Register offsetTemp,
--                        Register maskTemp) DEFINED_ON(mips64, loong64, riscv64);
-+                        Register maskTemp)
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicEffectOpJS(Scalar::Type arrayType, Synchronization sync,
-                         AtomicOp op, Register value, const BaseIndex& mem,
-                         Register valueTemp, Register offsetTemp,
--                        Register maskTemp) DEFINED_ON(mips64, loong64, riscv64);
-+                        Register maskTemp)
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64);
- 
-   void atomicIsLockFreeJS(Register value, Register output);
- 
-@@ -5928,7 +5939,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
-   inline void addStackPtrTo(T t);
- 
-   void subFromStackPtr(Imm32 imm32)
--      DEFINED_ON(mips64, loong64, riscv64, wasm32, arm, x86, x64);
-+      DEFINED_ON(mips64, loong64, riscv64, ppc64, wasm32, arm, x86, x64);
-   void subFromStackPtr(Register reg);
- 
-   template <typename T>
-diff --git a/js/src/jit/MoveEmitter.h b/js/src/jit/MoveEmitter.h
-index 642829c070d6..3a883c596ca0 100644
---- a/js/src/jit/MoveEmitter.h
-+++ b/js/src/jit/MoveEmitter.h
-@@ -17,6 +17,8 @@
- #  include "jit/loong64/MoveEmitter-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/MoveEmitter-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/MoveEmitter-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/MoveEmitter-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/MoveResolver.cpp b/js/src/jit/MoveResolver.cpp
-index d2e1f12700bd..8e622407a0a8 100644
---- a/js/src/jit/MoveResolver.cpp
-+++ b/js/src/jit/MoveResolver.cpp
-@@ -57,6 +57,22 @@ bool MoveResolver::addMove(const MoveOperand& from, const MoveOperand& to,
-                            MoveOp::Type type) {
-   // Assert that we're not doing no-op moves.
-   MOZ_ASSERT(!(from == to));
-+#ifdef JS_CODEGEN_PPC64
-+  // PPC64 FloatRegisters expose Single/Double kinds that have distinct code()
-+  // values but share one physical register. The register allocator can emit a
-+  // move between two such kind-views of the same FPR (e.g. f2-Double to
-+  // f2-Single); these are no-ops on the hardware, are not caught by the
-+  // (from == to) assert above, and would otherwise trip the
-+  // !from().aliases(to()) invariant the resolver relies on later. Drop them.
-+  //
-+  // This would be correct for any backend whose FloatRegister has multiple
-+  // kinds aliasing one physical register, and could be un-gated if another
-+  // such backend needs it, but it is scoped to PPC64 so move resolution on
-+  // tier-1 platforms is left unchanged.
-+  if (from.aliases(to)) {
-+    return true;
-+  }
-+#endif
-   PendingMove* pm = movePool_.allocate(from, to, type);
-   if (!pm) {
-     return false;
-diff --git a/js/src/jit/RegisterAllocator.h b/js/src/jit/RegisterAllocator.h
-index eda9933f6322..42e48111046a 100644
---- a/js/src/jit/RegisterAllocator.h
-+++ b/js/src/jit/RegisterAllocator.h
-@@ -262,9 +262,10 @@ class RegisterAllocator {
-  public:
-   template <typename TakeableSet>
-   static void takeWasmRegisters(TakeableSet& regs) {
--#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||      \
--    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||         \
-+    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||    \
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
-+    defined(JS_CODEGEN_PPC64)
-     regs.take(HeapReg);
- #endif
-     MOZ_ASSERT(!regs.has(FramePointer));
-diff --git a/js/src/jit/Registers.h b/js/src/jit/Registers.h
-index e0d02e2fb60d..423777ce38cd 100644
---- a/js/src/jit/Registers.h
-+++ b/js/src/jit/Registers.h
-@@ -20,6 +20,8 @@
- #  include "jit/loong64/Architecture-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/Architecture-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/Architecture-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/Architecture-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/Safepoints.cpp b/js/src/jit/Safepoints.cpp
-index 42e305f053af..8e3a25c3c5ff 100644
---- a/js/src/jit/Safepoints.cpp
-+++ b/js/src/jit/Safepoints.cpp
-@@ -63,6 +63,11 @@ static void WriteFloatRegisterMask(CompactBufferWriter& stream,
-       stream.writeUnsigned64(bits.low());
-       stream.writeUnsigned64(bits.high());
-       break;
-+#elif defined(JS_CODEGEN_PPC64)
-+    case 16:
-+      stream.writeUnsigned64(static_cast<uint64_t>(bits));
-+      stream.writeUnsigned64(static_cast<uint64_t>(bits >> 64));
-+      break;
- #else
-     case 1:
-       stream.writeByte(bits);
-@@ -88,6 +93,12 @@ static FloatRegisters::SetType ReadFloatRegisterMask(
-       uint64_t high = stream.readUnsigned64();
-       return Bitset128(high, low);
-     }
-+#elif defined(JS_CODEGEN_PPC64)
-+    case 16: {
-+      uint64_t low = stream.readUnsigned64();
-+      uint64_t high = stream.readUnsigned64();
-+      return FloatRegisters::SetType(high) << 64 | FloatRegisters::SetType(low);
-+    }
- #else
-     case 1:
-       return stream.readByte();
-diff --git a/js/src/jit/SharedICHelpers-inl.h b/js/src/jit/SharedICHelpers-inl.h
-index eedccc831732..1005b140f1df 100644
---- a/js/src/jit/SharedICHelpers-inl.h
-+++ b/js/src/jit/SharedICHelpers-inl.h
-@@ -19,6 +19,8 @@
- #  include "jit/loong64/SharedICHelpers-loong64-inl.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/SharedICHelpers-riscv64-inl.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/SharedICHelpers-ppc64-inl.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/SharedICHelpers-wasm32-inl.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/SharedICHelpers.h b/js/src/jit/SharedICHelpers.h
-index 1ebd61e44509..f2703c6f986c 100644
---- a/js/src/jit/SharedICHelpers.h
-+++ b/js/src/jit/SharedICHelpers.h
-@@ -19,6 +19,8 @@
- #  include "jit/loong64/SharedICHelpers-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/SharedICHelpers-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/SharedICHelpers-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/SharedICHelpers-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/SharedICRegisters.h b/js/src/jit/SharedICRegisters.h
-index c3ab86bf0a82..5b270d0c166a 100644
---- a/js/src/jit/SharedICRegisters.h
-+++ b/js/src/jit/SharedICRegisters.h
-@@ -19,6 +19,8 @@
- #  include "jit/loong64/SharedICRegisters-loong64.h"
- #elif defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/SharedICRegisters-riscv64.h"
-+#elif defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/SharedICRegisters-ppc64.h"
- #elif defined(JS_CODEGEN_WASM32)
- #  include "jit/wasm32/SharedICRegisters-wasm32.h"
- #elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/jit/Simulator.h b/js/src/jit/Simulator.h
-index 39503716f10d..9f60baf53198 100644
---- a/js/src/jit/Simulator.h
-+++ b/js/src/jit/Simulator.h
-@@ -15,6 +15,8 @@
- #  include "jit/loong64/Simulator-loong64.h"
- #elif defined(JS_SIMULATOR_RISCV64)
- #  include "jit/riscv64/Simulator-riscv64.h"
-+#elif defined(JS_SIMULATOR_PPC64)
-+#  include "jit/ppc64/Simulator-ppc64.h"
- #elif defined(JS_SIMULATOR)
- #  error "Unexpected simulator platform"
- #endif
-diff --git a/js/src/jit/moz.build b/js/src/jit/moz.build
-index 5b5df3e5b7b2..36ef65d6221a 100644
---- a/js/src/jit/moz.build
-+++ b/js/src/jit/moz.build
-@@ -228,6 +228,18 @@ elif CONFIG["JS_CODEGEN_LOONG64"]:
-     ]
-     if CONFIG["JS_SIMULATOR_LOONG64"]:
-         UNIFIED_SOURCES += ["loong64/Simulator-loong64.cpp"]
-+elif CONFIG["JS_CODEGEN_PPC64"]:
-+    UNIFIED_SOURCES += [
-+        "ppc64/Architecture-ppc64.cpp",
-+        "ppc64/Assembler-ppc64.cpp",
-+        "ppc64/CodeGenerator-ppc64.cpp",
-+        "ppc64/Lowering-ppc64.cpp",
-+        "ppc64/MacroAssembler-ppc64.cpp",
-+        "ppc64/MoveEmitter-ppc64.cpp",
-+        "ppc64/Trampoline-ppc64.cpp",
-+    ]
-+    if CONFIG["JS_SIMULATOR_PPC64"]:
-+        UNIFIED_SOURCES += ["ppc64/Simulator-ppc64.cpp"]
- elif CONFIG["JS_CODEGEN_RISCV64"]:
-     UNIFIED_SOURCES += [
-         "riscv64/Architecture-riscv64.cpp",
-diff --git a/js/src/jit/ppc64/Architecture-ppc64.cpp b/js/src/jit/ppc64/Architecture-ppc64.cpp
-new file mode 100644
-index 000000000000..5632865556ac
---- /dev/null
-+++ b/js/src/jit/ppc64/Architecture-ppc64.cpp
-@@ -0,0 +1,221 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/Architecture-ppc64.h"
-+
-+#ifndef JS_SIMULATOR
-+#  include <sys/auxv.h>
-+#endif
-+
-+#include "jit/FlushICache.h"  // js::jit::FlushICache
-+#include "jit/RegisterSets.h"
-+
-+namespace js {
-+namespace jit {
-+
-+Registers::Code Registers::FromName(const char* name) {
-+  // Scan the name table in code order; the first exact match wins.
-+  for (size_t code = 0; code != Total; ++code) {
-+    if (!strcmp(name, GetName(code))) {
-+      return Code(code);
-+    }
-+  }
-+  return Invalid;  // No register is spelled `name`.
-+}
-+
-+FloatRegisters::Code FloatRegisters::FromName(const char* name) {
-+  // Codes are visited in ascending order, so a name shared by several
-+  // kinds resolves to its lowest code.
-+  size_t idx = 0;
-+  while (idx < Total && strcmp(GetName(idx), name) != 0) {
-+    idx++;
-+  }
-+  return idx < Total ? Code(idx) : Invalid;
-+}
-+
-+FloatRegisterSet FloatRegister::ReduceSetForPush(const FloatRegisterSet& s) {
-+  // Each kind owns a TotalPhys-wide field of the set, at bit kind * TotalPhys.
-+  const uint32_t simdShift =
-+      uint32_t(FloatRegisters::Simd128) * FloatRegisters::TotalPhys;
-+  const uint32_t doubleShift =
-+      uint32_t(FloatRegisters::Double) * FloatRegisters::TotalPhys;
-+  const uint32_t singleShift =
-+      uint32_t(FloatRegisters::Single) * FloatRegisters::TotalPhys;
-+
-+  const SetType bits = s.bits();
-+  const SetType simdRegs = (bits >> simdShift) & FloatRegisters::AllPhysMask;
-+  const SetType doubleRegs = (bits >> doubleShift) & FloatRegisters::AllPhysMask;
-+  const SetType singleRegs = (bits >> singleShift) & FloatRegisters::AllPhysMask;
-+
-+  // Single+Double share physical FPRs (push as Double, 8-byte slot);
-+  // Simd128 lives in its own physical VRs (push as Simd128, 16-byte
-+  // slot). Different physical pools — no dedup. Note that
-+  // sizeof(FloatRegisters::RegisterContent) is 8 bytes (no v128 in the
-+  // union), so RegisterDump::FPUArray is 32 × 8 = 256 bytes, matching
-+  // the Float-only layout PushRegsInMask produces.
-+
-+  // Rebuild the set: merged FPRs tagged Double, VRs still tagged Simd128.
-+  SetType pushed = (singleRegs | doubleRegs) << doubleShift;
-+  pushed |= simdRegs << simdShift;
-+  return FloatRegisterSet(pushed);
-+}
-+
-+uint32_t FloatRegister::GetPushSizeInBytes(const FloatRegisterSet& s) {
-+  const SetType bits = s.bits();
-+  const SetType physMask = FloatRegisters::AllPhysMask;
-+  // Isolate one kind's TotalPhys-wide field of the set.
-+  auto fieldOf = [bits, physMask](FloatRegisters::Kind kind) -> SetType {
-+    return (bits >> (uint32_t(kind) * FloatRegisters::TotalPhys)) & physMask;
-+  };
-+
-+  // Natural per-kind slot sizes. See ReduceSetForPush comment.
-+  // Single and Double name the same FPRs, so a register present under
-+  // both kinds is counted once; Simd128 registers are counted separately.
-+  const SetType fprs =
-+      fieldOf(FloatRegisters::Single) | fieldOf(FloatRegisters::Double);
-+  const SetType vrs = fieldOf(FloatRegisters::Simd128);
-+
-+  const uint32_t fprCount = std::popcount(static_cast<uint64_t>(fprs));
-+  const uint32_t vrCount = std::popcount(static_cast<uint64_t>(vrs));
-+
-+  return fprCount * sizeof(double) + vrCount * 16;
-+}
-+
-+uint32_t FloatRegister::getRegisterDumpOffsetInBytes() {
-+  // Simd128 encodes as 32-63: fold back to 0-31 to index the 32-slot FPUArray
-+  // (Simd128 should never be in a SafepointState/BailoutState anyway).
-+  const uint32_t slot = encoding() & 31;
-+  return slot * sizeof(FloatRegisters::RegisterContent);
-+}
-+
-+static bool sPOWER9Detected = false;
-+static bool sPOWER10Detected = false;
-+static bool sCPUFlagsComputed = false;
-+
-+#ifndef JS_SIMULATOR
-+// Cache line sizes, 0 until PPC64Flags::Init() reads them from the ELF auxv.
-+// Init() falls back to 32 bytes (safe minimum per LuaJIT/LLVM compiler-rt).
-+static size_t sDCacheLineSize = 0;
-+static size_t sICacheLineSize = 0;
-+#endif
-+
-+void PPC64Flags::Init() {
-+  if (sCPUFlagsComputed) {  // Already probed: repeated calls are no-ops.
-+    return;
-+  }
-+#ifndef JS_SIMULATOR
-+  unsigned long hwcap2 = getauxval(AT_HWCAP2);
-+  // PPC_FEATURE2_ARCH_3_00 = 0x00800000 (ISA 3.0 / POWER9)
-+  sPOWER9Detected = (hwcap2 & 0x00800000) != 0;
-+  // PPC_FEATURE2_ARCH_3_1 = 0x00040000 (ISA 3.1 / POWER10)
-+  sPOWER10Detected = (hwcap2 & 0x00040000) != 0;
-+  // Allow forcing POWER8 mode for testing: MOZ_PPC64_FORCE_POWER8=1.
-+  // P10 implies P9; downgrade clears both.
-+  const char* forceP8 = getenv("MOZ_PPC64_FORCE_POWER8");
-+  if (forceP8 && forceP8[0] == '1') {  // Only the first character is tested.
-+    sPOWER9Detected = false;
-+    sPOWER10Detected = false;
-+  }
-+
-+  size_t dcache = getauxval(AT_DCACHEBSIZE);
-+  size_t icache = getauxval(AT_ICACHEBSIZE);
-+  sDCacheLineSize = dcache ? dcache : 32;  // A reported size of 0 becomes 32.
-+  sICacheLineSize = icache ? icache : 32;
-+#endif
-+  // FORCE_POWER9/10 opt into the corresponding ISA fast paths. Useful under
-+  // the simulator; on real silicon below the gated level they are foot-guns
-+  // because the CPU will trap on undefined ops. Outside the JS_SIMULATOR
-+  // guard so the sim can opt in via env.
-+  // They are applied after FORCE_POWER8, so they win if both are set.
-+  // FORCE_POWER10 also implies FORCE_POWER9 — this matches what real-P10
-+  // silicon advertises in hwcap2 (both ARCH_3_00 and ARCH_3_1 bits set), so
-+  // we don't ask sim users to pass both vars separately.
-+  const char* forceP9 = getenv("MOZ_PPC64_FORCE_POWER9");
-+  if (forceP9 && forceP9[0] == '1') {
-+    sPOWER9Detected = true;
-+  }
-+  const char* forceP10 = getenv("MOZ_PPC64_FORCE_POWER10");
-+  if (forceP10 && forceP10[0] == '1') {
-+    sPOWER10Detected = true;
-+    sPOWER9Detected = true;
-+  }
-+  sCPUFlagsComputed = true;  // Set last, once every flag above is final.
-+}
-+
-+bool HasPOWER9() {
-+  MOZ_ASSERT(sCPUFlagsComputed);  // PPC64Flags::Init() must have run.
-+  return sPOWER9Detected;  // hwcap2 ISA 3.0 bit, or a MOZ_PPC64_FORCE_* override.
-+}
-+
-+bool HasPOWER10() {
-+  MOZ_ASSERT(sCPUFlagsComputed);  // PPC64Flags::Init() must have run.
-+  return sPOWER10Detected;  // hwcap2 ISA 3.1 bit, or a MOZ_PPC64_FORCE_* override.
-+}
-+
-+bool CPUFlagsHaveBeenComputed() { return sCPUFlagsComputed; }  // True once PPC64Flags::Init() has finished.
-+
-+// Per-bit feature flags packed into the wasm code signature. Adding a
-+// new bit (e.g., POWER10, VSX4) should be a 1-line change here plus a
-+// corresponding HasPOWER10()/IsVSX4Available() probe above. The value
-+// is also assert-checked into a fixed-width field in
-+// js/src/wasm/WasmCompile.cpp — if that field ever overflows, widen
-+// it there before landing more bits here.
-+uint32_t GetPPC64Flags() {
-+  uint32_t flags = 0;
-+  if (sPOWER9Detected) {  // Read directly: no Init()-has-run assert here.
-+    flags |= PPC64Flag_POWER9;
-+  }
-+  return flags;  // NOTE(review): sPOWER10Detected is not reported — confirm no codegen is gated on HasPOWER10().
-+}
-+
-+void FlushICache(void* code, size_t size) {
-+#if defined(JS_SIMULATOR)
-+  js::jit::SimulatorProcess::FlushICache(code, size);
-+#else
-+  // PPC64 has incoherent I/D caches. GCC's __builtin___clear_cache is a
-+  // no-op on PPC64 Linux, so we implement the flush explicitly.
-+  // This follows the same approach as QEMU (util/cacheflush.c) and the
-+  // Linux kernel (arch/powerpc/mm/cacheflush.c):
-+  //   dcbst loop -> sync -> icbi loop -> sync -> isync
-+  if (!size) {
-+    return;
-+  }
-+  MOZ_ASSERT(sCPUFlagsComputed,
-+             "PPC64Flags::Init must run before any FlushICache call");
-+  // MOZ_ASSERT vanishes in release builds: a still-zero line size would
-+  // make both loops spin forever on address 0, so fall back to 32 bytes.
-+  const uintptr_t dLine = sDCacheLineSize ? sDCacheLineSize : 32;
-+  const uintptr_t iLine = sICacheLineSize ? sICacheLineSize : 32;
-+  uintptr_t start = reinterpret_cast<uintptr_t>(code);
-+  uintptr_t end = start + size;
-+  // Step 1: Write back data cache to memory.
-+  for (uintptr_t addr = start & ~(dLine - 1); addr < end; addr += dLine) {
-+    asm volatile("dcbst 0, %0" : : "r"(addr) : "memory");
-+  }
-+  asm volatile("sync" ::: "memory");
-+
-+  // Step 2: Invalidate instruction cache.
-+  for (uintptr_t addr = start & ~(iLine - 1); addr < end; addr += iLine) {
-+    asm volatile("icbi 0, %0" : : "r"(addr) : "memory");
-+  }
-+  // The extra sync before isync matches the Linux kernel and QEMU.
-+  // It ensures all icbi operations complete before the pipeline flush.
-+  asm volatile("sync" ::: "memory");
-+  asm volatile("isync" ::: "memory");
-+#endif
-+}
-+
-+void FlushExecutionContext() {
-+#if !defined(JS_SIMULATOR)  // Under the simulator this function body is empty.
-+  // PPC64's isync flushes the instruction pipeline on the current core,
-+  // ensuring any previously invalidated icache entries are discarded and
-+  // instructions are re-fetched from coherent memory.
-+  asm volatile("isync" ::: "memory");
-+#endif
-+}
-+
-+}  // namespace jit
-+}  // namespace js
-diff --git a/js/src/jit/ppc64/Architecture-ppc64.h b/js/src/jit/ppc64/Architecture-ppc64.h
-new file mode 100644
-index 000000000000..efaab0b0c854
---- /dev/null
-+++ b/js/src/jit/ppc64/Architecture-ppc64.h
-@@ -0,0 +1,581 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_Architecture_ppc64_h
-+#define jit_ppc64_Architecture_ppc64_h
-+
-+#include <algorithm>
-+#include <bit>
-+
-+#include "jit/shared/Architecture-shared.h"
-+
-+#include "js/Utility.h"
-+
-+namespace js {
-+namespace jit {
-+
-+// PPC64 has 32 64-bit general purpose registers, r0 through r31.
-+// The program counter is not directly accessible as a register.
-+// The link register (LR) and count register (CTR) are SPRs.
-+
-+// PPC64 ELFv2 GPR Convention:
-+//  Name    Usage
-+//  r0      Volatile, cannot be base register in load/store
-+//  r1      Stack pointer (callee-saved)
-+//  r2      TOC pointer (reserved)
-+//  r3      Return value / first argument
-+//  r4-r10  Arguments 2-8
-+//  r11     Environment pointer / scratch
-+//  r12     Branch target / scratch
-+//  r13     Thread pointer (reserved, TLS)
-+//  r14-r31 Callee-saved
-+
-+// PPC64 ELFv2 FPR Convention:
-+//  f0      Scratch
-+//  f1-f13  Arguments / volatile
-+//  f14-f31 Callee-saved
-+
-+class Registers {
-+ public:
-+  enum RegisterID {
-+    r0 = 0,
-+    r1,
-+    r2,
-+    r3,
-+    r4,
-+    r5,
-+    r6,
-+    r7,
-+    r8,
-+    r9,
-+    r10,
-+    r11,
-+    r12,
-+    r13,
-+    r14,
-+    r15,
-+    r16,
-+    r17,
-+    r18,
-+    r19,
-+    r20,
-+    r21,
-+    r22,
-+    r23,
-+    r24,
-+    r25,
-+    r26,
-+    r27,
-+    r28,
-+    r29,
-+    r30,
-+    r31,
-+    sp = r1,
-+    invalid_reg = r31 + 1,  // Explicit: a bare enumerator here would be sp + 1 == r2.
-+  };
-+  typedef uint8_t Code;
-+  typedef RegisterID Encoding;
-+  typedef uint32_t SetType;  // One bit per GPR, bit n == rn.
-+
-+  static const Encoding StackPointer = sp;
-+  static const Encoding Invalid = invalid_reg;  // One past r31; never a real GPR.
-+
-+  union RegisterContent {
-+    uintptr_t r;
-+  };
-+
-+  static uint32_t SetSize(SetType x) { return std::popcount(x); }
-+  static uint32_t FirstBit(SetType x) {
-+    MOZ_ASSERT(x);
-+    return std::countr_zero(x);
-+  }
-+  static uint32_t LastBit(SetType x) {
-+    MOZ_ASSERT(x);
-+    return std::bit_width(x) - 1;
-+  }
-+
-+  static const char* GetName(uint32_t code) {
-+    static const char* const Names[] = {
-+        "r0",  "sp",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
-+        "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
-+        "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
-+        "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"};
-+    static_assert(Total == std::size(Names), "Table is the correct size");
-+    if (code >= Total) {
-+      return "invalid";
-+    }
-+    return Names[code];
-+  }
-+
-+  static Code FromName(const char* name);
-+
-+  static const uint32_t Total = 32;
-+  static const uint32_t TotalPhys = 32;
-+  static const uint32_t Allocatable = 24;  // Total minus the 8 NonAllocatableMask bits.
-+
-+  static const SetType AllMask = 0xFFFFFFFF;
-+  static const SetType NoneMask = 0x0;
-+
-+  static const SetType ArgRegMask =
-+      (1U << Registers::r3) | (1U << Registers::r4) | (1U << Registers::r5) |
-+      (1U << Registers::r6) | (1U << Registers::r7) | (1U << Registers::r8) |
-+      (1U << Registers::r9) | (1U << Registers::r10);
-+
-+  // r0, r11, r12 are also volatile but handled separately.
-+  static const SetType VolatileMask = ArgRegMask;
-+
-+  // ELFv2 callee-saved GPRs are r14..r31. r2 (TOC) and r13 (TLS) are
-+  // dedicated registers, NOT general callee-saved: r2 is restored by the
-+  // PLT-call linkage convention (`ld r2, 24(r1)` after every cross-module
-+  // call); r13 is the thread pointer and must NEVER be written. Including
-+  // them here previously made `PushRegsInMask(NonVolatileMask)` save and
-+  // restore them — wasted 16 bytes per wasm-stub frame at best, latent
-+  // TLS corruption if save/restore were ever misordered. Verified that
-+  // no JIT-emitted code writes r2 or r13 (both are NonAllocatable, and
-+  // grep across js/src/jit/ppc64/ finds no `as_*` site assigning to
-+  // them), so they're preserved across the JIT body for free.
-+  static const SetType NonVolatileMask =
-+      (1U << Registers::r14) |
-+      (1U << Registers::r15) | (1U << Registers::r16) | (1U << Registers::r17) |
-+      (1U << Registers::r18) | (1U << Registers::r19) | (1U << Registers::r20) |
-+      (1U << Registers::r21) | (1U << Registers::r22) | (1U << Registers::r23) |
-+      (1U << Registers::r24) | (1U << Registers::r25) | (1U << Registers::r26) |
-+      (1U << Registers::r27) | (1U << Registers::r28) | (1U << Registers::r29) |
-+      (1U << Registers::r30) | (1U << Registers::r31);
-+
-+  static const SetType NonAllocatableMask =
-+      (1U << Registers::r0) |   // Cannot be base in load/store.
-+      (1U << Registers::sp) |   // Stack pointer.
-+      (1U << Registers::r2) |   // TOC pointer (ELFv2).
-+      (1U << Registers::r11) |  // Third scratch.
-+      (1U << Registers::r12) |  // Second scratch / addressTempRegister.
-+      (1U << Registers::r13) |  // Thread-local storage (ELFv2).
-+      (1U << Registers::r16) |  // Saved scratch register.
-+      (1U << Registers::r31);   // Frame pointer.
-+
-+  static const SetType WrapperMask = VolatileMask;
-+
-+  // Registers returned from a JS -> JS call.
-+  static const SetType JSCallMask = (1U << Registers::r5);
-+
-+  // Registers returned from a JS -> C call.
-+  static const SetType CallMask = (1U << Registers::r3);
-+
-+  static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
-+};
-+
-+typedef uint32_t PackedRegisterMask;
-+
-+template <typename T>
-+class TypedRegisterSet;
-+
-+class FloatRegisters {
-+ public:
-+  enum FPRegisterID : int {  // Fixed type: FloatRegister::encoding() casts 32-63 into it.
-+    f0 = 0,
-+    f1,
-+    f2,
-+    f3,
-+    f4,
-+    f5,
-+    f6,
-+    f7,
-+    f8,
-+    f9,
-+    f10,
-+    f11,
-+    f12,
-+    f13,
-+    f14,
-+    f15,
-+    f16,
-+    f17,
-+    f18,
-+    f19,
-+    f20,
-+    f21,
-+    f22,
-+    f23,
-+    f24,
-+    f25,
-+    f26,
-+    f27,
-+    f28,
-+    f29,
-+    f30,
-+    f31,
-+  };
-+
-+  // Eight bits: (invalid << 7) | (kind << 5) | encoding
-+  typedef uint8_t Code;
-+  typedef FPRegisterID Encoding;
-+  // 3 kinds × 32 regs = 96 bits needed. Use __uint128_t.
-+  typedef __uint128_t SetType;
-+
-+  enum Kind : uint8_t { Double, Single, Simd128, NumTypes };
-+
-+  static constexpr Code Invalid = 0x80;
-+
-+  static const char* GetName(uint32_t code) {
-+    static const char* const Names[] = {
-+        "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
-+        "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
-+        "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
-+        "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"};
-+    static_assert(TotalPhys == std::size(Names), "Table is the correct size");
-+    if (code >= Total) {
-+      return "invalid";
-+    }
-+    return Names[code % TotalPhys];  // All three kinds share the "fN" spelling.
-+  }
-+
-+  static Code FromName(const char* name);
-+
-+  static const uint32_t TotalPhys = 32;
-+  static const uint32_t Total = TotalPhys * NumTypes;
-+  static const uint32_t Allocatable = 31;  // Without f0, the scratch register.
-+
-+  static_assert(sizeof(SetType) * 8 >= Total,
-+                "SetType should be large enough to enumerate all registers.");
-+
-+  static const SetType SpreadSingle = SetType(1)
-+                                      << (uint32_t(Single) * TotalPhys);
-+  static const SetType SpreadDouble = SetType(1)
-+                                      << (uint32_t(Double) * TotalPhys);
-+  static const SetType SpreadSimd128 = SetType(1)
-+                                       << (uint32_t(Simd128) * TotalPhys);
-+  static const SetType Spread = SpreadSingle | SpreadDouble | SpreadSimd128;
-+
-+  static const SetType AllPhysMask = ((SetType(1) << TotalPhys) - 1);
-+  static const SetType AllMask = AllPhysMask * Spread;
-+  static const SetType AllSingleMask = AllPhysMask * SpreadSingle;
-+  static const SetType AllDoubleMask = AllPhysMask * SpreadDouble;
-+  static const SetType AllSimd128Mask = AllPhysMask * SpreadSimd128;
-+  static const SetType NoneMask = SetType(0);
-+
-+  // ELFv2: f14-f31 are non-volatile (callee-saved) for scalar FP; this mask
-+  // covers the Single and Double views only (Simd128 has its own mask below).
-+  static const SetType NonVolatilePhysMask =
-+      SetType((1U << FloatRegisters::f14) | (1U << FloatRegisters::f15) |
-+              (1U << FloatRegisters::f16) | (1U << FloatRegisters::f17) |
-+              (1U << FloatRegisters::f18) | (1U << FloatRegisters::f19) |
-+              (1U << FloatRegisters::f20) | (1U << FloatRegisters::f21) |
-+              (1U << FloatRegisters::f22) | (1U << FloatRegisters::f23) |
-+              (1U << FloatRegisters::f24) | (1U << FloatRegisters::f25) |
-+              (1U << FloatRegisters::f26) | (1U << FloatRegisters::f27) |
-+              (1U << FloatRegisters::f28) | (1U << FloatRegisters::f29) |
-+              (1U << FloatRegisters::f30) | (1U << FloatRegisters::f31));
-+  // Simd128 lives in VR-namespace (VSR32-63 = VR0-VR31). Per ELFv2 ABI,
-+  // VR20-VR31 are non-volatile (callee-saved). Encoding storage is 20-31
-+  // with kind=Simd128.
-+  static const SetType SimdNonVolatilePhysMask =
-+      SetType((1U << 20) | (1U << 21) | (1U << 22) | (1U << 23) |
-+              (1U << 24) | (1U << 25) | (1U << 26) | (1U << 27) |
-+              (1U << 28) | (1U << 29) | (1U << 30) | (1U << 31));
-+  static const SetType NonVolatileMask =
-+      NonVolatilePhysMask * (SpreadSingle | SpreadDouble) |
-+      SimdNonVolatilePhysMask * SpreadSimd128;
-+
-+  static const SetType VolatileMask = AllMask & ~NonVolatileMask;
-+
-+  static const SetType WrapperMask = VolatileMask;
-+
-+  // f0 is the scratch register (all three views: single, double, simd128).
-+  static const SetType NonAllocatableMask =
-+      (SetType(1) << FloatRegisters::f0) * Spread;
-+
-+  static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
-+
-+  union RegisterContent {
-+    float s;
-+    double d;
-+    // No v128 here. Simd128 lives in physically-distinct VRs (VSR32-63)
-+    // and never reaches RegisterDump (asserted by SafepointState; bailout
-+    // AllRegs excludes Simd128). With v128 in the union, sizeof was 16,
-+    // forcing PushRegsInMask to a 16-byte stride that mismatched
-+    // addressOfRegister's 8-byte walk via (*iter).size().
-+  };
-+
-+  static constexpr Encoding encoding(Code c) { return Encoding(c & 31); }
-+
-+  static constexpr Kind kind(Code c) { return Kind((c >> 5) & 3); }
-+
-+  static constexpr Code fromParts(uint32_t encoding, uint32_t kind,
-+                                  uint32_t invalid) {
-+    return Code((invalid << 7) | (kind << 5) | encoding);
-+  }
-+};
-+
-+// SpillSlotSize must fit the widest register class (Simd128 = 16 bytes).
-+// We can't derive from sizeof(FloatRegisters::RegisterContent) — that
-+// union is sized for FPRs only (8 bytes since v128 lives in distinct
-+// VRs, not in the FPR union), so deriving would under-reserve for
-+// Simd128 cycle breaks. SpillSlotSize is consumed only by MoveEmitter
-+// and is not part of the JIT frame layout.
-+static const uint32_t SpillSlotSize = 16;
-+
-+// PPC64 ELFv2 ABI: the callee saves LR at [caller_SP+16], CR at
-+// [caller_SP+8], and may save TOC at [caller_SP+24]. Reserve 32 bytes
-+// (the minimum ELFv2 stack frame) as a shadow area for every ABI call.
-+static constexpr uint32_t ShadowStackSpace = 32;
-+static const uint32_t SizeOfReturnAddressAfterCall = 0;
-+
-+// PPC64 branch instructions have a 26-bit signed offset field, giving a
-+// range of +/- 32MB. We reduce this to leave room for jump island insertion.
-+static constexpr uint32_t JumpImmediateRange = (32 * 1024 * 1024) - 32;
-+
-+// Size of each bailout table entry (a single bl instruction).
-+static const uint32_t BAILOUT_TABLE_ENTRY_SIZE = 4;
-+
-+// PPC64 special purpose registers (not exposed to the allocator).
-+enum SPRegisterID {
-+  spr_xer = 1,
-+  spr_lr = 8,   // Link register.
-+  spr_ctr = 9,  // Count register.
-+  spr_vrsave = 256,
-+  invalid_spreg  // Implicitly spr_vrsave + 1 == 257.
-+};
-+
-+// PPC64 condition registers.
-+enum CRegisterID { cr0 = 0, cr1, cr5 = 5, cr6, cr7, invalid_creg };  // cr2-cr4 are not named; invalid_creg == 8.
-+
-+struct FloatRegister {
-+  typedef FloatRegisters Codes;
-+  typedef size_t Code;
-+  typedef Codes::Encoding Encoding;
-+  typedef Codes::SetType SetType;
-+
-+  static uint32_t SetSize(SetType x) {
-+    // Single/Double fold onto shared FPRs; Simd128 (separate VRs, see
-+    // aliases()) is counted on its own instead of being merged with them.
-+    SetType fpr = (x | (x >> FloatRegisters::TotalPhys)) & FloatRegisters::AllPhysMask;
-+    SetType vr = (x >> (2 * FloatRegisters::TotalPhys)) & FloatRegisters::AllPhysMask;
-+    return std::popcount(static_cast<uint64_t>(fpr)) + std::popcount(static_cast<uint64_t>(vr));
-+  }
-+
-+  // __uint128_t helpers for FirstBit/LastBit.
-+  static uint32_t FirstBit(SetType x) {
-+    MOZ_ASSERT(x);
-+    uint64_t lo = static_cast<uint64_t>(x);
-+    if (lo) {
-+      return std::countr_zero(lo);
-+    }
-+    return 64 + std::countr_zero(static_cast<uint64_t>(x >> 64));
-+  }
-+  static uint32_t LastBit(SetType x) {
-+    MOZ_ASSERT(x);
-+    uint64_t hi = static_cast<uint64_t>(x >> 64);
-+    if (hi) {
-+      return 64 + (std::bit_width(hi) - 1);
-+    }
-+    return std::bit_width(static_cast<uint64_t>(x)) - 1;
-+  }
-+
-+ private:
-+  uint8_t encoding_;  // Register number within its kind, 0-31.
-+  uint8_t kind_;
-+  bool invalid_;
-+
-+  typedef Codes::Kind Kind;
-+
-+ public:
-+  constexpr FloatRegister(Encoding encoding, Kind kind)
-+      : encoding_(encoding), kind_(kind), invalid_(false) {}
-+
-+  constexpr FloatRegister()
-+      : encoding_(0), kind_(FloatRegisters::Double), invalid_(true) {}
-+
-+  static FloatRegister FromCode(uint32_t i) {
-+    MOZ_ASSERT(i < Codes::Total);
-+    return FloatRegister(FloatRegisters::encoding(i), FloatRegisters::kind(i));
-+  }
-+
-+  bool isSingle() const {
-+    MOZ_ASSERT(!invalid_);
-+    return kind_ == FloatRegisters::Single;
-+  }
-+  bool isDouble() const {
-+    MOZ_ASSERT(!invalid_);
-+    return kind_ == FloatRegisters::Double;
-+  }
-+  bool isSimd128() const {
-+    MOZ_ASSERT(!invalid_);
-+    return kind_ == FloatRegisters::Simd128;
-+  }
-+  bool isInvalid() const { return invalid_; }
-+
-+  FloatRegister asSingle() const {
-+    MOZ_ASSERT(!invalid_);
-+    return FloatRegister(Encoding(encoding_), FloatRegisters::Single);
-+  }
-+  FloatRegister asDouble() const {
-+    MOZ_ASSERT(!invalid_);
-+    return FloatRegister(Encoding(encoding_), FloatRegisters::Double);
-+  }
-+  FloatRegister asSimd128() const {
-+    MOZ_ASSERT(!invalid_);
-+    return FloatRegister(Encoding(encoding_), FloatRegisters::Simd128);
-+  }
-+
-+  constexpr uint32_t size() const {
-+    MOZ_ASSERT(!invalid_);
-+    if (kind_ == FloatRegisters::Double) {
-+      return sizeof(double);
-+    }
-+    if (kind_ == FloatRegisters::Single) {
-+      return sizeof(float);
-+    }
-+    MOZ_ASSERT(kind_ == FloatRegisters::Simd128);
-+    return 16;
-+  }
-+
-+  constexpr Code code() const {
-+    return Codes::fromParts(encoding_, kind_, invalid_);
-+  }
-+
-+  constexpr Encoding encoding() const {
-+    MOZ_ASSERT(!invalid_);
-+    // Simd128 lives in VR-namespace at VSR32-63 (= VR0-31). Single/Double
-+    // share FPR namespace at VSR0-31. The unified XX-form encoders split
-+    // the result into low-5-bit VRT/VRA/VRB + TX/AX/BX bits; VMX
-+    // FloatRegister-taking encoders mask with `& 31` for the raw VR
-+    // field. So 32+E flows correctly through both paths.
-+    return Encoding(encoding_ +
-+                    (kind_ == FloatRegisters::Simd128 ? 32 : 0));
-+  }
-+
-+  const char* name() const { return FloatRegisters::GetName(code()); }
-+  bool volatile_() const {
-+    MOZ_ASSERT(!invalid_);
-+    return !!((SetType(1) << code()) & FloatRegisters::VolatileMask);
-+  }
-+  constexpr bool operator!=(FloatRegister other) const {
-+    return code() != other.code();
-+  }
-+  constexpr bool operator==(FloatRegister other) const {
-+    return code() == other.code();
-+  }
-+
-+  bool aliases(FloatRegister other) const {
-+    // Register-class partition: {Single, Double} share FPRs (VSR0-31);
-+    // Simd128 lives in VR-namespace (VSR32-63). FPR f5 (Single/Double
-+    // encoding 5) and VR v5 (Simd128 encoding 5) are distinct physical
-+    // registers.
-+    if (encoding_ != other.encoding_) return false;
-+    bool selfSimd = (kind_ == FloatRegisters::Simd128);
-+    bool otherSimd = (other.kind_ == FloatRegisters::Simd128);
-+    return selfSimd == otherSimd;
-+  }
-+  bool equiv(FloatRegister other) const {
-+    MOZ_ASSERT(!invalid_);
-+    return kind_ == other.kind_;
-+  }
-+
-+  uint32_t numAliased() const {
-+    return (kind_ == FloatRegisters::Simd128) ? 1 : 2;
-+  }
-+  uint32_t numAlignedAliased() { return numAliased(); }
-+
-+  FloatRegister aliased(uint32_t aliasIdx) {
-+    MOZ_ASSERT(!invalid_);
-+    MOZ_ASSERT(aliasIdx < numAliased());
-+    if (kind_ == FloatRegisters::Simd128) {
-+      return *this;
-+    }
-+    Kind otherKind = (kind_ == FloatRegisters::Single)
-+                         ? FloatRegisters::Double
-+                         : FloatRegisters::Single;
-+    Kind selectedKind = (aliasIdx == 0) ? Kind(kind_) : otherKind;  // Index 0 is the register itself.
-+    return FloatRegister(Encoding(encoding_), selectedKind);
-+  }
-+  FloatRegister alignedAliased(uint32_t aliasIdx) {
-+    MOZ_ASSERT(aliasIdx < numAliased());
-+    return aliased(aliasIdx);
-+  }
-+  SetType alignedOrDominatedAliasedSet() const {
-+    if (kind_ == FloatRegisters::Simd128) {
-+      return SetType(1) << ((uint32_t(FloatRegisters::Simd128) *
-+                             FloatRegisters::TotalPhys) +
-+                            encoding_);
-+    }
-+    return (Codes::SpreadSingle | Codes::SpreadDouble) << encoding_;
-+  }
-+
-+  static constexpr RegTypeName DefaultType = RegTypeName::Float64;
-+
-+  template <RegTypeName Name = DefaultType>
-+  static SetType LiveAsIndexableSet(SetType s) {
-+    return SetType(0);
-+  }
-+
-+  template <RegTypeName Name = DefaultType>
-+  static SetType AllocatableAsIndexableSet(SetType s) {
-+    static_assert(Name != RegTypeName::Any, "Allocatable set are not iterable");
-+    return LiveAsIndexableSet<Name>(s);
-+  }
-+
-+  static TypedRegisterSet<FloatRegister> ReduceSetForPush(
-+      const TypedRegisterSet<FloatRegister>& s);
-+  static uint32_t GetPushSizeInBytes(const TypedRegisterSet<FloatRegister>& s);
-+  uint32_t getRegisterDumpOffsetInBytes();
-+};
-+
-+template <>
-+inline FloatRegister::SetType
-+FloatRegister::LiveAsIndexableSet<RegTypeName::Float32>(SetType set) {
-+  return set & FloatRegisters::AllSingleMask;  // Keep only the Single-kind bits.
-+}
-+
-+template <>
-+inline FloatRegister::SetType
-+FloatRegister::LiveAsIndexableSet<RegTypeName::Float64>(SetType set) {
-+  return set & FloatRegisters::AllDoubleMask;  // Keep only the Double-kind bits.
-+}
-+
-+template <>
-+inline FloatRegister::SetType
-+FloatRegister::LiveAsIndexableSet<RegTypeName::Vector128>(SetType set) {
-+  return set & FloatRegisters::AllSimd128Mask;  // Keep only the Simd128-kind bits.
-+}
-+
-+template <>
-+inline FloatRegister::SetType
-+FloatRegister::LiveAsIndexableSet<RegTypeName::Any>(SetType set) {
-+  return set;  // Any: every kind passes through unfiltered.
-+}
-+
-+inline bool hasUnaliasedDouble() { return false; }  // Every Double has a same-numbered Single alias (FloatRegister::aliased).
-+inline bool hasMultiAlias() { return false; }  // NOTE(review): numAliased() is 2 for Single/Double — confirm false is intended.
-+
-+// PPC64 feature bits packed into the value GetPPC64Flags() returns,
-+// which feeds wasm/WasmCompile.cpp's per-architecture code signature.
-+// Defined as enum constants (not enum class) so callers can OR/AND
-+// freely. New bits should remain backward-compatible — older signatures
-+// must keep meaning the same set of features.
-+enum PPC64FeatureFlags : uint32_t {
-+  PPC64Flag_POWER9 = 1u << 0,  // The only bit defined so far.
-+  // Future: PPC64Flag_POWER10 = 1u << 1, PPC64Flag_VSX4 = 1u << 2, ...
-+};
-+
-+uint32_t GetPPC64Flags();
-+
-+class PPC64Flags final {
-+ public:
-+  PPC64Flags() = delete;  // Static-only: never instantiated.
-+
-+  // PPC64Flags::Init is called from the JitContext constructor to read the
-+  // hardware capabilities (via getauxval(AT_HWCAP2)). It must have run
-+  // before HasPOWER9()/HasPOWER10() are used; repeat calls return early.
-+  static void Init();
-+};
-+
-+bool HasPOWER9();
-+bool HasPOWER10();
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_Architecture_ppc64_h */
-diff --git a/js/src/jit/ppc64/Assembler-ppc64.cpp b/js/src/jit/ppc64/Assembler-ppc64.cpp
-new file mode 100644
-index 000000000000..481070c4c6d5
---- /dev/null
-+++ b/js/src/jit/ppc64/Assembler-ppc64.cpp
-@@ -0,0 +1,3028 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/Assembler-ppc64.h"
-+
-+#include "mozilla/DebugOnly.h"
-+#include "mozilla/Maybe.h"
-+
-+#include "gc/Marking.h"
-+#include "jit/AutoWritableJitCode.h"
-+#include "jit/ExecutableAllocator.h"
-+#include "jit/FlushICache.h"
-+
-+using mozilla::DebugOnly;
-+
-+using namespace js;
-+using namespace js::jit;
-+
-+// ELFv2 ABI: 8 GPRs (r3-r10), 13 FPRs (f1-f13).
-+// FP arguments also consume a GPR slot per ELFv2 convention.
-+ABIArg ABIArgGenerator::next(MIRType type) {
-+  switch (type) {
-+    case MIRType::Int32:
-+    case MIRType::Int64:
-+    case MIRType::Pointer:
-+    case MIRType::WasmAnyRef:
-+    case MIRType::WasmArrayData:
-+    case MIRType::StackResults: {
-+      if (intRegIndex_ >= NumIntArgRegs) {  // GPRs exhausted: pass on the stack.
-+        current_ = ABIArg(stackOffset_);
-+        stackOffset_ += sizeof(uintptr_t);
-+        break;
-+      }
-+      current_ = ABIArg(Register::FromCode(Registers::r3 + intRegIndex_));
-+      intRegIndex_++;
-+      break;
-+    }
-+    case MIRType::Float32:
-+    case MIRType::Double: {
-+      if (floatRegIndex_ == NumFloatArgRegs) {  // FPRs exhausted: pass on the stack.
-+        current_ = ABIArg(stackOffset_);
-+        stackOffset_ += sizeof(double);  // Float32 also takes a full 8-byte slot.
-+        break;
-+      }
-+      current_ = ABIArg(FloatRegister(
-+          FloatRegisters::Encoding(FloatRegisters::f1 + floatRegIndex_),
-+          type == MIRType::Double ? FloatRegisters::Double
-+                                  : FloatRegisters::Single));
-+      floatRegIndex_++;
-+      // ELFv2 ABI: each FP arg also consumes a GPR slot (shadow).
-+      // Cap at NumIntArgRegs so subsequent int args go to the stack.
-+      if (intRegIndex_ < NumIntArgRegs) {
-+        intRegIndex_++;
-+      }
-+      break;
-+    }
-+    case MIRType::Simd128: {
-+      // Pass v128 in a Simd128-kind register numbered like the FP args
-+      // (f1 + n). NOTE(review): stack slots are not 16-byte aligned here — confirm.
-+      if (floatRegIndex_ == NumFloatArgRegs) {
-+        current_ = ABIArg(stackOffset_);
-+        stackOffset_ += 16;
-+        break;
-+      }
-+      current_ = ABIArg(FloatRegister(
-+          FloatRegisters::Encoding(FloatRegisters::f1 + floatRegIndex_),
-+          FloatRegisters::Simd128));
-+      floatRegIndex_++;  // Shares the counter with scalar FP arguments.
-+      if (intRegIndex_ < NumIntArgRegs) {
-+        intRegIndex_++;
-+      }
-+      break;
-+    }
-+    default:
-+      MOZ_CRASH("Unexpected argument type");
-+  }
-+  return current_;
-+}
-+
-+// Condition inversion tables.
-+// Maps an integer condition to its logical negation; crashes on a value that
-+// is not listed.
-+Assembler::Condition Assembler::InvertCondition(Condition cond) {
-+  switch (cond) {
-+    // Equality.
-+    case Equal:
-+      return NotEqual;
-+    case NotEqual:
-+      return Equal;
-+    case Zero:
-+      return NonZero;
-+    case NonZero:
-+      return Zero;
-+
-+    // Signed ordering.
-+    case LessThan:
-+      return GreaterThanOrEqual;
-+    case GreaterThanOrEqual:
-+      return LessThan;
-+    case LessThanOrEqual:
-+      return GreaterThan;
-+    case GreaterThan:
-+      return LessThanOrEqual;
-+    case Signed:
-+      return NotSigned;
-+    case NotSigned:
-+      return Signed;
-+
-+    // Unsigned ordering.
-+    case Above:
-+      return BelowOrEqual;
-+    case BelowOrEqual:
-+      return Above;
-+    case AboveOrEqual:
-+      return Below;
-+    case Below:
-+      return AboveOrEqual;
-+
-+    // Summary-overflow, overflow and carry.
-+    case SOBit:
-+      return NSOBit;
-+    case NSOBit:
-+      return SOBit;
-+    case Overflow:
-+      return NotOverflow;
-+    case NotOverflow:
-+      return Overflow;
-+    case CarrySet:
-+      return CarryClear;
-+    case CarryClear:
-+      return CarrySet;
-+
-+    default:
-+      MOZ_CRASH("unexpected condition");
-+  }
-+}
-+
-+// Maps a floating-point condition to its logical negation. Negating an
-+// ordered comparison yields the opposite comparison "or unordered", and vice
-+// versa; crashes on a value that is not listed.
-+Assembler::DoubleCondition Assembler::InvertCondition(DoubleCondition cond) {
-+  switch (cond) {
-+    // Pure ordered/unordered tests.
-+    case DoubleOrdered:
-+      return DoubleUnordered;
-+    case DoubleUnordered:
-+      return DoubleOrdered;
-+
-+    // Equality.
-+    case DoubleEqual:
-+      return DoubleNotEqualOrUnordered;
-+    case DoubleNotEqualOrUnordered:
-+      return DoubleEqual;
-+    case DoubleNotEqual:
-+      return DoubleEqualOrUnordered;
-+    case DoubleEqualOrUnordered:
-+      return DoubleNotEqual;
-+
-+    // Greater-than family.
-+    case DoubleGreaterThan:
-+      return DoubleLessThanOrEqualOrUnordered;
-+    case DoubleLessThanOrEqualOrUnordered:
-+      return DoubleGreaterThan;
-+    case DoubleGreaterThanOrEqual:
-+      return DoubleLessThanOrUnordered;
-+    case DoubleLessThanOrUnordered:
-+      return DoubleGreaterThanOrEqual;
-+
-+    // Less-than family.
-+    case DoubleLessThan:
-+      return DoubleGreaterThanOrEqualOrUnordered;
-+    case DoubleGreaterThanOrEqualOrUnordered:
-+      return DoubleLessThan;
-+    case DoubleLessThanOrEqual:
-+      return DoubleGreaterThanOrUnordered;
-+    case DoubleGreaterThanOrUnordered:
-+      return DoubleLessThanOrEqual;
-+
-+    default:
-+      MOZ_CRASH("unexpected condition");
-+  }
-+}
-+
-+// InstImm helper.
-+// Reads the tag stored in a tw instruction: the 5-bit field at bits 16-20,
-+// which is asserted to be duplicated in bits 11-15. The lowest bit of the
-+// field is cleared in the returned value.
-+uint8_t InstImm::traptag() {
-+  MOZ_ASSERT(isOpcode(PPC_tw));
-+  const uint8_t tag = (data >> 16) & 0x1f;
-+  MOZ_ASSERT(tag == ((data >> 11) & 0x1f));
-+  return tag & 0xfe;
-+}
-+
-+// Rebuilds a branch offset from an already-encoded instruction: takes the
-+// instruction's 16-bit immediate with its two low bits masked off (0xFFFC)
-+// and widens it as a signed quantity.
-+BOffImm16::BOffImm16(InstImm inst) : data(inst.extractImm16Value() & 0xFFFC) {
-+  // Sign-extend the 16-bit field.
-+  if (data & 0x8000) {
-+    data |= ~0xFFFF;
-+  }
-+}
-+
-+// Returns the branch target: |src| advanced by the stored byte offset.
-+Instruction* BOffImm16::getDest(Instruction* src) const {
-+  uint8_t* const base = reinterpret_cast<uint8_t*>(src);
-+  return reinterpret_cast<Instruction*>(base + data);
-+}
-+
-+// Returns the jump target: |src| advanced by the stored byte offset.
-+Instruction* JOffImm26::getDest(Instruction* src) const {
-+  uint8_t* const base = reinterpret_cast<uint8_t*>(src);
-+  return reinterpret_cast<Instruction*>(base + data);
-+}
-+
-+// Default-constructs a zero immediate.
-+Imm16::Imm16() : value(0) {}
-+
-+// Default-constructs a zero immediate.
-+Imm8::Imm8() : value(0) {}
-+
-+// Buffer management.
-+// True once any of the assembler's storage (shared state, code buffer, or
-+// either relocation table) has reported an out-of-memory condition.
-+bool Assembler::oom() const {
-+  if (AssemblerShared::oom() || m_buffer.oom()) {
-+    return true;
-+  }
-+  return jumpRelocations_.oom() || dataRelocations_.oom();
-+}
-+
-+// Marks the assembler as finished and flushes any pending constant-pool
-+// entries. Asserts that it has not already been finished.
-+void Assembler::finish() {
-+  MOZ_ASSERT(!isFinished);
-+  isFinished = true;
-+  m_buffer.flushPool();
-+}
-+
-+// Appends |numBytes| of pre-assembled code to the buffer; forwards the
-+// buffer's success/failure result.
-+bool Assembler::appendRawCode(const uint8_t* code, size_t numBytes) {
-+  return m_buffer.appendRawCode(code, numBytes);
-+}
-+
-+// No-op reservation: |size| is ignored and the call only reports whether the
-+// assembler is still free of OOM.
-+bool Assembler::reserve(size_t size) {
-+  // Fixed-size chunk buffer; no point in reserving now vs. on-demand.
-+  return !oom();
-+}
-+
-+// Copies the assembled code into |bytes|, which must be empty on entry.
-+// Returns false if |bytes| cannot be resized.
-+bool Assembler::swapBuffer(wasm::Bytes& bytes) {
-+  MOZ_ASSERT(bytes.empty());
-+  // Sized with bytesNeeded(), i.e. code plus both relocation tables.
-+  if (!bytes.resize(bytesNeeded())) {
-+    return false;
-+  }
-+  m_buffer.executableCopy(bytes.begin());
-+  return true;
-+}
-+
-+// Copies the jump relocation table to |dest|; does nothing when it is empty.
-+void Assembler::copyJumpRelocationTable(uint8_t* dest) {
-+  if (!jumpRelocations_.length()) {
-+    return;
-+  }
-+  memcpy(dest, jumpRelocations_.buffer(), jumpRelocations_.length());
-+}
-+
-+// Copies the data relocation table to |dest|; does nothing when it is empty.
-+void Assembler::copyDataRelocationTable(uint8_t* dest) {
-+  if (!dataRelocations_.length()) {
-+    return;
-+  }
-+  memcpy(dest, dataRelocations_.buffer(), dataRelocations_.length());
-+}
-+
-+// Copies the finished code into |buffer|. Both overloads assert that
-+// finish() has been called and then forward to the underlying buffer.
-+void Assembler::executableCopy(void* buffer) {
-+  MOZ_ASSERT(isFinished);
-+  m_buffer.executableCopy(static_cast<uint8_t*>(buffer));
-+}
-+
-+void Assembler::executableCopy(uint8_t* buffer) {
-+  MOZ_ASSERT(isFinished);
-+  m_buffer.executableCopy(buffer);
-+}
-+
-+// Returns the size of the code buffer as reported by m_buffer.
-+// NOTE(review): despite being const, this casts away constness and flushes
-+// the constant pool, so every call can modify the buffer.
-+size_t Assembler::size() const {
-+  // AssemblerBufferWithConstantPools::size() asserts pool is empty.
-+  // Flush pending pool entries first.
-+  const_cast<PPCBufferWithExecutableCopy&>(m_buffer).flushPool();
-+  return m_buffer.size();
-+}
-+
-+// Length of the jump relocation table.
-+size_t Assembler::jumpRelocationTableBytes() const {
-+  return jumpRelocations_.length();
-+}
-+
-+// Length of the data relocation table.
-+size_t Assembler::dataRelocationTableBytes() const {
-+  return dataRelocations_.length();
-+}
-+
-+// Total for the code plus both relocation tables. Goes through size(), so it
-+// also flushes the constant pool.
-+size_t Assembler::bytesNeeded() const {
-+  return size() + jumpRelocationTableBytes() + dataRelocationTableBytes();
-+}
-+
-+// Write an instruction into the buffer or to an external destination.
-+// With a null |dest| the word is appended to the code buffer and its offset
-+// is returned; otherwise the word is stored at |dest| and a default
-+// BufferOffset is returned.
-+BufferOffset Assembler::writeInst(uint32_t x, uint32_t* dest) {
-+  MOZ_ASSERT(hasCreator());
-+  if (dest != nullptr) {
-+    WriteInstStatic(x, dest);
-+    return BufferOffset();
-+  }
-+  return m_buffer.putInt(x);
-+}
-+
-+// Stores the instruction word |x| at |dest|, which must not be null.
-+void Assembler::WriteInstStatic(uint32_t x, uint32_t* dest) {
-+  MOZ_ASSERT(dest != nullptr);
-+  *dest = x;
-+}
-+
-+// Alignment.
-+// Pads the buffer with trap instructions until it is |alignment|-aligned and
-+// returns the offset of the first one emitted (an unassigned offset when no
-+// padding was needed). |alignment| must be a power of two.
-+BufferOffset Assembler::haltingAlign(int alignment) {
-+  BufferOffset first;
-+  MOZ_ASSERT(m_buffer.isAligned(4));
-+
-+  if (alignment == 8) {
-+    // The buffer is asserted 4-aligned above, so a single instruction word
-+    // is enough to reach 8.
-+    if (!m_buffer.isAligned(alignment)) {
-+      const BufferOffset pad = xs_trap();
-+      if (!first.assigned()) {
-+        first = pad;
-+      }
-+    }
-+    return first;
-+  }
-+
-+  MOZ_ASSERT((alignment & (alignment - 1)) == 0);
-+  const int lowBits = alignment - 1;
-+  while (size() & lowBits) {
-+    const BufferOffset pad = xs_trap();
-+    if (!first.assigned()) {
-+      first = pad;
-+    }
-+  }
-+  return first;
-+}
-+
-+// Pads the buffer with nops until it is |alignment|-aligned and returns the
-+// offset of the first one emitted (an unassigned offset when no padding was
-+// needed). |alignment| must be a power of two.
-+BufferOffset Assembler::nopAlign(int alignment) {
-+  BufferOffset first;
-+  MOZ_ASSERT(m_buffer.isAligned(4));
-+
-+  if (alignment == 8) {
-+    // The buffer is asserted 4-aligned above, so a single instruction word
-+    // is enough to reach 8.
-+    if (!m_buffer.isAligned(alignment)) {
-+      const BufferOffset pad = as_nop();
-+      if (!first.assigned()) {
-+        first = pad;
-+      }
-+    }
-+    return first;
-+  }
-+
-+  MOZ_ASSERT((alignment & (alignment - 1)) == 0);
-+  const int lowBits = alignment - 1;
-+  while (size() & lowBits) {
-+    const BufferOffset pad = as_nop();
-+    if (!first.assigned()) {
-+      first = pad;
-+    }
-+  }
-+  return first;
-+}
-+
-+// Primitive instructions.
-+// Each emitter below logs its mnemonic through spew() and appends the fixed
-+// instruction word to the buffer, returning the offset it was written at.
-+BufferOffset Assembler::as_nop() {
-+  spew("nop");
-+  return writeInst(PPC_nop);
-+}
-+
-+BufferOffset Assembler::as_lwsync() {
-+  spew("lwsync");
-+  return writeInst(PPC_lwsync);
-+}
-+
-+BufferOffset Assembler::as_sync() {
-+  spew("sync");
-+  return writeInst(PPC_sync);
-+}
-+
-+BufferOffset Assembler::as_isync() {
-+  spew("isync");
-+  return writeInst(PPC_isync);
-+}
-+
-+// Branch and jump instructions.
-+// Convenience overload: encodes |off| and forwards to the int32_t overload.
-+BufferOffset Assembler::as_b(JOffImm26 off, BranchAddressType bat, LinkBit lb) {
-+  return as_b(off.encode(), bat, lb);
-+}
-+
-+// Emits an unconditional branch (b / ba / bl / bla). |off| must be
-+// word-aligned; only the bits selected by 0x3fffffc are encoded, and |bat|
-+// and |lb| are OR-ed into the instruction word as given.
-+BufferOffset Assembler::as_b(int32_t off, BranchAddressType bat, LinkBit lb) {
-+  // The link suffix comes before the absolute suffix in the mnemonic
-+  // ("bla", not "bal").
-+  spew("b%s%s\t%x", lb ? "l" : "", bat == AbsoluteBranch ? "a" : "", off);
-+  MOZ_ASSERT(!(off & 0x03));
-+  return writeInst(PPC_b | ((uint32_t)off & 0x3fffffc) | bat | lb);
-+}
-+
-+// Emits blr, or blrl when |lb| is set.
-+BufferOffset Assembler::as_blr(LinkBit lb) {
-+  spew("blr%s", lb ? "l" : "");
-+  const uint32_t word = static_cast<uint32_t>(PPC_blr) | static_cast<uint32_t>(lb);
-+  return writeInst(word);
-+}
-+
-+// Emits bctr, or bctrl when |lb| is set.
-+BufferOffset Assembler::as_bctr(LinkBit lb) {
-+  spew("bctr%s", lb ? "l" : "");
-+  const uint32_t word = static_cast<uint32_t>(PPC_bctr) | static_cast<uint32_t>(lb);
-+  return writeInst(word);
-+}
-+
-+// Conditional branches.
-+// The overloads below are thin adapters. BOffImm16 variants encode the
-+// offset and forward to the int16_t variants; those translate the condition
-+// and CR field with computeConditionCode() and forward to the raw
-+// (offset, BO|BI) emitter. Note that computeConditionCode() may itself emit
-+// instructions ahead of the branch.
-+BufferOffset Assembler::as_bc(BOffImm16 off, Condition cond, CRegisterID cr,
-+                              LikelyBit lkb, LinkBit lb) {
-+  return as_bc(off.encode(), cond, cr, lkb, lb);
-+}
-+
-+BufferOffset Assembler::as_bc(int16_t off, Condition cond, CRegisterID cr,
-+                              LikelyBit lkb, LinkBit lb) {
-+  return as_bc(off, computeConditionCode(cond, cr), lkb, lb);
-+}
-+
-+BufferOffset Assembler::as_bc(BOffImm16 off, DoubleCondition cond,
-+                              CRegisterID cr, LikelyBit lkb, LinkBit lb) {
-+  return as_bc(off.encode(), cond, cr, lkb, lb);
-+}
-+
-+BufferOffset Assembler::as_bc(int16_t off, DoubleCondition cond, CRegisterID cr,
-+                              LikelyBit lkb, LinkBit lb) {
-+  return as_bc(off, computeConditionCode(cond, cr), lkb, lb);
-+}
-+
-+// Branch-to-CTR counterparts of the adapters above.
-+BufferOffset Assembler::as_bcctr(Condition cond, CRegisterID cr, LikelyBit lkb,
-+                                 LinkBit lb) {
-+  return as_bcctr(computeConditionCode(cond, cr), lkb, lb);
-+}
-+
-+BufferOffset Assembler::as_bcctr(DoubleCondition cond, CRegisterID cr,
-+                                 LikelyBit lkb, LinkBit lb) {
-+  return as_bcctr(computeConditionCode(cond, cr), lkb, lb);
-+}
-+
-+// Condition code computation: turn DoubleCondition + CR into BO|BI.
-+// May emit CR logic instructions for synthetic conditions involving FU bit.
-+// The emitted instruction rewrites the tested CR bit in place, so the CR
-+// field no longer holds the raw compare result afterwards. The return value
-+// is the low 8 bits of the (possibly adjusted) condition plus |cr| << 6.
-+uint16_t Assembler::computeConditionCode(DoubleCondition op, CRegisterID cr) {
-+  const uint8_t condBit = crBit(cr, op);
-+  const uint8_t fuBit = crBit(cr, DoubleUnordered);
-+  uint32_t newop = (uint32_t)op & 255;
-+
-+  if (op & DoubleConditionUnordered) {
-+    // "... or unordered" conditions.
-+    if ((uint32_t(op) & BranchOptionMask) == BranchOnClear) {
-+      // cond = FU | !cond, and the branch sense is switched to branch-on-set.
-+      as_crorc(condBit, fuBit, condBit);
-+      newop |= BranchOnSet;
-+    } else {
-+      // cond = FU | cond (skipped when the tested bit already is FU).
-+      if (condBit != fuBit) {
-+        as_cror(condBit, fuBit, condBit);
-+      }
-+    }
-+  } else {
-+    // Conditions that must not be taken on an unordered result.
-+    if ((uint32_t(op) & BranchOptionMask) == BranchOnClear) {
-+      // cond = FU | cond, so branch-on-clear also requires FU to be clear.
-+      if (condBit != fuBit) {
-+        as_cror(condBit, fuBit, condBit);
-+      }
-+    } else {
-+      // cond = cond & !FU, so branch-on-set also requires FU to be clear.
-+      if (condBit != fuBit) {
-+        as_crandc(condBit, condBit, fuBit);
-+      }
-+    }
-+  }
-+
-+  return (newop + ((uint8_t)cr << 6));
-+}
-+
-+// Condition code computation: turn Condition + CR into BO|BI.
-+// For the XER-mediated conditions (Overflow / NotOverflow) this first emits
-+// code that copies the overflow state into CR field |cr|: mcrxrx when
-+// HasPOWER9() is true, otherwise an mfxer / rlwinm / mtcrf sequence that
-+// uses r0 as scratch. The result is the low 8 bits of |op| plus |cr| << 6.
-+uint16_t Assembler::computeConditionCode(Condition op, CRegisterID cr) {
-+  if (op & ConditionOnlyXER) {
-+    MOZ_ASSERT(op == Overflow || op == NotOverflow);
-+    if (HasPOWER9()) {
-+      as_mcrxrx(cr);
-+    } else {
-+      // POWER8: read XER, place OV into the GT position of the target
-+      // CR field. Overflow condition (0x1c = GreaterThan) tests GT bit,
-+      // which mcrxrx populates with OV32. For 64-bit ops OV == OV32.
-+      // XER layout in GPR low 32 bits (IBM): bit 0=SO, 1=OV, 2=CA.
-+      // Target: GT position = IBM bit 4*cr+1.
-+      const int gtPos = 4 * (int)cr + 1;    // GT position in CR field
-+      const int rotate = (1 - gtPos) & 31;  // moves OV from bit 1 to gtPos
-+      xs_mfxer(r0);
-+      as_rlwinm(r0, r0, rotate, gtPos, gtPos);  // keep only OV, at GT
-+      as_mtcrf(1 << (7 - (int)cr), r0);
-+    }
-+  }
-+
-+  const uint32_t boBi = (uint32_t)op & 255;
-+  return (boBi + ((uint8_t)cr << 6));
-+}
-+
-+// Given BO|BI in a 16-bit quantity, split into bit fields for instruction.
-+// The low four bits of |op| land at instruction bit 21 upward and the
-+// remaining bits (from bit 4 of |op|) land at instruction bit 16 upward.
-+// |op| must not use its top six bits.
-+static uint32_t makeOpMask(uint16_t op) {
-+  MOZ_ASSERT(!(op & 0xfc00));
-+  const uint32_t lowNibble = op & 0x0f;
-+  const uint32_t upperPart = op & 0xfff0;
-+  return (lowNibble << 21) | (upperPart << 12);
-+}
-+
-+// Raw conditional branch emitter. |op| is a packed BO|BI value as produced
-+// by computeConditionCode(); |off| is a word-aligned byte displacement whose
-+// low 16 bits (minus the two low bits) are encoded.
-+BufferOffset Assembler::as_bc(int16_t off, uint16_t op, LikelyBit lkb,
-+                              LinkBit lb) {
-+  spew("bc%s%s\tBO_BI=0x%04x,%d", lb ? "l" : "", lkb ? "+" : "", op, off);
-+  MOZ_ASSERT(!(off & 0x03));
-+  return writeInst(Instruction(PPC_bc | makeOpMask(op) | lkb << 21 |
-+                               ((uint16_t)off & 0xfffc) | lb)
-+                       .encode());
-+}
-+
-+// Raw conditional branch-to-CTR emitter; |op| is a packed BO|BI value as
-+// produced by computeConditionCode().
-+BufferOffset Assembler::as_bcctr(uint16_t op, LikelyBit lkb, LinkBit lb) {
-+  spew("bcctr%s%s", lb ? "l" : "", lkb ? "+" : "");
-+  return writeInst(PPC_bcctr | makeOpMask(op) | lkb << 21 | lb);
-+}
-+
-+// SPR operations.
-+// Both emitters place the GPR at bit 21 and let PPC_SPR() produce the SPR
-+// field bits.
-+BufferOffset Assembler::as_mtspr(SPRegisterID spr, Register ra) {
-+  spew("mtspr\t%d,%3s", spr, ra.name());
-+  return writeInst(PPC_mtspr | ra.code() << 21 | PPC_SPR(spr));
-+}
-+
-+BufferOffset Assembler::as_mfspr(Register rd, SPRegisterID spr) {
-+  spew("mfspr\t%3s,%d", rd.name(), spr);
-+  return writeInst(PPC_mfspr | rd.code() << 21 | PPC_SPR(spr));
-+}
-+
-+// CR operations.
-+// DEF_CRCR(op) defines Assembler::as_<op>(t, a, b) for a CR-logical
-+// instruction: |t| is the target CR bit (placed at bit 21), |a| and |b| the
-+// source CR bits (placed at bits 16 and 11). The arguments are not
-+// range-checked here.
-+#define DEF_CRCR(op)                                                 \
-+  BufferOffset Assembler::as_##op(uint8_t t, uint8_t a, uint8_t b) { \
-+    spew(#op "\t%d,%d,%d", t, a, b);                                 \
-+    return writeInst(PPC_##op | t << 21 | a << 16 | b << 11);        \
-+  }
-+DEF_CRCR(crandc)
-+DEF_CRCR(cror)
-+DEF_CRCR(crorc)
-+#undef DEF_CRCR
-+
-+// Emits mtcrf: copies the CR fields selected by |mask| from |rs|. |mask| is
-+// the 8-bit field mask placed at instruction bits 12-19 (as in as_mfocrf,
-+// bit 7-n selects CR field n).
-+BufferOffset Assembler::as_mtcrf(uint32_t mask, Register rs) {
-+  spew("mtcrf\t%d,%3s", mask, rs.name());
-+  // A wider mask would spill into the instruction bits above the field.
-+  MOZ_ASSERT(mask < 256);
-+  return writeInst(PPC_mtcrf | rs.code() << 21 | mask << 12);
-+}
-+
-+// Emits mfocrf: reads the single CR field |crfs| into |rd|.
-+BufferOffset Assembler::as_mfocrf(Register rd, CRegisterID crfs) {
-+  spew("mfocrf\t%3s,cr%d", rd.name(), crfs);
-+  // FXM is a one-hot 8-bit mask at bits 12-19. Bit (7-crfs) selects the CR.
-+  const auto fxm = 1 << (7 - crfs);
-+  return writeInst(PPC_mfocrf | rd.code() << 21 | fxm << 12);
-+}
-+
-+// Emits mcrxrx targeting CR field |cr| (field number placed at bit 23).
-+// computeConditionCode() only uses this when HasPOWER9() is true.
-+BufferOffset Assembler::as_mcrxrx(CRegisterID cr) {
-+  spew("mcrxrx\tcr%d", cr);
-+  return writeInst(PPC_mcrxrx | cr << 23);
-+}
-+
-+// GPR neg.
-+// Two-operand instruction; r0 is passed to InstReg for the unused third
-+// register slot.
-+BufferOffset Assembler::as_neg(Register rd, Register rs) {
-+  spew("neg\t%3s,%3s", rd.name(), rs.name());
-+  return writeInst(InstReg(PPC_neg, rd, rs, r0).encode());
-+}
-+
-+// Compare instructions.
-+// Variants taking an explicit CR field: |cr| goes to bit 23, |ra| to bit 16,
-+// and either |rb| to bit 11 or the low 16 bits of |im| to bits 0-15.
-+// cmpd/cmpw are the 64/32-bit signed forms, cmpld/cmplw the logical
-+// (unsigned) forms.
-+// NOTE(review): the logical-immediate forms (cmpldi, cmplwi) still take
-+// int16_t and spew it with %d, so values with the top bit set are logged as
-+// negative even though the same 16 bits are encoded.
-+BufferOffset Assembler::as_cmpd(CRegisterID cr, Register ra, Register rb) {
-+  spew("cmpd\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
-+  return writeInst(PPC_cmpd | cr << 23 | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmpdi(CRegisterID cr, Register ra, int16_t im) {
-+  spew("cmpdi\tcr%d,%3s,%d", cr, ra.name(), im);
-+  return writeInst(PPC_cmpdi | cr << 23 | ra.code() << 16 |
-+                   ((uint16_t)im & 0xffff));
-+}
-+
-+BufferOffset Assembler::as_cmpld(CRegisterID cr, Register ra, Register rb) {
-+  spew("cmpld\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
-+  return writeInst(PPC_cmpld | cr << 23 | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmpldi(CRegisterID cr, Register ra, int16_t im) {
-+  spew("cmpldi\tcr%d,%3s,%d", cr, ra.name(), im);
-+  return writeInst(PPC_cmpldi | cr << 23 | ra.code() << 16 |
-+                   ((uint16_t)im & 0xffff));
-+}
-+
-+BufferOffset Assembler::as_cmpw(CRegisterID cr, Register ra, Register rb) {
-+  spew("cmpw\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
-+  return writeInst(PPC_cmpw | cr << 23 | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmpwi(CRegisterID cr, Register ra, int16_t im) {
-+  spew("cmpwi\tcr%d,%3s,%d", cr, ra.name(), im);
-+  return writeInst(PPC_cmpwi | cr << 23 | ra.code() << 16 |
-+                   ((uint16_t)im & 0xffff));
-+}
-+
-+BufferOffset Assembler::as_cmplw(CRegisterID cr, Register ra, Register rb) {
-+  spew("cmplw\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
-+  return writeInst(PPC_cmplw | cr << 23 | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmplwi(CRegisterID cr, Register ra, int16_t im) {
-+  spew("cmplwi\tcr%d,%3s,%d", cr, ra.name(), im);
-+  return writeInst(PPC_cmplwi | cr << 23 | ra.code() << 16 |
-+                   ((uint16_t)im & 0xffff));
-+}
-+
-+// Compare instructions (cr0 implicit).
-+// Same encodings as the explicit-CR variants with the CR field bits left
-+// zero: |ra| goes to bit 16, and either |rb| to bit 11 or the low 16 bits
-+// of |im| to bits 0-15.
-+BufferOffset Assembler::as_cmpd(Register ra, Register rb) {
-+  spew("cmpd\t%3s,%3s", ra.name(), rb.name());
-+  return writeInst(PPC_cmpd | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmpdi(Register ra, int16_t im) {
-+  spew("cmpdi\t%3s,%d", ra.name(), im);
-+  return writeInst(PPC_cmpdi | ra.code() << 16 | ((uint16_t)im & 0xffff));
-+}
-+
-+BufferOffset Assembler::as_cmpld(Register ra, Register rb) {
-+  spew("cmpld\t%3s,%3s", ra.name(), rb.name());
-+  return writeInst(PPC_cmpld | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmpldi(Register ra, int16_t im) {
-+  spew("cmpldi\t%3s,%d", ra.name(), im);
-+  return writeInst(PPC_cmpldi | ra.code() << 16 | ((uint16_t)im & 0xffff));
-+}
-+
-+BufferOffset Assembler::as_cmpw(Register ra, Register rb) {
-+  spew("cmpw\t%3s,%3s", ra.name(), rb.name());
-+  return writeInst(PPC_cmpw | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmpwi(Register ra, int16_t im) {
-+  spew("cmpwi\t%3s,%d", ra.name(), im);
-+  return writeInst(PPC_cmpwi | ra.code() << 16 | ((uint16_t)im & 0xffff));
-+}
-+
-+BufferOffset Assembler::as_cmplw(Register ra, Register rb) {
-+  spew("cmplw\t%3s,%3s", ra.name(), rb.name());
-+  return writeInst(PPC_cmplw | ra.code() << 16 | rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_cmplwi(Register ra, int16_t im) {
-+  spew("cmplwi\t%3s,%d", ra.name(), im);
-+  return writeInst(PPC_cmplwi | ra.code() << 16 | ((uint16_t)im & 0xffff));
-+}
-+
-+// FP encoding helpers.
-+// A-form: four FP registers at bits 21 (frt), 16 (fra), 11 (frb) and 6
-+// (frc), with |rc| OR-ed into bit 0.
-+static uint32_t AForm(uint32_t op, FloatRegister frt, FloatRegister fra,
-+                      FloatRegister frb, FloatRegister frc, bool rc) {
-+  uint32_t word = op;
-+  word |= frt.encoding() << 21;
-+  word |= fra.encoding() << 16;
-+  word |= frb.encoding() << 11;
-+  word |= frc.encoding() << 6;
-+  word |= rc;
-+  return word;
-+}
-+
-+// X-form with three FP registers at bits 21 (frt), 16 (fra) and 11 (frb),
-+// with |rc| OR-ed into bit 0.
-+static uint32_t XForm(uint32_t op, FloatRegister frt, FloatRegister fra,
-+                      FloatRegister frb, bool rc) {
-+  uint32_t word = op;
-+  word |= frt.encoding() << 21;
-+  word |= fra.encoding() << 16;
-+  word |= frb.encoding() << 11;
-+  word |= rc;
-+  return word;
-+}
-+
-+// X-form with an FP register at bit 21 and two GPRs at bits 16 (ra) and 11
-+// (rb), with |rc| OR-ed into bit 0.
-+static uint32_t XForm(uint32_t op, FloatRegister frt, Register ra, Register rb,
-+                      bool rc) {
-+  uint32_t word = op;
-+  word |= frt.encoding() << 21;
-+  word |= ra.code() << 16;
-+  word |= rb.code() << 11;
-+  word |= rc;
-+  return word;
-+}
-+
-+// D-form: FP register at bit 21, base GPR at bit 16, and the 16-bit
-+// immediate in bits 0-15.
-+static uint32_t DForm(uint32_t op, FloatRegister frt, Register ra,
-+                      int16_t imm) {
-+  uint32_t word = op;
-+  word |= frt.encoding() << 21;
-+  word |= ra.code() << 16;
-+  word |= (uint16_t)imm & 0xffff;
-+  return word;
-+}
-+
-+// XX-form encoders. Each form has its own X-bit positions.
-+// All take uint32_t encodings (0-63) so they correctly
-+// emit the high bit for VSR32-63. FloatRegister.encoding() returns 0-31
-+// for Single/Double (= VSR0-31 = FPR namespace) and 32-63 for Simd128
-+// (= VSR32-63 = VR namespace) — so a single XX-form encoder addresses
-+// the full VSR space.
-+
-+// XX1-form: T + GPR (RA) + GPR (RB). TX bit at instruction bit 0.
-+// Used by lxvx, stxvx, lxvd2x, stxvd2x, mtvsrdd, mtvsrd, mtvsrws, mtvsrwz.
-+// |xt| is a 6-bit VSR number: its low five bits fill the T field and its
-+// sixth bit becomes TX.
-+static uint32_t XX1Form(uint32_t op, uint32_t xt, uint32_t ra, uint32_t rb) {
-+  const uint32_t tField = (xt & 31) << 21;
-+  const uint32_t aField = (ra & 31) << 16;
-+  const uint32_t bField = (rb & 31) << 11;
-+  const uint32_t txBit = (xt >> 5) & 1;
-+  return op | tField | aField | bField | txBit;
-+}
-+
-+// XX1-form for mfvsrX: GPR (RT) + VSR (XS). The extension bit sits at
-+// instruction bit 0 like TX, but is named SX here because the VSR is the
-+// source operand. Used by mfvsrd, mfvsrld.
-+// The VSR's low five bits go to bit 21 and the GPR goes to bit 16.
-+static uint32_t XX1FormMfvsr(uint32_t op, uint32_t rt, uint32_t xs) {
-+  const uint32_t sField = (xs & 31) << 21;
-+  const uint32_t gprField = (rt & 31) << 16;
-+  const uint32_t sxBit = (xs >> 5) & 1;
-+  return op | sField | gprField | sxBit;
-+}
-+
-+// XX2-form: T + B (no A field; bits 16-20 unused or hold a UIM). BX bit
-+// at instruction bit 1, TX bit at instruction bit 0. The bits16-20 slot
-+// is set by callers: for plain XX2 it must be 0, for XX2 with UIM it
-+// holds the immediate (only its low five bits are kept).
-+// Used by xxbrd, xxbrh, xxbrw, xxbrq, xscvdpsp, xscvspdp, xscvdpspn,
-+// xscvspdpn, xxspltw (UIM=2 bits), xxinsertw (UIM=4 bits),
-+// xxextractuw (UIM=4 bits), xvabs*/xvneg*/xvsqrt*/xvr* etc. via
-+// DEF_VSX_UN.
-+static uint32_t XX2Form(uint32_t op, uint32_t xt, uint32_t xb,
-+                        uint32_t bits16to20 = 0) {
-+  const uint32_t tField = (xt & 31) << 21;
-+  const uint32_t midField = (bits16to20 & 31) << 16;
-+  const uint32_t bField = (xb & 31) << 11;
-+  const uint32_t bxBit = ((xb >> 5) & 1) << 1;
-+  const uint32_t txBit = (xt >> 5) & 1;
-+  return op | tField | midField | bField | bxBit | txBit;
-+}
-+
-+// XX3-form: T + A + B. AX/BX/TX bits at instruction bits 2/1/0.
-+// Used by xxlor, xxland, xxlxor, xxlnor, xxlandc, xxpermdi, xsmaxjdp,
-+// xsminjdp, xvadd*, xvcmp*, etc.
-+// Each operand is a 6-bit VSR number split into a 5-bit field plus one
-+// extension bit.
-+static uint32_t XX3Form(uint32_t op, uint32_t xt, uint32_t xa, uint32_t xb) {
-+  const uint32_t fields =
-+      (xt & 31) << 21 | (xa & 31) << 16 | (xb & 31) << 11;
-+  const uint32_t axBit = ((xa >> 5) & 1) << 2;
-+  const uint32_t bxBit = ((xb >> 5) & 1) << 1;
-+  const uint32_t txBit = (xt >> 5) & 1;
-+  return op | fields | axBit | bxBit | txBit;
-+}
-+
-+// XX4-form: T + A + B + C. CX/AX/BX/TX bits at instruction bits 3/2/1/0.
-+// Used by xxsel.
-+// Each operand is a 6-bit VSR number split into a 5-bit field plus one
-+// extension bit; the C field sits at bit 6.
-+static uint32_t XX4Form(uint32_t op, uint32_t xt, uint32_t xa, uint32_t xb,
-+                        uint32_t xc) {
-+  const uint32_t fields =
-+      (xt & 31) << 21 | (xa & 31) << 16 | (xb & 31) << 11 | (xc & 31) << 6;
-+  const uint32_t cxBit = ((xc >> 5) & 1) << 3;
-+  const uint32_t axBit = ((xa >> 5) & 1) << 2;
-+  const uint32_t bxBit = ((xb >> 5) & 1) << 1;
-+  const uint32_t txBit = (xt >> 5) & 1;
-+  return op | fields | cxBit | axBit | bxBit | txBit;
-+}
-+
-+// FloatRegister convenience overload for XX3Form (the most common form).
-+// Passes each register's encoding() to the uint32_t overload, which splits
-+// it into the 5-bit field and the extension bit.
-+static uint32_t XX3Form(uint32_t op, FloatRegister xt, FloatRegister xa,
-+                        FloatRegister xb) {
-+  return XX3Form(op, uint32_t(xt.encoding()), uint32_t(xa.encoding()),
-+                 uint32_t(xb.encoding()));
-+}
-+
-+// --- Macro-defined instruction emitters ---
-+
-+// X-form: rd in bits 21-25, ra in 16-20, rb in 11-15.
-+// DEF_XFORM(op) defines Assembler::as_<op>(rd, ra, rb), passing the
-+// registers to InstReg in that order.
-+#define DEF_XFORM(op)                                                      \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra, Register rb) { \
-+    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());            \
-+    return writeInst(InstReg(PPC_##op, rd, ra, rb).encode());              \
-+  }
-+
-+// Record form of DEF_XFORM: defines as_<op>_rc, which spews "<op>." and sets
-+// bit 0 of the encoded word.
-+#define DEF_XFORM_RC(op)                                            \
-+  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra,    \
-+                                       Register rb) {               \
-+    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());    \
-+    return writeInst(InstReg(PPC_##op, rd, ra, rb).encode() | 0x1); \
-+  }
-+
-+// X-form with swapped RS/RA encoding: rs in bits 21-25, ra in 16-20.
-+// The C++ signature keeps destination-first order (rd, ra, rb); the macro
-+// swaps the first two registers when handing them to InstReg.
-+#define DEF_XFORMS(op)                                                     \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra, Register rb) { \
-+    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());            \
-+    return writeInst(InstReg(PPC_##op, ra, rd, rb).encode());              \
-+  }
-+
-+// Record form of DEF_XFORMS: defines as_<op>_rc, which spews "<op>." and
-+// sets bit 0 of the encoded word.
-+#define DEF_XFORMS_RC(op)                                           \
-+  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra,    \
-+                                       Register rb) {               \
-+    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());    \
-+    return writeInst(InstReg(PPC_##op, ra, rd, rb).encode() | 0x1); \
-+  }
-+
-+// X-form shift immediate with swapped encoding.
-+// Defines as_<op>(rd, ra, sh): the source |ra| goes to bit 21, the
-+// destination |rd| to bit 16 and the shift amount (asserted < 32) to bit 11.
-+#define DEF_XFORMS_I(op)                                                       \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra, uint8_t sh) {      \
-+    spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), sh);                        \
-+    MOZ_ASSERT(sh < 32);                                                       \
-+    return writeInst(PPC_##op | ra.code() << 21 | rd.code() << 16 | sh << 11); \
-+  }
-+
-+// 2-reg X-form: rd in bits 21-25, ra in 16-20, rb=r0.
-+// Defines as_<op>(rd, ra); r0 fills the unused third register slot.
-+#define DEF_XFORM2(op)                                        \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra) { \
-+    spew(#op "\t%3s,%3s", rd.name(), ra.name());              \
-+    return writeInst(InstReg(PPC_##op, rd, ra, r0).encode()); \
-+  }
-+
-+// Record form of DEF_XFORM2: defines as_<op>_rc, which spews "<op>." and
-+// sets bit 0 of the encoded word.
-+#define DEF_XFORM2_RC(op)                                           \
-+  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra) {  \
-+    spew(#op ".\t%3s,%3s", rd.name(), ra.name());                   \
-+    return writeInst(InstReg(PPC_##op, rd, ra, r0).encode() | 0x1); \
-+  }
-+
-+// 2-reg X-form swapped: ra in bits 21-25, rd in 16-20.
-+// Defines as_<op>(rd, ra) with destination-first arguments; the registers
-+// are swapped when handed to InstReg, and r0 fills the third slot.
-+#define DEF_XFORM2S(op)                                       \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra) { \
-+    spew(#op "\t%3s,%3s", rd.name(), ra.name());              \
-+    return writeInst(InstReg(PPC_##op, ra, rd, r0).encode()); \
-+  }
-+
-+// Record form of DEF_XFORM2S: defines as_<op>_rc, which spews "<op>." and
-+// sets bit 0 of the encoded word.
-+#define DEF_XFORM2S_RC(op)                                          \
-+  BufferOffset Assembler::as_##op##_rc(Register rd, Register ra) {  \
-+    spew(#op ".\t%3s,%3s", rd.name(), ra.name());                   \
-+    return writeInst(InstReg(PPC_##op, ra, rd, r0).encode() | 0x1); \
-+  }
-+
-+// D-form load/store: rd=RT, rb=RA (base register), off=displacement.
-+// r0 cannot be used as base register for D-form loads/stores.
-+// Defines as_<op>(rd, rb, off); the base-register restriction is enforced
-+// with an assertion.
-+#define DEF_DFORM(op)                                                      \
-+  BufferOffset Assembler::as_##op(Register rd, Register rb, int16_t off) { \
-+    spew(#op "\t%3s,%d(%3s)", rd.name(), off, rb.name());                  \
-+    MOZ_ASSERT(rb != r0);                                                  \
-+    return writeInst(InstImm(PPC_##op, rd, rb, off).encode());             \
-+  }
-+
-+// D-form with swapped RS/RA encoding for logical immediates.
-+// Defines as_<op>(rd, ra, im) with destination-first arguments and an
-+// unsigned 16-bit immediate; the registers are swapped when handed to
-+// InstImm.
-+#define DEF_DFORMS(op)                                                     \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra, uint16_t im) { \
-+    spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), im);                    \
-+    return writeInst(InstImm(PPC_##op, ra, rd, im).encode());              \
-+  }
-+
-+// M-form: rotate with 3 registers + mb + me.
-+// Defines as_<op>(rd, rs, rb, mb, me): |rs| goes to bit 21, |rd| to bit 16,
-+// |rb| to bit 11, |mb| to bit 6 and |me| to bit 1. Both mask bounds are
-+// asserted to be below 32.
-+#define DEF_MFORM(op)                                                         \
-+  BufferOffset Assembler::as_##op(Register rd, Register rs, Register rb,      \
-+                                  uint8_t mb, uint8_t me) {                   \
-+    spew(#op "\t%3s,%3s,%3s,%d,%d", rd.name(), rs.name(), rb.name(), mb, me); \
-+    MOZ_ASSERT(mb < 32);                                                      \
-+    MOZ_ASSERT(me < 32);                                                      \
-+    return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 |           \
-+                     rb.code() << 11 | mb << 6 | me << 1);                    \
-+  }
-+
-+// M-form with immediate shift.
-+// Defines as_<op>(rd, rs, sh, mb, me): |rs| goes to bit 21, |rd| to bit 16,
-+// |sh| to bit 11, |mb| to bit 6 and |me| to bit 1. All three immediates are
-+// asserted to be below 32.
-+#define DEF_MFORM_I(op)                                                        \
-+  BufferOffset Assembler::as_##op(Register rd, Register rs, uint8_t sh,        \
-+                                  uint8_t mb, uint8_t me) {                    \
-+    spew(#op "\t%3s,%3s,%d,%d,%d", rd.name(), rs.name(), sh, mb, me);          \
-+    MOZ_ASSERT(sh < 32);                                                       \
-+    MOZ_ASSERT(mb < 32);                                                       \
-+    MOZ_ASSERT(me < 32);                                                       \
-+    return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | sh << 11 | \
-+                     mb << 6 | me << 1);                                       \
-+  }
-+
-+// Record form of DEF_MFORM_I: defines as_<op>_rc, which spews "<op>." and
-+// sets bit 0 of the instruction word.
-+#define DEF_MFORM_I_RC(op)                                                     \
-+  BufferOffset Assembler::as_##op##_rc(Register rd, Register rs, uint8_t sh,   \
-+                                       uint8_t mb, uint8_t me) {               \
-+    spew(#op ".\t%3s,%3s,%d,%d,%d", rd.name(), rs.name(), sh, mb, me);         \
-+    MOZ_ASSERT(sh < 32);                                                       \
-+    MOZ_ASSERT(mb < 32);                                                       \
-+    MOZ_ASSERT(me < 32);                                                       \
-+    return writeInst(PPC_##op | rs.code() << 21 | rd.code() << 16 | sh << 11 | \
-+                     mb << 6 | me << 1 | 1);                                   \
-+  }
-+
-+// MDS-form: rotate with register + mb (64-bit).
-+// Defines as_<op>(ra, rs, rb, mb). |mb| is a 6-bit value (asserted < 64)
-+// stored split: its low five bits at bit 6 and its sixth bit kept in place
-+// at instruction bit 5.
-+#define DEF_MDSFORM(op)                                                   \
-+  BufferOffset Assembler::as_##op(Register ra, Register rs, Register rb,  \
-+                                  uint8_t mb) {                           \
-+    spew(#op "\t%3s,%3s,%3s,%d", ra.name(), rs.name(), rb.name(), mb);    \
-+    MOZ_ASSERT(mb < 64);                                                  \
-+    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |       \
-+                     rb.code() << 11 | ((mb & 0x1f) << 6) | (mb & 0x20)); \
-+  }
-+
-+// Record form of DEF_MDSFORM: defines as_<op>_rc, which spews "<op>." and
-+// sets bit 0 of the instruction word.
-+#define DEF_MDSFORM_RC(op)                                                    \
-+  BufferOffset Assembler::as_##op##_rc(Register ra, Register rs, Register rb, \
-+                                       uint8_t mb) {                          \
-+    spew(#op ".\t%3s,%3s,%3s,%d", ra.name(), rs.name(), rb.name(), mb);       \
-+    MOZ_ASSERT(mb < 64);                                                      \
-+    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |           \
-+                     rb.code() << 11 | ((mb & 0x1f) << 6) | (mb & 0x20) | 1); \
-+  }
-+
-+// MD-form: rotate/shift with immediate sh + mb (64-bit).
-+// sh and mb are 6-bit fields split across the instruction word.
-+// Defines as_<op>(ra, rs, sh, mb), both immediates asserted < 64. The low
-+// five bits of |sh| go to bit 11 and its sixth bit to instruction bit 1;
-+// the low five bits of |mb| go to bit 6 and its sixth bit stays at bit 5.
-+#define DEF_MDFORM(op)                                                        \
-+  BufferOffset Assembler::as_##op(Register ra, Register rs, uint8_t sh,       \
-+                                  uint8_t mb) {                               \
-+    spew(#op "\t%3s,%3s,%d,%d", ra.name(), rs.name(), sh, mb);                \
-+    MOZ_ASSERT(sh < 64);                                                      \
-+    MOZ_ASSERT(mb < 64);                                                      \
-+    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |           \
-+                     ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (mb & 0x20) | \
-+                     ((sh & 0x20) >> 4));                                     \
-+  }
-+
-+// Record form of DEF_MDFORM: defines as_<op>_rc, which spews "<op>." and
-+// sets bit 0 of the instruction word.
-+#define DEF_MDFORM_RC(op)                                                     \
-+  BufferOffset Assembler::as_##op##_rc(Register ra, Register rs, uint8_t sh,  \
-+                                       uint8_t mb) {                          \
-+    spew(#op ".\t%3s,%3s,%d,%d", ra.name(), rs.name(), sh, mb);               \
-+    MOZ_ASSERT(sh < 64);                                                      \
-+    MOZ_ASSERT(mb < 64);                                                      \
-+    return writeInst(PPC_##op | rs.code() << 21 | ra.code() << 16 |           \
-+                     ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (mb & 0x20) | \
-+                     ((sh & 0x20) >> 4) | 0x01);                              \
-+  }
-+
-+// FP 2-reg X-form: frt in bits 21-25, fra=f0, frb in 11-15.
-+// Defines as_<op>(rd, ra). Note that the C++ source operand |ra| is passed
-+// to XForm as frb, with f0 filling the fra slot.
-+#define DEF_XFORM2_F(op)                                                \
-+  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra) { \
-+    spew(#op "\t%3s,%3s", rd.name(), ra.name());                        \
-+    return writeInst(XForm(PPC_##op, rd, f0, ra, false));               \
-+  }
-+
-+// Record form of DEF_XFORM2_F: defines as_<op>_rc, which spews "<op>." and
-+// passes rc=true to XForm.
-+#define DEF_XFORM2_F_RC(op)                                                  \
-+  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra) { \
-+    spew(#op ".\t%3s,%3s", rd.name(), ra.name());                            \
-+    return writeInst(XForm(PPC_##op, rd, f0, ra, true));                     \
-+  }
-+
-+// FP A-form with frc (fmul-type): frt, fra, frc; frb=f0.
-+#define DEF_AFORM_C(op)                                               \
-+  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
-+                                  FloatRegister rc) {                 \
-+    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rc.name());       \
-+    return writeInst(AForm(PPC_##op, rd, ra, f0, rc, false));         \
-+  }
-+
-+#define DEF_AFORM_C_RC(op)                                                 \
-+  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
-+                                       FloatRegister rc) {                 \
-+    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rc.name());           \
-+    return writeInst(AForm(PPC_##op, rd, ra, f0, rc, true));               \
-+  }
-+
-+// FP A-form with frb (fadd-type): frt, fra, frb; frc=f0.
-+#define DEF_AFORM_B(op)                                               \
-+  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra, \
-+                                  FloatRegister rb) {                 \
-+    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());       \
-+    return writeInst(AForm(PPC_##op, rd, ra, rb, f0, false));         \
-+  }
-+
-+#define DEF_AFORM_B_RC(op)                                                 \
-+  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra, \
-+                                       FloatRegister rb) {                 \
-+    spew(#op ".\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());           \
-+    return writeInst(AForm(PPC_##op, rd, ra, rb, f0, true));               \
-+  }
-+
-+// Full FP A-form: frt, fra, frc, frb (fmadd-type).
-+#define DEF_AFORM(op)                                                          \
-+  BufferOffset Assembler::as_##op(FloatRegister rd, FloatRegister ra,          \
-+                                  FloatRegister rc, FloatRegister rb) {        \
-+    spew(#op "\t%3s,%3s,%3s,%3s", rd.name(), ra.name(), rc.name(), rb.name()); \
-+    return writeInst(AForm(PPC_##op, rd, ra, rb, rc, false));                  \
-+  }
-+
-+#define DEF_AFORM_RC(op)                                                     \
-+  BufferOffset Assembler::as_##op##_rc(FloatRegister rd, FloatRegister ra,   \
-+                                       FloatRegister rc, FloatRegister rb) { \
-+    spew(#op ".\t%3s,%3s,%3s,%3s", rd.name(), ra.name(), rc.name(),          \
-+         rb.name());                                                         \
-+    return writeInst(AForm(PPC_##op, rd, ra, rb, rc, true));                 \
-+  }
-+
-+// FP D-form load/store.
-+#define DEF_DFORM_F(op)                                          \
-+  BufferOffset Assembler::as_##op(FloatRegister rd, Register rb, \
-+                                  int16_t off) {                 \
-+    spew(#op "\t%3s,%d(%3s)", rd.name(), off, rb.name());        \
-+    MOZ_ASSERT(rb != r0);                                        \
-+    return writeInst(DForm(PPC_##op, rd, rb, off));              \
-+  }
-+
-+// FP X-form indexed load/store.
-+#define DEF_FMEMx(op)                                            \
-+  BufferOffset Assembler::as_##op(FloatRegister rd, Register ra, \
-+                                  Register rb) {                 \
-+    spew(#op "\t%3s,%3s,%3s", rd.name(), ra.name(), rb.name());  \
-+    return writeInst(XForm(PPC_##op, rd, ra, rb, false));        \
-+  }
-+
-+// --- Rotate/shift instructions ---
-+
-+DEF_MFORM(rlwnm)
-+DEF_MFORM_I(rlwinm)
-+DEF_MFORM_I_RC(rlwinm)
-+DEF_MFORM_I(rlwimi)
-+DEF_XFORMS_I(srawi)
-+
-+DEF_MDSFORM(rldcl)
-+DEF_MDFORM(rldicl)
-+DEF_MDFORM_RC(rldicl)
-+DEF_MDFORM(rldicr)
-+DEF_MDFORM_RC(rldicr)
-+DEF_MDFORM(rldimi)
-+
-+BufferOffset Assembler::as_sradi(Register rd, Register rs, int sh) {
-+  spew("sradi\t%3s,%3s,%d", rd.name(), rs.name(), sh);
-+  MOZ_ASSERT(sh >= 0 && sh < 64);
-+  return writeInst(PPC_sradi | rd.code() << 16 | rs.code() << 21 |
-+                   (sh & 0x1f) << 11 | (sh & 0x20) >> 4);
-+}
-+
-+// --- ALU three-register ---
-+
-+#define DEF_ALU2(op) DEF_XFORM(op)
-+
-+DEF_ALU2(add)
-+DEF_ALU2(addc)
-+DEF_ALU2(adde)
-+DEF_ALU2(subf)
-+DEF_ALU2(subfc)
-+DEF_ALU2(subfe)
-+DEF_ALU2(divd)
-+DEF_ALU2(divdu)
-+DEF_ALU2(divw)
-+DEF_ALU2(divwu)
-+// POWER9 modulo (XO-form, same encoding pattern as div).
-+DEF_XFORM(modsd)
-+DEF_XFORM(modsw)
-+DEF_XFORM(modud)
-+DEF_XFORM(moduw)
-+DEF_ALU2(mulld)
-+DEF_ALU2(mulhd)
-+DEF_ALU2(mulhdu)
-+DEF_ALU2(mulldo)
-+DEF_ALU2(mullw)
-+DEF_ALU2(mulhwu)
-+#undef DEF_ALU2
-+
-+// --- ALU immediate ---
-+
-+// D-form ALU-immediate ops have no Rc bit at instruction LSB (that bit
-+// is part of the 16-bit immediate). The only valid record-form variant
-+// in this group is `addic.`, which is a separate primary opcode (13)
-+// hand-written below; subfic and mulli have no record form at all.
-+#define DEF_ALUI(op)                                                      \
-+  BufferOffset Assembler::as_##op(Register rd, Register ra, int16_t im) { \
-+    spew(#op "\t%3s,%3s,%d", rd.name(), ra.name(), im);                   \
-+    return writeInst(InstImm(PPC_##op, rd, ra, im).encode());             \
-+  }
-+
-+BufferOffset Assembler::as_addi(Register rd, Register ra, int16_t im,
-+                                bool actually_li) {
-+#ifdef DEBUG
-+  if (actually_li) {
-+    spew("li\t%3s,%d", rd.name(), im);
-+  } else {
-+    MOZ_ASSERT(ra != r0);
-+    spew("addi\t%3s,%3s,%d", rd.name(), ra.name(), im);
-+  }
-+#endif
-+  return writeInst(InstImm(PPC_addi, rd, ra, im).encode());
-+}
-+
-+BufferOffset Assembler::as_addis(Register rd, Register ra, int16_t im,
-+                                 bool actually_lis) {
-+#ifdef DEBUG
-+  if (actually_lis) {
-+    spew("lis\t%3s,%d", rd.name(), im);
-+  } else {
-+    MOZ_ASSERT(ra != r0);
-+    spew("addis\t%3s,%3s,%d", rd.name(), ra.name(), im);
-+  }
-+#endif
-+  return writeInst(InstImm(PPC_addis, rd, ra, im).encode());
-+}
-+
-+DEF_ALUI(mulli)
-+DEF_ALUI(subfic)
-+#undef DEF_ALUI
-+
-+// --- ALU unary/extended ---
-+
-+
-+#define DEF_ALUE_S(op) DEF_XFORM2S(op)
-+DEF_ALUE_S(cntlzw)
-+DEF_ALUE_S(cntlzd)
-+DEF_ALUE_S(cnttzd)
-+DEF_ALUE_S(cnttzw)
-+#undef DEF_ALUE_S
-+
-+DEF_XFORM2S(popcntd)
-+DEF_XFORM2S(popcntw)
-+DEF_XFORM2S(brd)  // POWER10
-+DEF_XFORM2S(brh)  // POWER10
-+DEF_XFORM2S(brw)  // POWER10
-+
-+// --- Bitwise logical (three-register) ---
-+
-+#define DEF_BITALU2(op) DEF_XFORMS(op)
-+DEF_BITALU2(nor)
-+DEF_BITALU2(slw)
-+DEF_BITALU2(srw)
-+DEF_BITALU2(sraw)
-+DEF_BITALU2(sld)
-+DEF_BITALU2(srd)
-+DEF_BITALU2(srad)
-+#undef DEF_BITALU2
-+
-+// and_, or_, xor_ are manually defined (trailing underscore to avoid C++
-+// keyword conflicts). xs_mr delegates to as_or_ so we must not assert
-+// rd==rs==rb in as_or_ (which would be a valid mr).
-+BufferOffset Assembler::as_or_(Register rd, Register rs, Register rb) {
-+  spew("or\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
-+  return writeInst(InstReg(PPC_or_, rs, rd, rb).encode());
-+}
-+
-+BufferOffset Assembler::as_xor_(Register rd, Register rs, Register rb) {
-+  spew("xor\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
-+  return writeInst(InstReg(PPC_xor_, rs, rd, rb).encode());
-+}
-+
-+BufferOffset Assembler::as_and_(Register rd, Register rs, Register rb) {
-+  spew("and\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
-+  return writeInst(InstReg(PPC_and_, rs, rd, rb).encode());
-+}
-+
-+BufferOffset Assembler::as_and__rc(Register rd, Register rs, Register rb) {
-+  spew("and.\t%3s,%3s,%3s", rd.name(), rs.name(), rb.name());
-+  return writeInst(InstReg(PPC_and_, rs, rd, rb).encode() | 0x1);
-+}
-+
-+// --- Bitwise logical (immediate) ---
-+
-+DEF_DFORMS(ori)
-+DEF_DFORMS(oris)
-+DEF_DFORMS(xori)
-+DEF_DFORMS(xoris)
-+
-+BufferOffset Assembler::as_andi_rc(Register rd, Register ra, uint16_t im) {
-+  spew("andi.\t%3s,%3s,%d", rd.name(), ra.name(), im);
-+  return writeInst(InstImm(PPC_andi_dot, ra, rd, im).encode());
-+}
-+
-+// --- Sign extension ---
-+
-+#define DEF_ALUEXT(op) DEF_XFORM2S(op) DEF_XFORM2S_RC(op)
-+DEF_XFORM2S(extsb)
-+DEF_XFORM2S(extsh)
-+DEF_ALUEXT(extsw)
-+#undef DEF_ALUEXT
-+
-+// --- Integer loads (D-form) ---
-+
-+DEF_DFORM(lbz)
-+DEF_DFORM(lha)
-+DEF_DFORM(lhz)
-+
-+BufferOffset Assembler::as_lwa(Register rd, Register rb, int16_t off) {
-+  spew("lwa\t%3s,%d(%3s)", rd.name(), off, rb.name());
-+  MOZ_ASSERT(rb != r0);
-+  MOZ_ASSERT(!(off & 0x03));
-+  return writeInst(InstImm(PPC_lwa, rd, rb, off).encode());
-+}
-+
-+DEF_DFORM(lwz)
-+
-+BufferOffset Assembler::as_ld(Register rd, Register rb, int16_t off) {
-+  spew("ld\t%3s,%d(%3s)", rd.name(), off, rb.name());
-+  MOZ_ASSERT(rb != r0);
-+  MOZ_ASSERT(!(off & 0x03));
-+  return writeInst(InstImm(PPC_ld, rd, rb, off).encode());
-+}
-+
-+// --- Integer stores (D-form) ---
-+
-+DEF_DFORM(stb)
-+DEF_DFORM(sth)
-+DEF_DFORM(stw)
-+
-+BufferOffset Assembler::as_std(Register rd, Register rb, int16_t off) {
-+  spew("std\t%3s,%d(%3s)", rd.name(), off, rb.name());
-+  MOZ_ASSERT(rb != r0);
-+  MOZ_ASSERT(!(off & 0x03));
-+  return writeInst(InstImm(PPC_std, rd, rb, off).encode());
-+}
-+
-+DEF_DFORM(stdu)
-+
-+#undef DEF_DFORM
-+#undef DEF_DFORMS
-+
-+// --- Integer loads/stores (X-form, indexed) ---
-+
-+#define DEF_MEMx(op) DEF_XFORM(op)
-+DEF_MEMx(lbzx) DEF_MEMx(lhax) DEF_MEMx(lhzx) DEF_MEMx(lwax)
-+    DEF_MEMx(lwzx) DEF_MEMx(lwarx) DEF_MEMx(lbarx)
-+        DEF_MEMx(lharx) DEF_MEMx(ldx) DEF_MEMx(ldarx) DEF_MEMx(stbx)
-+            DEF_MEMx(stbcx) DEF_MEMx(stwx) DEF_MEMx(stwbrx) DEF_MEMx(sthx)
-+                DEF_MEMx(sthcx) DEF_MEMx(stdx) DEF_MEMx(stdcx)
-+                    DEF_MEMx(stwcx)
-+#undef DEF_MEMx
-+
-+// --- Integer select ---
-+
-+BufferOffset Assembler::as_isel(Register rt, Register ra, Register rb,
-+                                uint16_t bc, CRegisterID cr) {
-+  MOZ_ASSERT(ra != r0);
-+  return as_isel0(rt, ra, rb, bc, cr);
-+}
-+
-+BufferOffset Assembler::as_isel0(Register rt, Register ra, Register rb,
-+                                 uint16_t bc, CRegisterID cr) {
-+  spew("isel\t%3s,%3s,%3s,cr%d:0x%02x", rt.name(), ra.name(), rb.name(), cr,
-+       bc);
-+  MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
-+  uint16_t nbc = (bc >> 4) + (cr << 2);
-+  return writeInst(PPC_isel | rt.code() << 21 | ra.code() << 16 |
-+                   rb.code() << 11 | nbc << 6);
-+}
-+
-+BufferOffset Assembler::as_setbc(Register rt, uint16_t bc, CRegisterID cr) {
-+  spew("setbc\t%3s,cr%d:0x%02x", rt.name(), cr, bc);
-+  MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
-+  uint16_t nbc = (bc >> 4) + (cr << 2);
-+  return writeInst(PPC_setbc | (rt.code() << 21) | (nbc << 16));
-+}
-+
-+BufferOffset Assembler::as_setbcr(Register rt, uint16_t bc, CRegisterID cr) {
-+  spew("setbcr\t%3s,cr%d:0x%02x", rt.name(), cr, bc);
-+  MOZ_ASSERT((bc < 0x40) && ((bc & 0x0f) == 0x0c));
-+  uint16_t nbc = (bc >> 4) + (cr << 2);
-+  return writeInst(PPC_setbcr | (rt.code() << 21) | (nbc << 16));
-+}
-+
-+// --- FP compare ---
-+
-+BufferOffset Assembler::as_fcmpu(CRegisterID cr, FloatRegister ra,
-+                                 FloatRegister rb) {
-+  spew("fcmpu\tcr%d,%3s,%3s", cr, ra.name(), rb.name());
-+  return writeInst(PPC_fcmpu | cr << 23 | ra.encoding() << 16 |
-+                   rb.encoding() << 11);
-+}
-+
-+BufferOffset Assembler::as_fcmpu(FloatRegister ra, FloatRegister rb) {
-+  return as_fcmpu(cr0, ra, rb);
-+}
-+
-+// --- FP arithmetic ---
-+
-+#define DEF_FPUAC(op) DEF_AFORM_C(op)
-+DEF_FPUAC(fmul)
-+DEF_FPUAC(fmuls)
-+#undef DEF_FPUAC
-+
-+#define DEF_FPUAB(op) DEF_AFORM_B(op)
-+DEF_FPUAB(fadd)
-+DEF_FPUAB(fdiv)
-+DEF_FPUAB(fsub)
-+DEF_FPUAB(fadds)
-+DEF_FPUAB(fdivs)
-+DEF_FPUAB(fsubs)
-+DEF_FPUAB(fcpsgn)
-+#undef DEF_FPUAB
-+
-+// --- FP unary/conversion/rounding ---
-+
-+#define DEF_FPUDS(op) DEF_XFORM2_F(op)
-+DEF_FPUDS(fabs)
-+DEF_FPUDS(fneg)
-+DEF_FPUDS(fmr)
-+DEF_FPUDS(fcfid)
-+DEF_FPUDS(fcfids)
-+DEF_FPUDS(fcfidu)
-+DEF_FPUDS(fcfidus)
-+DEF_FPUDS(fctid)
-+DEF_FPUDS(fctidz)
-+DEF_FPUDS(fctiduz)
-+DEF_FPUDS(fctiwz)
-+DEF_FPUDS(frim)
-+DEF_FPUDS(frip)
-+DEF_FPUDS(friz)
-+DEF_FPUDS(frsp)
-+DEF_FPUDS(fsqrt)
-+DEF_FPUDS(fsqrts)
-+#undef DEF_FPUDS
-+
-+// --- FP loads/stores (D-form) ---
-+
-+DEF_DFORM_F(lfd)
-+DEF_DFORM_F(lfs)
-+DEF_DFORM_F(stfd)
-+DEF_DFORM_F(stfs)
-+DEF_DFORM_F(stfdu)
-+DEF_DFORM_F(stfsu)
-+
-+// --- FP loads/stores (X-form, indexed) ---
-+
-+DEF_FMEMx(lfdx) DEF_FMEMx(lfsx) DEF_FMEMx(lfiwax)
-+    DEF_FMEMx(stfdx) DEF_FMEMx(stfsx)
-+// Clean up macros.
-+#undef DEF_XFORM
-+#undef DEF_XFORM_RC
-+#undef DEF_XFORMS
-+#undef DEF_XFORMS_RC
-+#undef DEF_XFORMS_I
-+#undef DEF_XFORM2
-+#undef DEF_XFORM2_RC
-+#undef DEF_XFORM2S
-+#undef DEF_XFORM2S_RC
-+#undef DEF_XFORM2_F
-+#undef DEF_XFORM2_F_RC
-+#undef DEF_MFORM
-+#undef DEF_MFORM_I
-+#undef DEF_MFORM_I_RC
-+#undef DEF_MDSFORM
-+#undef DEF_MDSFORM_RC
-+#undef DEF_MDFORM
-+#undef DEF_MDFORM_RC
-+#undef DEF_DFORM_F
-+#undef DEF_FMEMx
-+#undef DEF_AFORM_C
-+#undef DEF_AFORM_C_RC
-+#undef DEF_AFORM_B
-+#undef DEF_AFORM_B_RC
-+#undef DEF_AFORM
-+#undef DEF_AFORM_RC
-+
-+    // --- FPSCR operations ---
-+
-+    BufferOffset Assembler::as_mtfsb0(uint8_t bt) {
-+  spew("mtfsb0\t%d", bt);
-+  return writeInst(PPC_mtfsb0 | (uint32_t)bt << 21);
-+}
-+
-+BufferOffset Assembler::as_mcrfs(CRegisterID bf, uint8_t bfa) {
-+  spew("mcrfs\tcr%d,%d", bf, bfa);
-+  return writeInst(PPC_mcrfs | (uint32_t)bf << 23 | (uint32_t)bfa << 18);
-+}
-+
-+// --- VSX (FPR-only subset) ---
-+
-+BufferOffset Assembler::as_mfvsrd(Register ra, FloatRegister xs) {
-+  spew("mfvsrd\t%3s,%3s", ra.name(), xs.name());
-+  return writeInst(XX1FormMfvsr(PPC_mfvsrd, ra.code(), xs.encoding()));
-+}
-+
-+BufferOffset Assembler::as_mtvsrd(FloatRegister xt, Register ra) {
-+  spew("mtvsrd\t%3s,%3s", xt.name(), ra.name());
-+  return writeInst(XX1Form(PPC_mtvsrd, xt.encoding(), ra.code(), 0));
-+}
-+
-+BufferOffset Assembler::as_mtvsrwa(FloatRegister xt, Register ra) {
-+  spew("mtvsrwa\t%3s,%3s", xt.name(), ra.name());
-+  return writeInst(XX1Form(PPC_mtvsrwa, xt.encoding(), ra.code(), 0));
-+}
-+
-+BufferOffset Assembler::as_mtvsrws(FloatRegister xt, Register ra) {
-+  spew("mtvsrws\t%3s,%3s", xt.name(), ra.name());
-+  return writeInst(XX1Form(PPC_mtvsrws, xt.encoding(), ra.code(), 0));
-+}
-+
-+BufferOffset Assembler::as_mtvsrwz(FloatRegister xt, Register ra) {
-+  spew("mtvsrwz\t%3s,%3s", xt.name(), ra.name());
-+  return writeInst(XX1Form(PPC_mtvsrwz, xt.encoding(), ra.code(), 0));
-+}
-+
-+BufferOffset Assembler::as_xxbrd(FloatRegister xt, FloatRegister xb) {
-+  spew("xxbrd\t%3s,%3s", xt.name(), xb.name());
-+  return writeInst(XX2Form(PPC_xxbrd, xt.encoding(), xb.encoding()));
-+}
-+
-+BufferOffset Assembler::as_xscvdpspn(FloatRegister xt, FloatRegister xb) {
-+  spew("xscvdpspn\t%3s,%3s", xt.name(), xb.name());
-+  return writeInst(XX2Form(PPC_xscvdpspn, xt.encoding(), xb.encoding()));
-+}
-+
-+BufferOffset Assembler::as_xscvspdpn(FloatRegister xt, FloatRegister xb) {
-+  spew("xscvspdpn\t%3s,%3s", xt.name(), xb.name());
-+  return writeInst(XX2Form(PPC_xscvspdpn, xt.encoding(), xb.encoding()));
-+}
-+
-+// POWER9 (ISA 3.0) scalar FP16 conversions. The UIM disambiguator is
-+// already in PPC_xscvdphp / PPC_xscvhpdp; XX2Form's bits16to20 default
-+// of 0 leaves it intact.
-+BufferOffset Assembler::as_xscvdphp(FloatRegister xt, FloatRegister xb) {
-+  spew("xscvdphp\t%3s,%3s", xt.name(), xb.name());
-+  return writeInst(XX2Form(PPC_xscvdphp, xt.encoding(), xb.encoding()));
-+}
-+
-+BufferOffset Assembler::as_xscvhpdp(FloatRegister xt, FloatRegister xb) {
-+  spew("xscvhpdp\t%3s,%3s", xt.name(), xb.name());
-+  return writeInst(XX2Form(PPC_xscvhpdp, xt.encoding(), xb.encoding()));
-+}
-+
-+BufferOffset Assembler::as_xsxexpdp(FloatRegister xt, FloatRegister xb) {
-+  spew("xsxexpdp\t%3s,%3s", xt.name(), xb.name());
-+  return writeInst(XX2Form(PPC_xsxexpdp, xt.encoding(), xb.encoding()));
-+}
-+
-+// POWER9 (ISA 3.0) FP16 load/store, X-form indexed. lxsihzx loads
-+// 16 bits into VSR dword 0 word 1's low halfword (zeroing the rest);
-+// stxsihx stores from there. The XT[5]/XS[5] bit travels via the
-+// X-form's TX/SX bit at instruction bit 0.
-+BufferOffset Assembler::as_lxsihzx(FloatRegister xt, Register ra, Register rb) {
-+  spew("lxsihzx\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
-+  return writeInst(PPC_lxsihzx | (xt.encoding() & 31) << 21 |
-+                   ra.code() << 16 | rb.code() << 11 |
-+                   ((xt.encoding() >> 5) & 1));
-+}
-+
-+BufferOffset Assembler::as_stxsihx(FloatRegister xs, Register ra, Register rb) {
-+  spew("stxsihx\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
-+  return writeInst(PPC_stxsihx | (xs.encoding() & 31) << 21 |
-+                   ra.code() << 16 | rb.code() << 11 |
-+                   ((xs.encoding() >> 5) & 1));
-+}
-+
-+// XX3-form, FPR-space only (encoding 0..31 → VSR0..31, all AX/BX/TX = 0).
-+// Java/JavaScript-style scalar max/min — semantics verified to match
-+// ECMA-262 Math.max/Math.min including ±0 and NaN propagation. POWER9-only.
-+BufferOffset Assembler::as_xsmaxjdp(FloatRegister xt, FloatRegister xa,
-+                                    FloatRegister xb) {
-+  spew("xsmaxjdp\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xsmaxjdp, xt, xa, xb));
-+}
-+
-+BufferOffset Assembler::as_xsminjdp(FloatRegister xt, FloatRegister xa,
-+                                    FloatRegister xb) {
-+  spew("xsminjdp\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xsminjdp, xt, xa, xb));
-+}
-+
-+// --- VSX SIMD load/store ---
-+
-+// For VSX0-31 (FPR), the 6th register bit (TX/SX/BX) is 0.
-+// X-form: opcode | T << 21 | A << 16 | B << 11 | xo | TX
-+// lxvx/stxvx are POWER9 (ISA 3.0). lxvd2x/stxvd2x are POWER8 (ISA 2.07).
-+
-+BufferOffset Assembler::as_lxvx(FloatRegister xt, Register ra, Register rb) {
-+  spew("lxvx\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
-+  return writeInst(XX1Form(PPC_lxvx, xt.encoding(), ra.code(), rb.code()));
-+}
-+
-+BufferOffset Assembler::as_stxvx(FloatRegister xs, Register ra, Register rb) {
-+  spew("stxvx\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
-+  return writeInst(XX1Form(PPC_stxvx, xs.encoding(), ra.code(), rb.code()));
-+}
-+
-+BufferOffset Assembler::as_lxvd2x(FloatRegister xt, Register ra, Register rb) {
-+  spew("lxvd2x\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
-+  return writeInst(XX1Form(PPC_lxvd2x, xt.encoding(), ra.code(), rb.code()));
-+}
-+
-+BufferOffset Assembler::as_stxvd2x(FloatRegister xs, Register ra, Register rb) {
-+  spew("stxvd2x\t%3s,%3s,%3s", xs.name(), ra.name(), rb.name());
-+  return writeInst(XX1Form(PPC_stxvd2x, xs.encoding(), ra.code(), rb.code()));
-+}
-+
-+// VMX register load/store. See PPC_lvx/PPC_stvx in Assembler-ppc64.h for
-+// the encoding rationale.
-+BufferOffset Assembler::as_lvx(uint8_t vrt, Register ra, Register rb) {
-+  MOZ_ASSERT(vrt < 32);
-+  spew("lvx\tvr%d,%3s,%3s", vrt, ra.name(), rb.name());
-+  return writeInst(PPC_lvx | uint32_t(vrt) << 21 | ra.code() << 16 |
-+                   rb.code() << 11);
-+}
-+
-+BufferOffset Assembler::as_stvx(uint8_t vrs, Register ra, Register rb) {
-+  MOZ_ASSERT(vrs < 32);
-+  spew("stvx\tvr%d,%3s,%3s", vrs, ra.name(), rb.name());
-+  return writeInst(PPC_stvx | uint32_t(vrs) << 21 | ra.code() << 16 |
-+                   rb.code() << 11);
-+}
-+
-+// --- VSX SIMD register operations ---
-+
-+// XX3-form: opcode | T[0:4]<<21 | A[0:4]<<16 | B[0:4]<<11 | xo | AX | BX | TX
-+// where AX/BX/TX (bits 2/1/0) carry bit 5 of each 6-bit VSR index.
-+// Encoded by the XX3Form helper above for both VSR0-31 (Single/Double) and
-+// VSR32-63 (Simd128) operands.
-+BufferOffset Assembler::as_xxlor(FloatRegister xt, FloatRegister xa,
-+                                 FloatRegister xb) {
-+  spew("xxlor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xxlor, xt, xa, xb));
-+}
-+
-+BufferOffset Assembler::as_xxland(FloatRegister xt, FloatRegister xa,
-+                                  FloatRegister xb) {
-+  spew("xxland\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xxland, xt, xa, xb));
-+}
-+
-+BufferOffset Assembler::as_xxlxor(FloatRegister xt, FloatRegister xa,
-+                                  FloatRegister xb) {
-+  spew("xxlxor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xxlxor, xt, xa, xb));
-+}
-+
-+BufferOffset Assembler::as_xxlnor(FloatRegister xt, FloatRegister xa,
-+                                  FloatRegister xb) {
-+  spew("xxlnor\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xxlnor, xt, xa, xb));
-+}
-+
-+BufferOffset Assembler::as_xxlandc(FloatRegister xt, FloatRegister xa,
-+                                   FloatRegister xb) {
-+  spew("xxlandc\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());
-+  return writeInst(XX3Form(PPC_xxlandc, xt, xa, xb));
-+}
-+
-+BufferOffset Assembler::as_xxsel(FloatRegister xt, FloatRegister xa,
-+                                 FloatRegister xb, FloatRegister xc) {
-+  spew("xxsel\t%3s,%3s,%3s,%3s", xt.name(), xa.name(), xb.name(), xc.name());
-+  return writeInst(XX4Form(PPC_xxsel, xt.encoding(), xa.encoding(),
-+                           xb.encoding(), xc.encoding()));
-+}
-+
-+BufferOffset Assembler::as_xxpermdi(FloatRegister xt, FloatRegister xa,
-+                                    FloatRegister xb, uint8_t dm) {
-+  MOZ_ASSERT(dm < 4);
-+  spew("xxpermdi\t%3s,%3s,%3s,%d", xt.name(), xa.name(), xb.name(), dm);
-+  return writeInst(XX3Form(PPC_xxpermdi | (uint32_t(dm) << 8), xt, xa, xb));
-+}
-+
-+// POWER9 (ISA 3.0). XX1-form with two GPR sources.
-+BufferOffset Assembler::as_mtvsrdd(FloatRegister xt, Register ra, Register rb) {
-+  spew("mtvsrdd\t%3s,%3s,%3s", xt.name(), ra.name(), rb.name());
-+  return writeInst(XX1Form(PPC_mtvsrdd, xt.encoding(), ra.code(), rb.code()));
-+}
-+
-+// POWER9 (ISA 3.0). XX1-form: move lower doubleword of VSR to GPR.
-+BufferOffset Assembler::as_mfvsrld(Register rt, FloatRegister xs) {
-+  spew("mfvsrld\t%3s,%3s", rt.name(), xs.name());
-+  return writeInst(XX1FormMfvsr(PPC_mfvsrld, rt.code(), xs.encoding()));
-+}
-+
-+// --- XX2-form VSX instructions ---
-+
-+// XX2-form: opcode | T<<21 | UIM<<16_area | B<<11_area | XO<<2 | BX | TX
-+// For VSR0-31, BX=TX=0.
-+
-+BufferOffset Assembler::as_xxspltw(FloatRegister xt, FloatRegister xb,
-+                                   uint8_t uim) {
-+  MOZ_ASSERT(uim < 4);
-+  spew("xxspltw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
-+  return writeInst(XX2Form(PPC_xxspltw, xt.encoding(), xb.encoding(), uim));
-+}
-+
-+BufferOffset Assembler::as_xxinsertw(FloatRegister xt, FloatRegister xb,
-+                                     uint8_t uim) {
-+  MOZ_ASSERT(uim <= 12 && (uim & 3) == 0);
-+  spew("xxinsertw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
-+  return writeInst(XX2Form(PPC_xxinsertw, xt.encoding(), xb.encoding(), uim));
-+}
-+
-+BufferOffset Assembler::as_xxextractuw(FloatRegister xt, FloatRegister xb,
-+                                       uint8_t uim) {
-+  MOZ_ASSERT(uim <= 12 && (uim & 3) == 0);
-+  spew("xxextractuw\t%3s,%3s,%d", xt.name(), xb.name(), uim);
-+  return writeInst(XX2Form(PPC_xxextractuw, xt.encoding(), xb.encoding(), uim));
-+}
-+
-+// POWER9 (ISA 3.0). XX1-form-ish: T(5) + UIM8(8) + XO + TX. UIM8 occupies
-+// bits 18..11 (a non-standard slot that XX1Form doesn't fit), so encode
-+// inline. TX bit at instruction bit 0 selects the upper half of VSR
-+// space when xt.encoding() is in 32-63 (Simd128).
-+BufferOffset Assembler::as_xxspltib(FloatRegister xt, uint8_t imm8) {
-+  spew("xxspltib\t%3s,%u", xt.name(), imm8);
-+  uint32_t enc = uint32_t(xt.encoding());
-+  return writeInst(PPC_xxspltib | (enc & 31) << 21 | (uint32_t)imm8 << 11 |
-+                   ((enc >> 5) & 1));
-+}
-+
-+// --- VMX instructions ---
-+
-+// VX-form: (4<<26) | VRT<<21 | UIMM<<16 | VRB<<11 | XO
-+// VRT/VRB are 5-bit raw VR numbers (0-31). Simd128 FloatRegister.encoding()
-+// returns 32-63; masking with & 31 maps it back to the VR offset 0-31.
-+BufferOffset Assembler::as_vspltb(FloatRegister vrt, FloatRegister vrb,
-+                                  uint8_t uim) {
-+  MOZ_ASSERT(uim < 16);
-+  spew("vspltb\t%3s,%3s,%d", vrt.name(), vrb.name(), uim);
-+  return writeInst(PPC_vspltb | (vrt.encoding() & 31) << 21 |
-+                   (uint32_t)uim << 16 | (vrb.encoding() & 31) << 11);
-+}
-+
-+BufferOffset Assembler::as_vsplth(FloatRegister vrt, FloatRegister vrb,
-+                                  uint8_t uim) {
-+  MOZ_ASSERT(uim < 8);
-+  spew("vsplth\t%3s,%3s,%d", vrt.name(), vrb.name(), uim);
-+  return writeInst(PPC_vsplth | (vrt.encoding() & 31) << 21 |
-+                   (uint32_t)uim << 16 | (vrb.encoding() & 31) << 11);
-+}
-+
-+// VA-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | SHB<<6 | XO(6-bit)
-+BufferOffset Assembler::as_vsldoi(FloatRegister vrt, FloatRegister vra,
-+                                  FloatRegister vrb, uint8_t shb) {
-+  MOZ_ASSERT(shb < 16);
-+  spew("vsldoi\t%3s,%3s,%3s,%d", vrt.name(), vra.name(), vrb.name(), shb);
-+  return writeInst(PPC_vsldoi | (vrt.encoding() & 31) << 21 |
-+                   (vra.encoding() & 31) << 16 | (vrb.encoding() & 31) << 11 |
-+                   (uint32_t)shb << 6);
-+}
-+
-+// --- VMX integer arithmetic (VR registers only) ---
-+
-+// VX-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | XO
-+// The macro takes raw VR numbers (0-31).
-+#define DEF_VMX_VVV(op)                                                    \
-+  BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vra, uint8_t vrb) { \
-+    MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32);                          \
-+    spew(#op "\tvr%d,vr%d,vr%d", vrt, vra, vrb);                           \
-+    return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11);        \
-+  }
-+
-+DEF_VMX_VVV(vaddubm)
-+DEF_VMX_VVV(vadduhm)
-+DEF_VMX_VVV(vadduwm)
-+DEF_VMX_VVV(vaddudm)
-+DEF_VMX_VVV(vsububm)
-+DEF_VMX_VVV(vsubuhm)
-+DEF_VMX_VVV(vsubuwm)
-+DEF_VMX_VVV(vsubudm)
-+DEF_VMX_VVV(vaddsbs)
-+DEF_VMX_VVV(vaddshs)
-+DEF_VMX_VVV(vaddubs)
-+DEF_VMX_VVV(vadduhs)
-+DEF_VMX_VVV(vsubsbs)
-+DEF_VMX_VVV(vsubshs)
-+DEF_VMX_VVV(vsububs)
-+DEF_VMX_VVV(vsubuhs)
-+DEF_VMX_VVV(vminsb)
-+DEF_VMX_VVV(vminsh)
-+DEF_VMX_VVV(vminsw)
-+DEF_VMX_VVV(vmaxsb)
-+DEF_VMX_VVV(vmaxsh)
-+DEF_VMX_VVV(vmaxsw)
-+DEF_VMX_VVV(vmaxsd)
-+DEF_VMX_VVV(vminub)
-+DEF_VMX_VVV(vminuh)
-+DEF_VMX_VVV(vminuw)
-+DEF_VMX_VVV(vmaxub)
-+DEF_VMX_VVV(vmaxuh)
-+DEF_VMX_VVV(vmaxuw)
-+DEF_VMX_VVV(vavgub)
-+DEF_VMX_VVV(vavguh)
-+DEF_VMX_VVV(vmuluwm)
-+DEF_VMX_VVV(vmulld)
-+
-+DEF_VMX_VVV(vslb)
-+DEF_VMX_VVV(vslh)
-+DEF_VMX_VVV(vslw)
-+DEF_VMX_VVV(vsld)
-+DEF_VMX_VVV(vsrb)
-+DEF_VMX_VVV(vsrh)
-+DEF_VMX_VVV(vsrw)
-+DEF_VMX_VVV(vsrd)
-+DEF_VMX_VVV(vsrab)
-+DEF_VMX_VVV(vsrah)
-+DEF_VMX_VVV(vsraw)
-+DEF_VMX_VVV(vsrad)
-+DEF_VMX_VVV(vslo)
-+DEF_VMX_VVV(vsro)
-+DEF_VMX_VVV(vcmpequb)
-+DEF_VMX_VVV(vcmpequh)
-+DEF_VMX_VVV(vcmpequw)
-+DEF_VMX_VVV(vcmpequd)
-+DEF_VMX_VVV(vcmpgtsb)
-+DEF_VMX_VVV(vcmpgtsh)
-+DEF_VMX_VVV(vcmpgtsw)
-+DEF_VMX_VVV(vcmpgtsd)
-+DEF_VMX_VVV(vcmpgtub)
-+DEF_VMX_VVV(vcmpgtuh)
-+DEF_VMX_VVV(vcmpgtuw)
-+DEF_VMX_VVV(vcmpgtud)
-+// POWER9 (ISA 3.0). NotEqual compare; saves the xxlnor that vcmpequX needs.
-+DEF_VMX_VVV(vcmpneb)
-+DEF_VMX_VVV(vcmpneh)
-+DEF_VMX_VVV(vcmpnew)
-+
-+// POWER8+ (ISA 2.07). vbpermq RT,RA,RB: bit-permute quadword.
-+DEF_VMX_VVV(vbpermq)
-+
-+#undef DEF_VMX_VVV
-+
-+// VC-form record forms: same as VX-form above with Rc bit (bit 10 LSB) set.
-+// vcmpXXX. sets CR6: LT = all-true, EQ = none-true.
-+#define DEF_VMX_VVV_RC(op)                                                  \
-+  BufferOffset Assembler::as_##op##_rc(uint8_t vrt, uint8_t vra,            \
-+                                       uint8_t vrb) {                       \
-+    MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32);                           \
-+    spew(#op ".\tvr%d,vr%d,vr%d", vrt, vra, vrb);                           \
-+    return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11 | 0x400); \
-+  }
-+
-+DEF_VMX_VVV_RC(vcmpequb)
-+DEF_VMX_VVV_RC(vcmpequh)
-+DEF_VMX_VVV_RC(vcmpequw)
-+DEF_VMX_VVV_RC(vcmpequd)
-+
-+#undef DEF_VMX_VVV_RC
-+
-+// VSX float compare (XX3-form).
-+#define DEF_VSX_CMP(op)                                               \
-+  BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xa, \
-+                                  FloatRegister xb) {                 \
-+    spew(#op "\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());       \
-+    return writeInst(XX3Form(PPC_##op, xt, xa, xb));                  \
-+  }
-+
-+DEF_VSX_CMP(xvcmpeqsp)
-+DEF_VSX_CMP(xvcmpgtsp)
-+DEF_VSX_CMP(xvcmpgesp)
-+DEF_VSX_CMP(xvcmpeqdp)
-+DEF_VSX_CMP(xvcmpgtdp)
-+DEF_VSX_CMP(xvcmpgedp)
-+
-+#undef DEF_VSX_CMP
-+
-+// VSX float arithmetic (XX3-form binary).
-+#define DEF_VSX_BIN(op)                                               \
-+  BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xa, \
-+                                  FloatRegister xb) {                 \
-+    spew(#op "\t%3s,%3s,%3s", xt.name(), xa.name(), xb.name());       \
-+    return writeInst(XX3Form(PPC_##op, xt, xa, xb));                  \
-+  }
-+DEF_VSX_BIN(xvaddsp)
-+DEF_VSX_BIN(xvadddp) DEF_VSX_BIN(xvsubsp) DEF_VSX_BIN(xvsubdp) DEF_VSX_BIN(
-+    xvmulsp) DEF_VSX_BIN(xvmuldp) DEF_VSX_BIN(xvdivsp) DEF_VSX_BIN(xvdivdp)
-+    DEF_VSX_BIN(xvminsp) DEF_VSX_BIN(xvmindp) DEF_VSX_BIN(xvmaxsp) DEF_VSX_BIN(
-+        xvmaxdp) DEF_VSX_BIN(xvmaddasp) DEF_VSX_BIN(xvmaddadp)
-+        DEF_VSX_BIN(xvnmsubasp) DEF_VSX_BIN(xvnmsubadp)
-+#undef DEF_VSX_BIN
-+
-+// VSX unary (XX2-form): op | xt<<21 | xb<<11 | XO<<2
-+// XX2-form unary VSX op: T + B, no UIM. Uses XX2Form helper for TX/BX bits.
-+#define DEF_VSX_UN(op)                                                  \
-+  BufferOffset Assembler::as_##op(FloatRegister xt, FloatRegister xb) { \
-+    spew(#op "\t%3s,%3s", xt.name(), xb.name());                        \
-+    return writeInst(XX2Form(PPC_##op, xt.encoding(), xb.encoding()));  \
-+  }
-+            DEF_VSX_UN(xvabssp) DEF_VSX_UN(xvabsdp) DEF_VSX_UN(
-+                xvnegsp) DEF_VSX_UN(xvnegdp) DEF_VSX_UN(xvsqrtsp)
-+                DEF_VSX_UN(xvsqrtdp) DEF_VSX_UN(xvrspip) DEF_VSX_UN(
-+                    xvrdpip) DEF_VSX_UN(xvrspim) DEF_VSX_UN(xvrdpim)
-+                    DEF_VSX_UN(xvrspiz) DEF_VSX_UN(xvrdpiz) DEF_VSX_UN(
-+                        xvrspic) DEF_VSX_UN(xvrdpic) DEF_VSX_UN(xvcvsxwsp)
-+                        DEF_VSX_UN(xvcvuxwsp) DEF_VSX_UN(xvcvsxwdp) DEF_VSX_UN(
-+                            xvcvuxwdp) DEF_VSX_UN(xvcvspsxws)
-+                            DEF_VSX_UN(xvcvspuxws) DEF_VSX_UN(xvcvdpsxws)
-+                                DEF_VSX_UN(xvcvdpuxws) DEF_VSX_UN(xvcvdpsp)
-+                                    DEF_VSX_UN(xvcvspdp)
-+#undef DEF_VSX_UN
-+
-+// VMX unary VX-form: (4<<26) | VRT<<21 | 0<<16 | VRB<<11 | XO
-+#define DEF_VMX_UNARY(op)                                     \
-+  BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vrb) { \
-+    MOZ_ASSERT(vrt < 32 && vrb < 32);                         \
-+    spew(#op "\tvr%d,vr%d", vrt, vrb);                        \
-+    return writeInst(PPC_##op | vrt << 21 | vrb << 11);       \
-+  }
-+                                        DEF_VMX_UNARY(vupkhsb) DEF_VMX_UNARY(
-+                                            vupklsb) DEF_VMX_UNARY(vupkhsh)
-+                                            DEF_VMX_UNARY(vupklsh)
-+                                                DEF_VMX_UNARY(vupkhsw)
-+                                                    DEF_VMX_UNARY(vupklsw)
-+    // POWER9 per-lane integer negate. The VRA field holds the subop code
-+    // (6 for vnegw, 7 for vnegd) which is already baked into PPC_vneg{w,d}.
-+    DEF_VMX_UNARY(vnegw) DEF_VMX_UNARY(vnegd) DEF_VMX_UNARY(vpopcntb)
-+#undef DEF_VMX_UNARY
-+
-+    // POWER9 addpcis (DX-form). Computes rT = (CIA + 4) + (D << 16).
-+    // D is a 16-bit signed immediate, split across three instruction fields:
-+    //   d0 = bits 16..25 (10 bits, D[15:6])
-+    //   d1 = bits 11..15 (5 bits,  D[5:1])
-+    //   d2 = bit 31      (1 bit,   D[0])
-+    // Primary opcode 19, DX subop 2.
-+    BufferOffset Assembler::as_addpcis(Register rt, int16_t d) {
-+  spew("addpcis\t%s,%d", rt.name(), (int)d);
-+  uint32_t D = uint16_t(d);
-+  uint32_t inst = (19u << 26) | (uint32_t(rt.code()) << 21) |
-+                  ((D >> 1) & 0x1F) << 16 | ((D >> 6) & 0x3FF) << 6 |
-+                  (2u << 1) | (D & 1u);
-+  return writeInst(inst);
-+}
-+
-+// -----------------------------------------------------------------------------
-+// Power ISA v3.1 (POWER10) prefixed instructions.
-+//
-+// Layout:
-+//
-+//   Prefix word (BE bit numbering from the manual; LE bits in parentheses):
-+//     [0..5]   primary opcode = 1   (LE 31..26)
-+//     [6..7]   Type: 00 = 8LS, 10 = MLS   (LE 25..24)
-+//     [8..10]  reserved = 0   (LE 23..21)
-+//     [11]     R: 1 = PC-relative (RA must be r0)   (LE 20)
-+//     [12..13] reserved = 0   (LE 19..18)
-+//     [14..31] d0: high 18 bits of 34-bit signed immediate   (LE 17..0)
-+//
-+//   Suffix (paddi/pld, GPR target):
-+//     [0..5]   suffix opcode (paddi=14, pld=57)   (LE 31..26)
-+//     [6..10]  RT   (LE 25..21)
-+//     [11..15] RA   (LE 20..16)
-+//     [16..31] d1: low 16 bits of immediate   (LE 15..0)
-+//
-+//   Suffix (plxv, VSR target — has the TX bit at suffix bit 5/LE bit 26):
-+//     [0..4]   plxv 5-bit opcode = 11001 (=25)   (LE 31..27)
-+//     [5]      TX (high bit of 6-bit XT)   (LE 26)
-+//     [6..10]  T  (low 5 bits of XT)   (LE 25..21)
-+//     [11..15] RA   (LE 20..16)
-+//     [16..31] d1   (LE 15..0)
-+//
-+// The prefix and suffix of a prefixed instruction must lie in the same
-+// 64-byte aligned block at **runtime**. The JitCode allocator only
-+// guarantees 16-byte alignment, so the buffer-relative offset and the
-+// runtime address can differ by 0/16/32/48 mod 64. A buffer-only check
-+// `(currentOffset() & 63) == 60` is correct when the allocator base is
-+// 64-aligned but misses three of the four 16-aligned base classes — pad
-+// whenever `(currentOffset() & 15) == 12`, which catches all four. The
-+// enterNoPool guard prevents the constant-pool flusher from inserting
-+// bodies between the (optional) nop, prefix, and suffix.
-+
-+static uint32_t EncodePower10Prefix(uint32_t type, bool R, uint32_t d0) {
-+  MOZ_ASSERT(type == 0 || type == 2);  // 8LS=0, MLS=2
-+  MOZ_ASSERT(d0 < (1u << 18));
-+  return (1u << 26) | (type << 24) | (uint32_t(R ? 1 : 0) << 20) |
-+         (d0 & 0x3FFFFu);
-+}
-+
-+static void SplitImm34(int64_t imm34, uint32_t* d0, uint32_t* d1) {
-+  MOZ_ASSERT(imm34 >= -(int64_t(1) << 33));
-+  MOZ_ASSERT(imm34 < (int64_t(1) << 33));
-+  uint64_t u = uint64_t(imm34) & 0x3FFFFFFFFull;  // low 34 bits
-+  *d0 = uint32_t(u >> 16) & 0x3FFFFu;             // 18 bits
-+  *d1 = uint32_t(u) & 0xFFFFu;                    // 16 bits
-+}
-+
-+void Assembler::ensurePrefixedAlignment() {
-+  if ((currentOffset() & 15) == 12) {
-+    as_nop();
-+  }
-+}
-+
-+// paddi RT, RA, SI, R   (MLS, suffix opcode 14 = addi)
-+//   R=0: RT = (RA==0 ? 0 : RA) + sign_extend(SI, 34)
-+//   R=1: RT = CIA(prefix) + sign_extend(SI, 34)   (RA must be r0)
-+BufferOffset Assembler::as_paddi(Register rt, Register ra, int64_t imm34,
-+                                  bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("paddi\t%s,%s,%lld,%d", rt.name(), ra.name(), (long long)imm34,
-+       R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
-+  uint32_t suffix = (14u << 26) | (uint32_t(rt.code()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  // Reservation = nop (worst case) + prefix + suffix.
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// pld RT, D(RA), R   (8LS, suffix opcode 57)
-+BufferOffset Assembler::as_pld(Register rt, Register ra, int64_t imm34,
-+                                bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("pld\t%s,%lld(%s),%d", rt.name(), (long long)imm34, ra.name(),
-+       R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
-+  uint32_t suffix = (57u << 26) | (uint32_t(rt.code()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// plxv XT, D(RA), R   (8LS, 5-bit suffix opcode 25, TX in suffix bit 26)
-+//   XT is 6-bit: TX (high) || T (low 5) — matches lxvx convention.
-+BufferOffset Assembler::as_plxv(uint8_t xt, Register ra, int64_t imm34,
-+                                 bool R) {
-+  MOZ_ASSERT(xt < 64);
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("plxv\tvs%u,%lld(%s),%d", xt, (long long)imm34, ra.name(),
-+       R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
-+  uint32_t T = xt & 0x1Fu;
-+  uint32_t TX = (xt >> 5) & 1u;
-+  uint32_t suffix = (25u << 27) | (TX << 26) | (T << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// plfd FRT, D(RA), R   (MLS, suffix opcode 50; D-form-like FPR load)
-+BufferOffset Assembler::as_plfd(FloatRegister frt, Register ra, int64_t imm34,
-+                                 bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("plfd\tf%u,%lld(%s),%d", uint32_t(frt.encoding()),
-+       (long long)imm34, ra.name(), R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
-+  uint32_t suffix = (50u << 26) | (uint32_t(frt.encoding()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// plfs FRT, D(RA), R   (MLS, suffix opcode 48; widens single → double in FPR)
-+BufferOffset Assembler::as_plfs(FloatRegister frt, Register ra, int64_t imm34,
-+                                 bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("plfs\tf%u,%lld(%s),%d", uint32_t(frt.encoding()),
-+       (long long)imm34, ra.name(), R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
-+  uint32_t suffix = (48u << 26) | (uint32_t(frt.encoding()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// pstd RS, D(RA), R   (8LS, suffix opcode 61 = std D-form)
-+BufferOffset Assembler::as_pstd(Register rs, Register ra, int64_t imm34,
-+                                 bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("pstd\t%s,%lld(%s),%d", rs.name(), (long long)imm34, ra.name(),
-+       R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
-+  uint32_t suffix = (61u << 26) | (uint32_t(rs.code()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// pstxv XS, D(RA), R   (8LS, 5-bit suffix opcode 27, SX in suffix bit 26)
-+//   XS is 6-bit: SX (high) || S (low 5) — matches stxvx convention.
-+BufferOffset Assembler::as_pstxv(uint8_t xs, Register ra, int64_t imm34,
-+                                  bool R) {
-+  MOZ_ASSERT(xs < 64);
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("pstxv\tvs%u,%lld(%s),%d", xs, (long long)imm34, ra.name(),
-+       R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=8LS*/ 0, R, d0);
-+  uint32_t sx = (xs >> 5) & 1;
-+  uint32_t s = xs & 0x1F;
-+  uint32_t suffix = (27u << 27) | (sx << 26) | (s << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// pstfd FRS, D(RA), R   (MLS, suffix opcode 54 = stfd)
-+BufferOffset Assembler::as_pstfd(FloatRegister frs, Register ra, int64_t imm34,
-+                                  bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("pstfd\tf%u,%lld(%s),%d", uint32_t(frs.encoding()),
-+       (long long)imm34, ra.name(), R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
-+  uint32_t suffix = (54u << 26) | (uint32_t(frs.encoding()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// pstfs FRS, D(RA), R   (MLS, suffix opcode 52 = stfs)
-+BufferOffset Assembler::as_pstfs(FloatRegister frs, Register ra, int64_t imm34,
-+                                  bool R) {
-+  MOZ_ASSERT_IF(R, ra == r0);
-+  spew("pstfs\tf%u,%lld(%s),%d", uint32_t(frs.encoding()),
-+       (long long)imm34, ra.name(), R ? 1 : 0);
-+  uint32_t d0, d1;
-+  SplitImm34(imm34, &d0, &d1);
-+  uint32_t prefix = EncodePower10Prefix(/*type=MLS*/ 2, R, d0);
-+  uint32_t suffix = (52u << 26) | (uint32_t(frs.encoding()) << 21) |
-+                    (uint32_t(ra.code()) << 16) | d1;
-+  m_buffer.enterNoPool(3);
-+  ensurePrefixedAlignment();
-+  BufferOffset bo = writeInst(prefix);
-+  writeInst(suffix);
-+  m_buffer.leaveNoPool();
-+  return bo;
-+}
-+
-+// POWER10 (ISA 3.1) Vector Extract Mask. RT (GPR) gets the wasm-spec
-+// bitmask (one bit per lane MSB) directly in low 16/8/4/2 bits. UIM
-+// is baked into PPC_vextract{b,h,w,d}m (8/9/10/11). Caller must have
-+// verified HasPOWER10().
-+#define DEF_VEXTRACT_M(op)                                                 \
-+  BufferOffset Assembler::as_##op(Register rt, FloatRegister vrb) {        \
-+    spew(#op "\t%s,vr%u", rt.name(), uint32_t(vrb.encoding() & 31));       \
-+    return writeInst(PPC_##op | (uint32_t(rt.code()) << 21) |              \
-+                     ((uint32_t(vrb.encoding()) & 31) << 11));             \
-+  }
-+DEF_VEXTRACT_M(vextractbm)
-+DEF_VEXTRACT_M(vextracthm)
-+DEF_VEXTRACT_M(vextractwm)
-+DEF_VEXTRACT_M(vextractdm)
-+#undef DEF_VEXTRACT_M
-+
-+// POWER10 (ISA 3.1) Vector Insert Word/Doubleword from GPR. VX-form:
-+// VRT at bits 21..25, UIM at bits 16..20, RB at bits 11..15.
-+#define DEF_VINS(op, max_uim)                                              \
-+  BufferOffset Assembler::as_##op(FloatRegister vrt, Register rb,          \
-+                                  uint8_t uim) {                           \
-+    MOZ_ASSERT(uim <= (max_uim));                                          \
-+    spew(#op "\tvr%u,%s,%u", uint32_t(vrt.encoding() & 31), rb.name(),     \
-+         uint32_t(uim));                                                   \
-+    return writeInst(PPC_##op |                                            \
-+                     ((uint32_t(vrt.encoding()) & 31) << 21) |             \
-+                     (uint32_t(uim) << 16) |                               \
-+                     (uint32_t(rb.code()) << 11));                         \
-+  }
-+DEF_VINS(vinsw, 12)
-+DEF_VINS(vinsd, 8)
-+#undef DEF_VINS
-+
-+// POWER10 (ISA 3.1) Vector Insert byte/halfword from GPR with
-+// register-supplied byte position. VX-form: VRT at bits 21..25,
-+// RA at bits 16..20, RB at bits 11..15. "rx" is right-indexed
-+// (LE-natural — index 0 = LSB byte).
-+#define DEF_VINS_RX(op)                                                    \
-+  BufferOffset Assembler::as_##op(FloatRegister vrt, Register ra,          \
-+                                  Register rb) {                           \
-+    spew(#op "\tvr%u,%s,%s", uint32_t(vrt.encoding() & 31), ra.name(),     \
-+         rb.name());                                                       \
-+    return writeInst(PPC_##op |                                            \
-+                     ((uint32_t(vrt.encoding()) & 31) << 21) |             \
-+                     (uint32_t(ra.code()) << 16) |                         \
-+                     (uint32_t(rb.code()) << 11));                         \
-+  }
-+DEF_VINS_RX(vinsbrx)
-+DEF_VINS_RX(vinshrx)
-+#undef DEF_VINS_RX
-+
-+// POWER9 (ISA 3.0) V-form 3-operand instructions with VRT, UIM, VRB at
-+// bits 21..25, 16..20, 11..15 respectively (vinsert{b,h}, vextract{ub,uh}).
-+// Simd128 lives in VSR32-63 (= VR0-31), so we mask VRT and VRB to the
-+// 5-bit VR field via `encoding() & 31`.
-+#define DEF_VRT_UIM_VRB(op, max_uim, uim_step)                              \
-+  BufferOffset Assembler::as_##op(FloatRegister vrt, FloatRegister vrb,    \
-+                                  uint8_t uim) {                           \
-+    MOZ_ASSERT(uim <= (max_uim));                                          \
-+    MOZ_ASSERT((uim) % (uim_step) == 0);                                   \
-+    spew(#op "\tvr%u,vr%u,%u", uint32_t(vrt.encoding() & 31),              \
-+         uint32_t(vrb.encoding() & 31), uint32_t(uim));                    \
-+    return writeInst(PPC_##op |                                            \
-+                     ((uint32_t(vrt.encoding()) & 31) << 21) |             \
-+                     (uint32_t(uim) << 16) |                               \
-+                     ((uint32_t(vrb.encoding()) & 31) << 11));             \
-+  }
-+DEF_VRT_UIM_VRB(vinsertb, 15, 1)
-+DEF_VRT_UIM_VRB(vinserth, 14, 2)
-+DEF_VRT_UIM_VRB(vextractub, 15, 1)
-+DEF_VRT_UIM_VRB(vextractuh, 14, 2)
-+#undef DEF_VRT_UIM_VRB
-+
-+// VMX binary VX-form pack/merge (re-use DEF_VMX_VVV pattern).
-+#define DEF_VMX_VVV(op)                                                    \
-+  BufferOffset Assembler::as_##op(uint8_t vrt, uint8_t vra, uint8_t vrb) { \
-+    MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32);                          \
-+    spew(#op "\tvr%d,vr%d,vr%d", vrt, vra, vrb);                           \
-+    return writeInst(PPC_##op | vrt << 21 | vra << 16 | vrb << 11);        \
-+  }
-+DEF_VMX_VVV(vpkshss)
-+DEF_VMX_VVV(vpkswss) DEF_VMX_VVV(vpkshus) DEF_VMX_VVV(vpkswus)
-+    DEF_VMX_VVV(vmrghb)
-+        DEF_VMX_VVV(vmrghh) DEF_VMX_VVV(vmrghw) DEF_VMX_VVV(vmrglb)
-+            DEF_VMX_VVV(vmrglh) DEF_VMX_VVV(vmrglw) DEF_VMX_VVV(vmulesb)
-+                DEF_VMX_VVV(vmulosb) DEF_VMX_VVV(vmuleub) DEF_VMX_VVV(vmuloub)
-+                    DEF_VMX_VVV(vmulesh) DEF_VMX_VVV(vmulosh)
-+                        DEF_VMX_VVV(vmuleuh) DEF_VMX_VVV(vmulouh)
-+                            DEF_VMX_VVV(vmulesw) DEF_VMX_VVV(vmulosw)
-+                                DEF_VMX_VVV(vmuleuw) DEF_VMX_VVV(vmulouw)
-+#undef DEF_VMX_VVV
-+
-+    // vperm VA-form: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | VRC<<6 | XO
-+    BufferOffset Assembler::as_vperm(uint8_t vrt, uint8_t vra, uint8_t vrb,
-+                                     uint8_t vrc) {
-+  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
-+  spew("vperm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
-+  return writeInst(PPC_vperm | vrt << 21 | vra << 16 | vrb << 11 | vrc << 6);
-+}
-+
-+// VA-form ternary VMX: (4<<26) | VRT<<21 | VRA<<16 | VRB<<11 | VRC<<6 |
-+// XO(6-bit)
-+BufferOffset Assembler::as_vmladduhm(uint8_t vrt, uint8_t vra, uint8_t vrb,
-+                                     uint8_t vrc) {
-+  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
-+  spew("vmladduhm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
-+  return writeInst(PPC_vmladduhm | vrt << 21 | vra << 16 | vrb << 11 |
-+                   vrc << 6);
-+}
-+
-+BufferOffset Assembler::as_vmhraddshs(uint8_t vrt, uint8_t vra, uint8_t vrb,
-+                                      uint8_t vrc) {
-+  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
-+  spew("vmhraddshs\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
-+  return writeInst(PPC_vmhraddshs | vrt << 21 | vra << 16 | vrb << 11 |
-+                   vrc << 6);
-+}
-+
-+BufferOffset Assembler::as_vmsumshm(uint8_t vrt, uint8_t vra, uint8_t vrb,
-+                                    uint8_t vrc) {
-+  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
-+  spew("vmsumshm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
-+  return writeInst(PPC_vmsumshm | vrt << 21 | vra << 16 | vrb << 11 |
-+                   vrc << 6);
-+}
-+
-+BufferOffset Assembler::as_vmsumuhm(uint8_t vrt, uint8_t vra, uint8_t vrb,
-+                                    uint8_t vrc) {
-+  MOZ_ASSERT(vrt < 32 && vra < 32 && vrb < 32 && vrc < 32);
-+  spew("vmsumuhm\tvr%d,vr%d,vr%d,vr%d", vrt, vra, vrb, vrc);
-+  return writeInst(PPC_vmsumuhm | vrt << 21 | vra << 16 | vrb << 11 |
-+                   vrc << 6);
-+}
-+
-+BufferOffset Assembler::as_vspltisb(uint8_t vrt, int8_t simm5) {
-+  MOZ_ASSERT(vrt < 32);
-+  MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
-+  spew("vspltisb\tvr%d,%d", vrt, simm5);
-+  return writeInst(PPC_vspltisb | uint32_t(vrt) << 21 |
-+                   (uint32_t(simm5) & 0x1F) << 16);
-+}
-+
-+BufferOffset Assembler::as_vspltish(uint8_t vrt, int8_t simm5) {
-+  MOZ_ASSERT(vrt < 32);
-+  MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
-+  spew("vspltish\tvr%d,%d", vrt, simm5);
-+  return writeInst(PPC_vspltish | uint32_t(vrt) << 21 |
-+                   (uint32_t(simm5) & 0x1F) << 16);
-+}
-+
-+BufferOffset Assembler::as_vspltisw(uint8_t vrt, int8_t simm5) {
-+  MOZ_ASSERT(vrt < 32);
-+  MOZ_ASSERT(simm5 >= -16 && simm5 <= 15);
-+  spew("vspltisw\tvr%d,%d", vrt, simm5);
-+  return writeInst(PPC_vspltisw | uint32_t(vrt) << 21 |
-+                   (uint32_t(simm5) & 0x1F) << 16);
-+}
-+
-+// --- Convenience pseudo-instructions ---
-+
-+BufferOffset Assembler::xs_trap() {
-+  spew("trap @ %08x", currentOffset());
-+  return writeInst(PPC_trap);
-+}
-+
-+BufferOffset Assembler::xs_trap_tagged(TrapTag tag) {
-+  uint32_t tv = PPC_trap | ((uint8_t)tag << 16) | ((uint8_t)tag << 11);
-+  spew("trap @ %08x ; MARK %d %08x", currentOffset(), (uint8_t)tag, tv);
-+  return writeInst(tv);
-+}
-+
-+BufferOffset Assembler::xs_mr(Register rd, Register ra) {
-+  return as_or_(rd, ra, ra);
-+}
-+
-+BufferOffset Assembler::xs_mtctr(Register ra) {
-+  return as_mtspr((SPRegisterID)spr_ctr, ra);
-+}
-+
-+BufferOffset Assembler::xs_mtlr(Register ra) {
-+  return as_mtspr((SPRegisterID)spr_lr, ra);
-+}
-+
-+BufferOffset Assembler::xs_mflr(Register rd) {
-+  return as_mfspr(rd, (SPRegisterID)spr_lr);
-+}
-+
-+BufferOffset Assembler::xs_mtcr(Register rs) { return as_mtcrf(0xff, rs); }
-+
-+BufferOffset Assembler::xs_mfxer(Register ra) {
-+  return as_mfspr(ra, (SPRegisterID)spr_xer);
-+}
-+
-+BufferOffset Assembler::xs_mtxer(Register ra) {
-+  return as_mtspr((SPRegisterID)spr_xer, ra);
-+}
-+
-+BufferOffset Assembler::xs_li(Register rd, int16_t im) {
-+  return as_addi(rd, r0, im, true);
-+}
-+
-+BufferOffset Assembler::xs_lis(Register rd, int16_t im) {
-+  return as_addis(rd, r0, im, true);
-+}
-+
-+BufferOffset Assembler::x_subi(Register rd, Register ra, int16_t im) {
-+  return as_addi(rd, ra, -im);
-+}
-+
-+BufferOffset Assembler::x_not(Register rd, Register ra) {
-+  return as_nor(rd, ra, ra);
-+}
-+
-+BufferOffset Assembler::x_slwi(Register rd, Register rs, int n) {
-+  MOZ_ASSERT(n >= 0 && n < 32);
-+  return as_rlwinm(rd, rs, n, 0, 31 - n);
-+}
-+
-+BufferOffset Assembler::x_sldi(Register rd, Register rs, int n) {
-+  return as_rldicr(rd, rs, n, 63 - n);
-+}
-+
-+BufferOffset Assembler::x_srwi(Register rd, Register rs, int n) {
-+  MOZ_ASSERT(n >= 0 && n < 32);
-+  if (n == 0) {
-+    return as_rlwinm(rd, rs, 0, 0, 31);
-+  }
-+  return as_rlwinm(rd, rs, 32 - n, n, 31);
-+}
-+
-+BufferOffset Assembler::x_srdi(Register rd, Register rs, int n) {
-+  MOZ_ASSERT(n >= 0 && n < 64);
-+  if (n == 0) {
-+    return as_or_(rd, rs, rs);
-+  }
-+  return as_rldicl(rd, rs, 64 - n, n);
-+}
-+
-+BufferOffset Assembler::x_bit_value(Register rd, Register rs, unsigned bit) {
-+  return as_rlwinm(rd, rs, bit + 1, 31, 31);
-+}
-+
-+BufferOffset Assembler::x_insertbits0_15(Register rd, Register rs) {
-+  return as_rlwimi(rd, rs, 0, 16, 31);
-+}
-+
-+BufferOffset Assembler::x_sr_mulli(Register rd, Register ra, int16_t im) {
-+  as_sradi(rd, ra, 63);
-+  return as_mulli(rd, rd, im);
-+}
-+
-+void Assembler::as_break(uint32_t code) {
-+  spew("break\t%d", code);
-+  writeInst(PPC_trap);
-+}
-+
-+// ========================================================================
-+// Label binding, retarget, and code label processing.
-+// ========================================================================
-+
-+// Forward-declared shape helpers; full definitions and the layout
-+// commentary live with the WriteLoad64Instructions section below.
-+static bool IsAddpcisLoad64Stanza(uint32_t enc0);
-+static uint8_t Load64StanzaDestReg(Instruction* inst0);
-+
-+InstImm Assembler::invertBranch(InstImm branch, BOffImm16 skipOffset) {
-+  // Flip the BO condition-true/condition-false bit (bit 24).
-+  uint32_t data = branch.encode();
-+  data = (data ^ 0x01000000) & 0xFFFF0003;
-+  data |= skipOffset.encode();
-+  branch.setData(data);
-+  return branch;
-+}
-+
-+void Assembler::bind(InstImm* inst, uintptr_t branch, uintptr_t target) {
-+  intptr_t offset = target - branch;
-+  Instruction* i0 = (Instruction*)inst;
-+
-+  if (i0->next()->encode() == PPC_bcl_always_plus4 ||
-+      IsAddpcisLoad64Stanza(i0->encode())) {
-+    // Pre-existing long stanza, either P8 (mflr + bcl marker at [1]) or
-+    // P9+ (addpcis at [0]; major opcode 19). Either way, just register
-+    // the long jump — the stanza's .quad at [6..7] gets patched later
-+    // via UpdateLoad64Value.
-+    addLongJump(BufferOffset(branch), BufferOffset(target));
-+    return;
-+  }
-+
-+  if (i0->isOpcode((uint32_t)PPC_tw)) {
-+    // Tagged trap stanza. The tag tells us which branch type was reserved.
-+    TrapTag tag = (TrapTag)inst->traptag();
-+    Instruction* i1 = i0->next();
-+    Instruction* i2 = i1->next();
-+    Instruction* i3 = i2->next();
-+    Instruction* i4 = i3->next();
-+    Instruction* i5 = i4->next();
-+    Instruction* i6 = i5->next();
-+    Instruction* i7 = i6->next();
-+    Instruction* i8 = i7->next();
-+    Instruction* i9 = i8->next();
-+
-+    switch (tag) {
-+      case BCTag: {
-+        // inst[-1] is the original bc instruction.
-+        Instruction* bc = i0 - 1;
-+        // Try short bc (offset + 4 because bc is one instruction before tw).
-+        if (BOffImm16::IsInRange(offset + (intptr_t)sizeof(uint32_t))) {
-+          bc->setData(((bc->encode() ^ 0x01000000) & 0xFFFF0003) |
-+                      BOffImm16(offset + sizeof(uint32_t)).encode());
-+          i0->makeNop();
-+          i1->makeNop();
-+          i2->makeNop();
-+          i3->makeNop();
-+          i4->makeNop();
-+          i5->makeNop();
-+          i6->makeNop();
-+          i7->makeNop();
-+          i8->makeNop();
-+          i9->makeNop();
-+          return;
-+        }
-+        // Try short b (unconditional).
-+        if (JOffImm26::IsInRange(offset)) {
-+          i0->setData(PPC_b | JOffImm26(offset).encode());
-+          i1->makeNop();
-+          i2->makeNop();
-+          i3->makeNop();
-+          i4->makeNop();
-+          i5->makeNop();
-+          i6->makeNop();
-+          i7->makeNop();
-+          i8->makeNop();
-+          i9->makeNop();
-+          return;
-+        }
-+        // Long: WriteLoad64 to SecondScratchReg + mtctr + bctr.
-+        addLongJump(BufferOffset(branch), BufferOffset(target));
-+        WriteLoad64Instructions(i0, SecondScratchReg,
-+                                LabelBase::INVALID_OFFSET);
-+        i8->makeOp_mtctr(SecondScratchReg);
-+        i9->makeOp_bctr();
-+        break;
-+      }
-+      case CallTag: {
-+        // For calls, the actual call instruction goes at inst[9] and
-+        // the return address must be after the stanza.
-+        intptr_t callOffset = offset - 9 * (intptr_t)sizeof(uint32_t);
-+        if (JOffImm26::IsInRange(callOffset)) {
-+          i0->makeNop();
-+          i1->makeNop();
-+          i2->makeNop();
-+          i3->makeNop();
-+          i4->makeNop();
-+          i5->makeNop();
-+          i6->makeNop();
-+          i7->makeNop();
-+          i8->makeNop();
-+          i9->setData(PPC_b | JOffImm26(callOffset).encode() | LinkB);
-+          return;
-+        }
-+        // Long: WriteLoad64 to SecondScratchReg + mtctr + bctrl.
-+        addLongJump(BufferOffset(branch), BufferOffset(target));
-+        WriteLoad64Instructions(i0, SecondScratchReg,
-+                                LabelBase::INVALID_OFFSET);
-+        i8->makeOp_mtctr(SecondScratchReg);
-+        i9->makeOp_bctr(LinkB);
-+        break;
-+      }
-+      case BTag: {
-+        if (JOffImm26::IsInRange(offset)) {
-+          i0->setData(PPC_b | JOffImm26(offset).encode());
-+          i1->makeNop();
-+          i2->makeNop();
-+          i3->makeNop();
-+          i4->makeNop();
-+          i5->makeNop();
-+          i6->makeNop();
-+          i7->makeNop();
-+          i8->makeNop();
-+          i9->makeNop();
-+          return;
-+        }
-+        // Long: WriteLoad64 to SecondScratchReg + mtctr + bctr.
-+        addLongJump(BufferOffset(branch), BufferOffset(target));
-+        WriteLoad64Instructions(i0, SecondScratchReg,
-+                                LabelBase::INVALID_OFFSET);
-+        i8->makeOp_mtctr(SecondScratchReg);
-+        i9->makeOp_bctr();
-+        break;
-+      }
-+      default:
-+        MOZ_CRASH("Unexpected TrapTag");
-+    }
-+    return;
-+  }
-+
-+  if (i0->isOpcode(PPC_b)) {
-+    // Short unconditional branch — set offset, nop next-in-chain slot.
-+    MOZ_ASSERT(JOffImm26::IsInRange(offset));
-+    i0->setData((i0->encode() & ~0x03FFFFFC) | JOffImm26(offset).encode());
-+    i0->next()->makeNop();
-+    return;
-+  }
-+
-+  if (i0->isOpcode(PPC_bc)) {
-+    // Short conditional branch — preserve upper 16 bits, set offset.
-+    MOZ_ASSERT(BOffImm16::IsInRange(offset));
-+    i0->setData((i0->encode() & 0xFFFF0003) | BOffImm16(offset).encode());
-+    i0->next()->makeNop();
-+    return;
-+  }
-+
-+  MOZ_CRASH("Unexpected instruction in bind");
-+}
-+
-+void Assembler::bind(Label* label, BufferOffset boff) {
-+  if (label->used()) {
-+    bool more;
-+    BufferOffset b(label);
-+    do {
-+      BufferOffset next;
-+      InstImm* inst = (InstImm*)editSrc(b);
-+      Instruction* i1 = ((Instruction*)inst)->next();
-+      more = (i1->encode() != LabelBase::INVALID_OFFSET);
-+      if (more) {
-+        next = BufferOffset(i1->encode());
-+      }
-+      bind(inst, b.getOffset(), boff.getOffset());
-+      b = next;
-+    } while (more);
-+  }
-+  label->bind(boff.getOffset());
-+}
-+
-+void Assembler::retarget(Label* label, Label* target) {
-+  spew("retarget");
-+  if (label->used() && !oom()) {
-+    if (target->bound()) {
-+      bind(label, BufferOffset(target));
-+    } else if (target->used()) {
-+      // Prepend label's use chain to target's use chain.
-+      BufferOffset b(label);
-+      BufferOffset next;
-+      do {
-+        Instruction* inst = (Instruction*)editSrc(b);
-+        Instruction* i1 = inst->next();
-+        if (i1->encode() != LabelBase::INVALID_OFFSET) {
-+          next = BufferOffset(i1->encode());
-+        } else {
-+          // End of label's chain — link to target's head.
-+          i1->setData(target->offset());
-+          break;
-+        }
-+        b = next;
-+      } while (true);
-+    }
-+    // Transfer label's use list to target.
-+    if (!target->bound()) {
-+      target->use(label->offset());
-+    }
-+  }
-+  label->reset();
-+}
-+
-+void Assembler::processCodeLabels(uint8_t* rawCode) {
-+  for (const CodeLabel& label : codeLabels_) {
-+    Bind(rawCode, label);
-+  }
-+}
-+
-+// ========================================================================
-+// Load64 instruction sequence (8 slots, literal pool format):
-+//   [0] mflr r0            — save LR
-+//   [1] bcl 20,0,.+4      — LR = address of [2]
-+//   [2] mflr rD            — rD = address of [2]
-+//   [3] mtlr r0            — restore LR
-+//   [4] ld rD, 16(rD)      — load from [6..7] (offset = 24 - 8 = 16)
-+//   [5] b .+12             — skip data
-+//   [6..7] .quad VALUE     — 8-byte data
-+// ========================================================================
-+
-+// ========================================================================
-+// Constant pool callbacks (required by AssemblerBufferWithConstantPools).
-+// ========================================================================
-+
-+/* static */
-+void Assembler::InsertIndexIntoTag(uint8_t* load, uint32_t index) {
-+  // Stash the pool entry index in the hint word's low 16 bits; the high
-+  // bits carry the dest reg and load type, consumed by
-+  // PatchConstantPoolLoad when the pool is resolved.
-+  uint32_t* inst = (uint32_t*)load;
-+  *inst = (*inst & 0xFFFF0000) | (index & 0xFFFF);
-+}
-+
-+/* static */
-+bool Assembler::PatchConstantPoolLoad(void* loadAddr, void* constPoolAddr) {
-+  // Rewrite placeholder instructions with a pool load sequence.
-+  // Hint word layout (set by loadFromPoolFloat64 / loadFromPoolFloat32 /
-+  // loadFromPoolSimd128):
-+  //   bits 0-15:  pool entry index
-+  //   bits 16-20: destination register (FPR encoding)
-+  //   bits 21-22: load type (PoolLoadFPR64, PoolLoadSimd128, PoolLoadFPR32)
-+  //   bits 28-31: sentinel 0xF
-+
-+  uint32_t* inst = (uint32_t*)loadAddr;
-+
-+  uint32_t hint = inst[0];
-+  uint32_t index = hint & 0xFFFF;
-+  uint32_t destReg = (hint >> 16) & 0x1F;
-+  uint32_t loadType = (hint >> 21) & 0x3;
-+
-+  // Displacement: pool entry address relative to inst[1] (mflr target) for the
-+  // bcl path, or relative to inst[0]+4 (addpcis target = CIA+4, which is the
-+  // address of inst[1]) for the addpcis path. Both conventions resolve to the
-+  // same value: (pool entry) − (loadAddr + 4).
-+  int32_t displacement =
-+      (int32_t)((uint8_t*)constPoolAddr + index * 4 - ((uint8_t*)loadAddr + 4));
-+
-+  if (loadType == PoolLoadFPR64 || loadType == PoolLoadFPR32) {
-+    // Three emission paths:
-+    //
-+    // POWER10 (preferred): plfd/plfs FRT, SI(0), R=1 — single PC-relative
-+    //   prefixed FP load. 8 bytes = 2 slots; slot 2 becomes a nop. If
-+    //   loadAddr % 64 == 60, plfd would straddle a 64-byte block, so emit
-+    //   a leading nop at slot 0 and place plfd at slots 1-2 instead.
-+    //   Reach: ±8 GB (34-bit signed). No LR clobber, no r16 base.
-+    //
-+    // POWER9: addpcis + lfd/lfs + nop. 2 real insns, no LR clobber, no
-+    //   Return Address Stack corruption. Base register is r16.
-+    //   Displacement splits into (hi << 16) + lo where lo is the 16-bit
-+    //   signed D-field of lfd/lfs. Reach: ±2 GB.
-+    //
-+    // POWER8: bcl + mflr r16 + lfd/lfs. Same clobber + RAS caveat as before.
-+    //   Kept as a correctness fallback; not exercised today because the
-+    //   loadConstantDouble/Float32 wrappers skip the pool on POWER8.
-+    //
-+    // lfs/plfs (32-bit) auto-expand their result to double-precision in the
-+    // FPR, replacing the non-pool path's separate xscvspdpn step.
-+    uint32_t baseReg = SavedScratchRegister.code();
-+    uint32_t loadOp = (loadType == PoolLoadFPR64) ? PPC_lfd : PPC_lfs;
-+
-+    if (HasPOWER10()) {
-+      // MLS prefixed FP load. plfd suffix opcode = 50, plfs = 48. Same
-+      // alignment-driven slot placement as PoolLoadSimd128 above.
-+      uint64_t loadAddrBits = reinterpret_cast<uint64_t>(loadAddr);
-+      // loadAddr is the buffer-time pointer; the final executable base is
-+      // only 16-byte aligned, so the unsafe straddle is when
-+      // (loadAddrBits & 15) == 12 (matches ensurePrefixedAlignment above).
-+      bool needLeadingNop = (loadAddrBits & 15) == 12;
-+      int prefixSlot = needLeadingNop ? 1 : 0;
-+      int prefixByteOffset = prefixSlot * 4;
-+      int64_t SI = int64_t(displacement) + 4 - prefixByteOffset;
-+      MOZ_ASSERT(SI >= -(int64_t(1) << 33) && SI < (int64_t(1) << 33));
-+      uint32_t d0 = uint32_t((uint64_t(SI) >> 16) & 0x3FFFFu);
-+      uint32_t d1 = uint32_t(uint64_t(SI) & 0xFFFFu);
-+      // Type 2 (MLS), R=1, RA=0.
-+      uint32_t prefix =
-+          (1u << 26) | (2u << 24) | (1u << 20) | (d0 & 0x3FFFFu);
-+      uint32_t suffixOp = (loadType == PoolLoadFPR64) ? 50u : 48u;
-+      uint32_t suffix = (suffixOp << 26) | (destReg << 21) | d1;
-+
-+      if (needLeadingNop) {
-+        inst[0] = NopInst;
-+        inst[1] = prefix;
-+        inst[2] = suffix;
-+      } else {
-+        inst[0] = prefix;
-+        inst[1] = suffix;
-+        inst[2] = NopInst;
-+      }
-+    } else if (HasPOWER9()) {
-+      // Split displacement into addpcis hi field and lfd/lfs lo field so that
-+      //   target = (CIA + 4) + (hi << 16) + SEXT16(lo).
-+      // Only 2 slots are reserved on P9 (loadFromPoolFloat{32,64} above);
-+      // do NOT touch inst[2], it belongs to the next entry.
-+      int16_t lo = (int16_t)(displacement & 0xFFFF);
-+      int32_t hiAdj = displacement - lo;
-+      MOZ_ASSERT((hiAdj & 0xFFFF) == 0);
-+      int32_t hi = hiAdj >> 16;
-+      MOZ_ASSERT(hi >= -32768 && hi <= 32767);
-+      // [0] addpcis r16, hi
-+      uint32_t Dhi = uint16_t(hi);
-+      inst[0] = (19u << 26) | (baseReg << 21) | ((Dhi >> 1) & 0x1F) << 16 |
-+                ((Dhi >> 6) & 0x3FF) << 6 | (2u << 1) | (Dhi & 1u);
-+      // [1] lfd/lfs fD, lo(r16)
-+      inst[1] = loadOp | (destReg << 21) | (baseReg << 16) | (uint16_t(lo));
-+    } else {
-+      MOZ_ASSERT(displacement >= -32768 && displacement < 32768);
-+      // [0] bcl 20,0,$+4
-+      inst[0] = PPC_bcl_always_plus4;
-+      // [1] mflr r16
-+      inst[1] = PPC_mfspr | (baseReg << 21) | PPC_SPR(spr_lr);
-+      // [2] lfd/lfs fD, displacement(r16)
-+      inst[2] =
-+          loadOp | (destReg << 21) | (baseReg << 16) | (displacement & 0xFFFF);
-+    }
-+  } else if (loadType == PoolLoadSimd128) {
-+    // Three emission paths (5 slots reserved by loadFromPoolSimd128):
-+    //
-+    // POWER10 (preferred): plxv vsD, SI(0), R=1 — single PC-relative
-+    //   prefixed load, natural-LE byte order (no xxpermdi needed). 8 bytes
-+    //   = 2 slots; slots 2-4 become nops. If the prefix would straddle a
-+    //   64-byte block (loadAddr % 64 == 60), emit a leading nop at slot 0
-+    //   and place plxv at slots 1-2 instead. Reach: ±8 GB (34-bit signed).
-+    //
-+    // POWER9: addpcis-equivalent via bcl + mflr + addi + lxvx + nop. 5
-+    //   real insns, natural LE.
-+    //
-+    // POWER8: same prelude + lxvd2x + xxpermdi (BE-DW byte-swap fixup).
-+    //
-+    // See PoolLoadFPR64 above for why r16 instead of r12.
-+    MOZ_ASSERT(displacement >= -32768 && displacement < 32768);
-+    // Simd128 dest is in VR-namespace (encoding 32-63). Hint stores only
-+    // the low 5 bits (loadFromPoolSimd128 masks); we set TX unconditionally
-+    // since PoolLoadSimd128 always targets a Simd128.
-+    constexpr uint32_t kTX = 1u;
-+    constexpr uint32_t kAxBxTx_xxpermdi = (1u << 2) | (1u << 1) | 1u;
-+
-+    if (HasPOWER10()) {
-+      // Place plxv prefix at the highest 4-byte-aligned offset within
-+      // the 5 reserved slots that doesn't straddle a 64-byte block.
-+      uint64_t loadAddrBits = reinterpret_cast<uint64_t>(loadAddr);
-+      // loadAddr is the buffer-time pointer; the final executable base is
-+      // only 16-byte aligned, so the unsafe straddle is when
-+      // (loadAddrBits & 15) == 12 (matches ensurePrefixedAlignment above).
-+      bool needLeadingNop = (loadAddrBits & 15) == 12;
-+      int prefixSlot = needLeadingNop ? 1 : 0;
-+      int prefixByteOffset = prefixSlot * 4;
-+      // SI = (pool entry addr) - (prefix addr)
-+      //    = (loadAddr + 4 + displacement) - (loadAddr + prefixByteOffset)
-+      //    = displacement + 4 - prefixByteOffset
-+      int64_t SI = int64_t(displacement) + 4 - prefixByteOffset;
-+      MOZ_ASSERT(SI >= -(int64_t(1) << 33) && SI < (int64_t(1) << 33));
-+      uint32_t d0 = uint32_t((uint64_t(SI) >> 16) & 0x3FFFFu);
-+      uint32_t d1 = uint32_t(uint64_t(SI) & 0xFFFFu);
-+      // Prefix: primary opcode 1, Type 0 (8LS), R=1, d0 at LE bits 17..0.
-+      uint32_t prefix =
-+          (1u << 26) | (0u << 24) | (1u << 20) | (d0 & 0x3FFFFu);
-+      // Suffix: 5-bit opcode 25 at LE 31..27, TX at LE 26, T at LE 25..21,
-+      //         RA=0 at LE 20..16, d1 at LE 15..0.
-+      uint32_t suffix = (25u << 27) | (kTX << 26) | (destReg << 21) | d1;
-+
-+      // P10 reserves 3 slots; only inst[0..2] are written. Slots 3..4
-+      // belong to the next pool entry on P10.
-+      if (needLeadingNop) {
-+        inst[0] = NopInst;
-+        inst[1] = prefix;
-+        inst[2] = suffix;
-+      } else {
-+        inst[0] = prefix;
-+        inst[1] = suffix;
-+        inst[2] = NopInst;
-+      }
-+    } else if (HasPOWER9()) {
-+      // addpcis + addi + lxvx (3 slots) — no LR clobber, no RAS hazard.
-+      // Same displacement split as the FP scalar P9 path: target =
-+      // (CIA+4) + (hi << 16) + SEXT16(lo). lxvx is X-form indexed (no
-+      // immediate offset), so combine the low 16 bits into r16 via addi
-+      // before the load.
-+      int16_t lo = (int16_t)(displacement & 0xFFFF);
-+      int32_t hiAdj = displacement - lo;
-+      MOZ_ASSERT((hiAdj & 0xFFFF) == 0);
-+      int32_t hi = hiAdj >> 16;
-+      MOZ_ASSERT(hi >= -32768 && hi <= 32767);
-+      uint32_t Dhi = uint16_t(hi);
-+      uint32_t baseReg = SavedScratchRegister.code();
-+      // [0] addpcis r16, hi
-+      inst[0] = (19u << 26) | (baseReg << 21) | ((Dhi >> 1) & 0x1F) << 16 |
-+                ((Dhi >> 6) & 0x3FF) << 6 | (2u << 1) | (Dhi & 1u);
-+      // [1] addi r16, r16, lo
-+      inst[1] = PPC_addi | (baseReg << 21) | (baseReg << 16) | uint16_t(lo);
-+      // [2] lxvx vsD, 0, r16  (XT[0:4] in bits 21-25, TX at bit 0)
-+      inst[2] = PPC_lxvx | (destReg << 21) | (baseReg << 11) | kTX;
-+    } else {
-+      // P8 fallback: bcl + mflr + addi + lxvd2x + xxpermdi (5 slots).
-+      // Clobbers LR; correctness-only path.
-+      uint32_t baseReg = SavedScratchRegister.code();
-+      inst[0] = PPC_bcl_always_plus4;
-+      inst[1] = PPC_mfspr | (baseReg << 21) | PPC_SPR(spr_lr);
-+      inst[2] = PPC_addi | (baseReg << 21) | (baseReg << 16) |
-+                (displacement & 0xFFFF);
-+      // lxvd2x XT, RA=0, RB=r16 — loads in BE order on LE.
-+      inst[3] = PPC_lxvd2x | (destReg << 21) | (baseReg << 11) | kTX;
-+      // xxpermdi XT, XT, XT, 2 — swap doublewords for LE byte order.
-+      inst[4] = PPC_xxpermdi | (destReg << 21) | (destReg << 16) |
-+                (destReg << 11) | (2u << 8) | kAxBxTx_xxpermdi;
-+    }
-+  } else {
-+    MOZ_CRASH("PatchConstantPoolLoad: unsupported load type");
-+  }
-+
-+  return false;
-+}
-+
-+/* static */
-+void Assembler::WritePoolGuard(BufferOffset branch, Instruction* inst,
-+                               BufferOffset dest) {
-+  // Emit an unconditional branch over the pool data.
-+  int32_t offset = dest.getOffset() - branch.getOffset();
-+  MOZ_ASSERT(JOffImm26::IsInRange(offset));
-+  inst->setData(PPC_b | (offset & 0x03FFFFFC));
-+}
-+
-+/* static */
-+void Assembler::WritePoolHeader(uint8_t* start, Pool* p, bool isNatural) {
-+  // Write pool identification header.
-+  // Encode pool size and isNatural flag in a single 32-bit word.
-+  uint32_t poolSize = p->getPoolSize();
-+  uint32_t sizeInWords = (poolSize + 4 + 3) >> 2;  // header + data, in words
-+  MOZ_ASSERT(sizeInWords < (1 << 15));
-+  uint32_t header = (sizeInWords & 0x7FFF) | (isNatural ? (1 << 15) : 0) |
-+                    0xFFFF0000;  // sentinel
-+  *(uint32_t*)start = header;
-+}
-+
-+/* static */
-+void Assembler::PatchShortRangeBranchToVeneer(PPCBuffer*, unsigned rangeIdx,
-+                                              BufferOffset deadline,
-+                                              BufferOffset veneer) {
-+  // PPC64 does not use short-range branch tracking (NumShortBranchRanges = 0).
-+  MOZ_CRASH("PatchShortRangeBranchToVeneer: should not be called");
-+}
-+
-+// Two stanza shapes share the same 8-slot footprint and the same .quad
-+// location at slots [6..7] (so ExtractLoad64Value / UpdateLoad64Value are
-+// shape-agnostic):
-+//
-+//   POWER8 (no addpcis):
-+//     [0] mflr r0
-+//     [1] bcl 20,0,.+4         (LR := pc of [2])
-+//     [2] mflr rD
-+//     [3] mtlr r0
-+//     [4] ld rD, 16(rD)
-+//     [5] b .+12
-+//     [6..7] .quad VALUE
-+//
-+//   POWER9+ (addpcis):
-+//     [0] addpcis rD, 0        (rD := NIA = pc of [1])
-+//     [1] ld rD, 20(rD)        (rD := mem[pc_of_[1] + 20] = mem[slot[6]])
-+//     [2] b .+24
-+//     [3..5] NOP, NOP, NOP
-+//     [6..7] .quad VALUE
-+//
-+// The P9+ form drops the bcl/mflr/mtlr LR-bounce (no RAS thrash) and runs
-+// 2 dynamic insns instead of 6. Distinguished at patch time by inst[0]'s
-+// major opcode: 31 = mfspr (P8) vs 19 = addpcis (P9+).
-+static bool IsAddpcisLoad64Stanza(uint32_t enc0) {
-+  return ((enc0 >> 26) & 0x3f) == 19;
-+}
-+
-+// Extract the destination register from a load64 stanza in either shape.
-+// P8 stores rD in `mflr rD` at slot [2]; P9+ stores rD in `addpcis rD, 0`
-+// at slot [0]. Both encode RT at LE bits [21..25].
-+static uint8_t Load64StanzaDestReg(Instruction* inst0) {
-+  if (IsAddpcisLoad64Stanza(inst0->encode())) {
-+    return (inst0[0].encode() >> 21) & 0x1f;
-+  }
-+  return (inst0[2].encode() >> 21) & 0x1f;
-+}
-+
-+/* static */
-+void Assembler::WriteLoad64Instructions(Instruction* inst0, Register reg,
-+                                        uint64_t value) {
-+  Instruction* i1 = inst0->next();
-+  Instruction* i2 = i1->next();
-+  Instruction* i3 = i2->next();
-+  Instruction* i4 = i3->next();
-+  Instruction* i5 = i4->next();
-+  Instruction* i6 = i5->next();
-+  Instruction* i7 = i6->next();
-+
-+  if (HasPOWER9()) {
-+    // [0] addpcis rD, 0   (DX-form: opcode=19, XO=2, all D fields = 0)
-+    inst0->setData(0x4C000004u | (uint32_t(reg.code()) << 21));
-+    // [1] ld rD, 20(rD)   (rD := *(slot[1] + 20) = *(slot[6]) = .quad)
-+    i1->setData(PPC_ld | (uint32_t(reg.code()) << 21) |
-+                (uint32_t(reg.code()) << 16) | 20);
-+    // [2] b .+24          (skip slots [3..7] to land at slot [8])
-+    i2->setData(PPC_b | (24 & 0x03FFFFFC));
-+    // [3..5] NOP filler — unreachable but kept aligned for the patcher.
-+    i3->setData(NopInst);
-+    i4->setData(NopInst);
-+    i5->setData(NopInst);
-+  } else {
-+    // [0] mflr r0
-+    inst0->setData(PPC_mfspr | (r0.code() << 21) | PPC_SPR(spr_lr));
-+    // [1] bcl 20,0,.+4
-+    i1->setData(PPC_bcl_always_plus4);
-+    // [2] mflr rD
-+    i2->setData(PPC_mfspr | (reg.code() << 21) | PPC_SPR(spr_lr));
-+    // [3] mtlr r0
-+    i3->setData(PPC_mtspr | (r0.code() << 21) | PPC_SPR(spr_lr));
-+    // [4] ld rD, 16(rD)
-+    i4->setData(PPC_ld | (reg.code() << 21) | (reg.code() << 16) | 16);
-+    // [5] b .+12
-+    i5->setData(PPC_b | (12 & 0x03FFFFFC));
-+  }
-+
-+  // [6..7] .quad VALUE (low 32 at lower addr, high 32 at higher addr).
-+  i6->setData((uint32_t)(value & 0xFFFFFFFF));
-+  i7->setData((uint32_t)(value >> 32));
-+}
-+
-+/* static */
-+uint64_t Assembler::ExtractLoad64Value(Instruction* inst0) {
-+  // The 8-byte value is at inst0[6..7] in both shapes.
-+  Instruction* i6 = inst0 + 6;
-+  Instruction* i7 = inst0 + 7;
-+
-+  uint64_t lo = (uint64_t)i6->encode();  // low 32 at lower addr
-+  uint64_t hi = (uint64_t)i7->encode();  // high 32 at higher addr
-+  return (hi << 32) | lo;
-+}
-+
-+/* static */
-+void Assembler::UpdateLoad64Value(Instruction* inst0, uint64_t value) {
-+  // Sanity-check that inst0 is the start of a load64 stanza in either shape.
-+  // P8: inst0[1] == bcl 20,0,.+4. P9+: inst0[0] is addpcis (major opcode 19).
-+  MOZ_ASSERT(inst0[1].encode() == PPC_bcl_always_plus4 ||
-+                 IsAddpcisLoad64Stanza(inst0->encode()),
-+             "UpdateLoad64Value: inst0 is not a load64 stanza");
-+
-+  // .quad lives at inst0[6..7] in both shapes.
-+  Instruction* i6 = inst0 + 6;
-+  Instruction* i7 = inst0 + 7;
-+
-+  i6->setData((uint32_t)(value & 0xFFFFFFFF));  // low 32 at lower addr
-+  i7->setData((uint32_t)(value >> 32));         // high 32 at higher addr
-+}
-+
-+// ========================================================================
-+// Patching and toggle operations.
-+// ========================================================================
-+
-+/* static */
-+uint32_t Assembler::PatchWrite_NearCallSize() {
-+  // 8 instructions for Load64 + mtctr + bctrl = 10 instructions.
-+  return 10 * sizeof(uint32_t);
-+}
-+
-+/* static */
-+void Assembler::PatchWrite_NearCall(CodeLocationLabel start,
-+                                    CodeLocationLabel toCall) {
-+  Instruction* inst = (Instruction*)start.raw();
-+  uint8_t* dest = toCall.raw();
-+
-+  Assembler::WriteLoad64Instructions(inst, SavedScratchRegister,
-+                                     (uint64_t)dest);
-+  inst[8].makeOp_mtctr(SavedScratchRegister);
-+  inst[9].makeOp_bctr(LinkB);
-+  FlushICache(inst, 10 * sizeof(Instruction));
-+}
-+
-+/* static */
-+void Assembler::PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm) {
-+  uint32_t* l = (uint32_t*)label.raw();
-+  *(l - 1) = imm.value;
-+  FlushICache(l - 1, sizeof(uint32_t));
-+}
-+
-+void Assembler::PatchDataWithValueCheck(CodeLocationLabel label,
-+                                        ImmPtr newValue, ImmPtr expectedValue) {
-+  PatchDataWithValueCheck(label, PatchedImmPtr(newValue.value),
-+                          PatchedImmPtr(expectedValue.value));
-+}
-+
-+void Assembler::PatchDataWithValueCheck(CodeLocationLabel label,
-+                                        PatchedImmPtr newValue,
-+                                        PatchedImmPtr expectedValue) {
-+  Instruction* inst = (Instruction*)label.raw();
-+
-+  DebugOnly<uint64_t> value = Assembler::ExtractLoad64Value(inst);
-+  MOZ_ASSERT(value == uint64_t(expectedValue.value));
-+
-+  Assembler::UpdateLoad64Value(inst, uint64_t(newValue.value));
-+  FlushICache(inst, 8 * sizeof(Instruction));
-+}
-+
-+// ToggleCall toggles the call portion of a toggledCall stanza.
-+// Layout: 8 load64 instructions + mtctr + bctrl (10 total).
-+// We toggle the last two instructions (mtctr/bctrl vs nop/nop).
-+// The destination register is extracted via Load64StanzaDestReg, which
-+// handles both the P8 (mflr-rD at slot [2]) and P9+ (addpcis-rD at slot
-+// [0]) shapes.
-+
-+/* static */
-+void Assembler::ToggleCall(CodeLocationLabel inst_, bool enabled) {
-+  Instruction* i0 = (Instruction*)inst_.raw();
-+  Instruction* i8 = (Instruction*)(inst_.raw() + 8 * sizeof(uint32_t));
-+  Instruction* i9 = (Instruction*)(inst_.raw() + 9 * sizeof(uint32_t));
-+
-+  // Accept either P8 stanza (mflr r0 at slot [0]) or P9+ stanza (addpcis at
-+  // slot [0]; major opcode 19).
-+  MOZ_ASSERT(i0->encode() == (PPC_mfspr | (r0.code() << 21) | PPC_SPR(spr_lr)) ||
-+                 IsAddpcisLoad64Stanza(i0->encode()));
-+
-+  // ToggleCall is idempotent across the same `enabled` value: re-enabling
-+  // an already-enabled site (or re-disabling a disabled one) is a no-op.
-+  // Mozilla's debugger machinery may legitimately toggle the same call site
-+  // multiple times in the same direction (e.g. setting both a breakpoint
-+  // and a frame.onStep on the same script).
-+  Register scratch = Register::FromCode(Load64StanzaDestReg(i0));
-+  uint32_t mtctr = PPC_mtspr | (scratch.code() << 21) | PPC_SPR(spr_ctr);
-+  uint32_t bctrl = (uint32_t)PPC_bctr | (uint32_t)LinkB;
-+  if (enabled) {
-+    MOZ_ASSERT(i8->encode() == NopInst || i8->encode() == mtctr);
-+    MOZ_ASSERT(i9->encode() == NopInst || i9->encode() == bctrl);
-+    i8->setData(mtctr);
-+    i9->setData(bctrl);
-+  } else {
-+    MOZ_ASSERT(i8->encode() == NopInst || i8->encode() == mtctr);
-+    MOZ_ASSERT(i9->encode() == NopInst || i9->encode() == bctrl);
-+    i8->setData(NopInst);
-+    i9->setData(NopInst);
-+  }
-+  FlushICache(i8, 2 * sizeof(Instruction));
-+}
-+
-+// toggledJump emits a trap stanza via jump(label). After binding, the first
-+// instruction becomes "b offset" (short branch). We toggle between b and ori:
-+//   b offset:       [010010][LI:24][0][0]
-+//   ori r0,r0,imm:  [011000][00000][00000][UI:16]
-+// For short forward jumps (offset < 64KB), bits 25:16 of LI are 0, so
-+// swapping the opcode preserves the offset in the lower 16 bits.
-+// ori r0,r0,X is effectively a nop (writes to r0).
-+
-+/* static */
-+void Assembler::ToggleToJmp(CodeLocationLabel inst_) {
-+  Instruction* inst = (Instruction*)inst_.raw();
-+  MOZ_ASSERT(inst->isOpcode(PPC_ori));
-+  // Verify RS=0 and RA=0 (r0).
-+  MOZ_ASSERT((inst->encode() & 0x03E00000) == 0);
-+  MOZ_ASSERT((inst->encode() & 0x001F0000) == 0);
-+  // Swap opcode from ori (011000) to b (010010).
-+  uint32_t encoding = inst->encode();
-+  encoding = (encoding & 0x03FFFFFF) | (uint32_t)PPC_b;
-+  inst->setData(encoding);
-+  FlushICache(inst, sizeof(Instruction));
-+}
-+
-+/* static */
-+void Assembler::ToggleToCmp(CodeLocationLabel inst_) {
-+  Instruction* inst = (Instruction*)inst_.raw();
-+  MOZ_ASSERT(inst->isOpcode(PPC_b));
-+  // Verify short forward branch: upper LI bits (25:16) are 0, AA=0, LK=0.
-+  MOZ_ASSERT((inst->encode() & 0x03FF0003) == 0);
-+  // Swap opcode from b (010010) to ori (011000).
-+  uint32_t encoding = inst->encode();
-+  encoding = (encoding & 0x03FFFFFF) | (uint32_t)PPC_ori;
-+  inst->setData(encoding);
-+  FlushICache(inst, sizeof(Instruction));
-+}
-+
-+// ========================================================================
-+// Bind, tracing, and pointer extraction.
-+// ========================================================================
-+
-+void Assembler::Bind(uint8_t* rawCode, const CodeLabel& label) {
-+  if (label.patchAt().bound()) {
-+    auto mode = label.linkMode();
-+    intptr_t offset = label.patchAt().offset();
-+    intptr_t target = label.target().offset();
-+
-+    if (mode == CodeLabel::RawPointer) {
-+      *reinterpret_cast<const void**>(rawCode + offset) = rawCode + target;
-+    } else {
-+      MOZ_ASSERT(mode == CodeLabel::MoveImmediate ||
-+                 mode == CodeLabel::JumpImmediate);
-+      Instruction* inst = (Instruction*)(rawCode + offset);
-+      Assembler::UpdateLoad64Value(inst, (uint64_t)(rawCode + target));
-+    }
-+  }
-+}
-+
-+uintptr_t Assembler::GetPointer(uint8_t* instPtr) {
-+  Instruction* inst = (Instruction*)instPtr;
-+  return Assembler::ExtractLoad64Value(inst);
-+}
-+
-+static JitCode* CodeFromJump(Instruction* jump) {
-+  uint8_t* target = (uint8_t*)Assembler::ExtractLoad64Value(jump);
-+  return JitCode::FromExecutable(target);
-+}
-+
-+void Assembler::TraceJumpRelocations(JSTracer* trc, JitCode* code,
-+                                     CompactBufferReader& reader) {
-+  while (reader.more()) {
-+    JitCode* child =
-+        CodeFromJump((Instruction*)(code->raw() + reader.readUnsigned()));
-+    TraceManuallyBarrieredEdge(trc, &child, "rel32");
-+  }
-+}
-+
-+static void TraceOneDataRelocation(JSTracer* trc,
-+                                   mozilla::Maybe<AutoWritableJitCode>& awjc,
-+                                   JitCode* code, Instruction* inst) {
-+  void* ptr = (void*)Assembler::ExtractLoad64Value(inst);
-+  void* prior = ptr;
-+
-+  uintptr_t word = reinterpret_cast<uintptr_t>(ptr);
-+  if (word >> JSVAL_TAG_SHIFT) {
-+    Value v = Value::fromRawBits(word);
-+    TraceManuallyBarrieredEdge(trc, &v, "jit-masm-value");
-+    ptr = (void*)v.bitsAsPunboxPointer();
-+  } else {
-+    TraceManuallyBarrieredGenericPointerEdge(
-+        trc, reinterpret_cast<gc::Cell**>(&ptr), "jit-masm-ptr");
-+  }
-+
-+  if (ptr != prior) {
-+    if (awjc.isNothing()) {
-+      awjc.emplace(code);
-+    }
-+    Assembler::UpdateLoad64Value(inst, uint64_t(ptr));
-+  }
-+}
-+
-+/* static */
-+void Assembler::TraceDataRelocations(JSTracer* trc, JitCode* code,
-+                                     CompactBufferReader& reader) {
-+  mozilla::Maybe<AutoWritableJitCode> awjc;
-+  while (reader.more()) {
-+    size_t offset = reader.readUnsigned();
-+    Instruction* inst = (Instruction*)(code->raw() + offset);
-+    TraceOneDataRelocation(trc, awjc, code, inst);
-+  }
-+}
-+
-+/* static */
-+uint8_t* Assembler::NextInstruction(uint8_t* instruction, uint32_t* count) {
-+  if (count != nullptr) {
-+    *count += sizeof(Instruction);
-+  }
-+  return instruction + sizeof(Instruction);
-+}
-+
-+// ========================================================================
-+// UseScratchRegisterScope implementation.
-+// ========================================================================
-+
-+UseScratchRegisterScope::UseScratchRegisterScope(Assembler& assembler)
-+    : available_(assembler.GetScratchRegisterList()),
-+      old_available_(*available_) {}
-+
-+UseScratchRegisterScope::UseScratchRegisterScope(Assembler* assembler)
-+    : available_(assembler->GetScratchRegisterList()),
-+      old_available_(*available_) {}
-+
-+UseScratchRegisterScope::~UseScratchRegisterScope() {
-+  *available_ = old_available_;
-+}
-+
-+Register UseScratchRegisterScope::Acquire() {
-+  MOZ_ASSERT(available_ != nullptr);
-+  MOZ_ASSERT(!available_->empty());
-+  Register index = GeneralRegisterSet::FirstRegister(available_->bits());
-+  available_->takeRegisterIndex(index);
-+  return index;
-+}
-+
-+void UseScratchRegisterScope::Release(const Register& reg) {
-+  MOZ_ASSERT(available_ != nullptr);
-+  MOZ_ASSERT(old_available_.hasRegisterIndex(reg));
-+  MOZ_ASSERT(!available_->hasRegisterIndex(reg));
-+  Include(GeneralRegisterSet(1 << reg.code()));
-+}
-+
-+bool UseScratchRegisterScope::hasAvailable() const {
-+  return (available_->size()) != 0;
-+}
-diff --git a/js/src/jit/ppc64/Assembler-ppc64.h b/js/src/jit/ppc64/Assembler-ppc64.h
-new file mode 100644
-index 000000000000..60e84bf71cf7
---- /dev/null
-+++ b/js/src/jit/ppc64/Assembler-ppc64.h
-@@ -0,0 +1,2114 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_Assembler_ppc64_h
-+#define jit_ppc64_Assembler_ppc64_h
-+
-+#include "jit/CompactBuffer.h"
-+#include "jit/JitCode.h"
-+#include "jit/JitSpewer.h"
-+#include "jit/ppc64/Architecture-ppc64.h"
-+#include "jit/shared/Assembler-shared.h"
-+#include "jit/shared/Disassembler-shared.h"
-+#include "jit/shared/IonAssemblerBuffer.h"
-+#include "jit/shared/IonAssemblerBufferWithConstantPools.h"
-+#include "wasm/WasmTypeDecls.h"
-+
-+namespace js {
-+namespace jit {
-+
-+// GPR register constants.
-+static constexpr Register r0{Registers::r0};
-+static constexpr Register r1{Registers::r1};
-+static constexpr Register r2{Registers::r2};
-+static constexpr Register r3{Registers::r3};
-+static constexpr Register r4{Registers::r4};
-+static constexpr Register r5{Registers::r5};
-+static constexpr Register r6{Registers::r6};
-+static constexpr Register r7{Registers::r7};
-+static constexpr Register r8{Registers::r8};
-+static constexpr Register r9{Registers::r9};
-+static constexpr Register r10{Registers::r10};
-+static constexpr Register r11{Registers::r11};
-+static constexpr Register r12{Registers::r12};
-+static constexpr Register r13{Registers::r13};
-+static constexpr Register r14{Registers::r14};
-+static constexpr Register r15{Registers::r15};
-+static constexpr Register r16{Registers::r16};
-+static constexpr Register r17{Registers::r17};
-+static constexpr Register r18{Registers::r18};
-+static constexpr Register r19{Registers::r19};
-+static constexpr Register r20{Registers::r20};
-+static constexpr Register r21{Registers::r21};
-+static constexpr Register r22{Registers::r22};
-+static constexpr Register r23{Registers::r23};
-+static constexpr Register r24{Registers::r24};
-+static constexpr Register r25{Registers::r25};
-+static constexpr Register r26{Registers::r26};
-+static constexpr Register r27{Registers::r27};
-+static constexpr Register r28{Registers::r28};
-+static constexpr Register r29{Registers::r29};
-+static constexpr Register r30{Registers::r30};
-+static constexpr Register r31{Registers::r31};
-+
-+// FPR register constants.
-+static constexpr FloatRegister f0{FloatRegisters::f0, FloatRegisters::Double};
-+static constexpr FloatRegister f1{FloatRegisters::f1, FloatRegisters::Double};
-+static constexpr FloatRegister f2{FloatRegisters::f2, FloatRegisters::Double};
-+static constexpr FloatRegister f3{FloatRegisters::f3, FloatRegisters::Double};
-+static constexpr FloatRegister f4{FloatRegisters::f4, FloatRegisters::Double};
-+static constexpr FloatRegister f5{FloatRegisters::f5, FloatRegisters::Double};
-+static constexpr FloatRegister f6{FloatRegisters::f6, FloatRegisters::Double};
-+static constexpr FloatRegister f7{FloatRegisters::f7, FloatRegisters::Double};
-+static constexpr FloatRegister f8{FloatRegisters::f8, FloatRegisters::Double};
-+static constexpr FloatRegister f9{FloatRegisters::f9, FloatRegisters::Double};
-+static constexpr FloatRegister f10{FloatRegisters::f10, FloatRegisters::Double};
-+static constexpr FloatRegister f11{FloatRegisters::f11, FloatRegisters::Double};
-+static constexpr FloatRegister f12{FloatRegisters::f12, FloatRegisters::Double};
-+static constexpr FloatRegister f13{FloatRegisters::f13, FloatRegisters::Double};
-+static constexpr FloatRegister f14{FloatRegisters::f14, FloatRegisters::Double};
-+static constexpr FloatRegister f15{FloatRegisters::f15, FloatRegisters::Double};
-+static constexpr FloatRegister f16{FloatRegisters::f16, FloatRegisters::Double};
-+static constexpr FloatRegister f17{FloatRegisters::f17, FloatRegisters::Double};
-+static constexpr FloatRegister f18{FloatRegisters::f18, FloatRegisters::Double};
-+static constexpr FloatRegister f19{FloatRegisters::f19, FloatRegisters::Double};
-+static constexpr FloatRegister f20{FloatRegisters::f20, FloatRegisters::Double};
-+static constexpr FloatRegister f21{FloatRegisters::f21, FloatRegisters::Double};
-+static constexpr FloatRegister f22{FloatRegisters::f22, FloatRegisters::Double};
-+static constexpr FloatRegister f23{FloatRegisters::f23, FloatRegisters::Double};
-+static constexpr FloatRegister f24{FloatRegisters::f24, FloatRegisters::Double};
-+static constexpr FloatRegister f25{FloatRegisters::f25, FloatRegisters::Double};
-+static constexpr FloatRegister f26{FloatRegisters::f26, FloatRegisters::Double};
-+static constexpr FloatRegister f27{FloatRegisters::f27, FloatRegisters::Double};
-+static constexpr FloatRegister f28{FloatRegisters::f28, FloatRegisters::Double};
-+static constexpr FloatRegister f29{FloatRegisters::f29, FloatRegisters::Double};
-+static constexpr FloatRegister f30{FloatRegisters::f30, FloatRegisters::Double};
-+static constexpr FloatRegister f31{FloatRegisters::f31, FloatRegisters::Double};
-+
-+static constexpr Register InvalidReg{Registers::Invalid};
-+static constexpr FloatRegister InvalidFloatReg;
-+
-+static constexpr Register StackPointer = r1;
-+static constexpr Register FramePointer = r31;
-+static constexpr Register ReturnReg = r3;
-+static constexpr Register64 ReturnReg64(ReturnReg);
-+static constexpr FloatRegister ReturnFloat32Reg{FloatRegisters::f1,
-+                                                FloatRegisters::Single};
-+static constexpr FloatRegister ReturnDoubleReg = f1;
-+static constexpr FloatRegister ReturnSimd128Reg{FloatRegisters::f1,
-+                                                FloatRegisters::Simd128};
-+
-+// r16 is non-volatile and non-allocatable, used as a saved scratch.
-+static constexpr Register SavedScratchRegister = r16;
-+
-+static constexpr Register SecondScratchReg = r12;
-+
-+static constexpr FloatRegister ScratchFloat32Reg{FloatRegisters::f0,
-+                                                 FloatRegisters::Single};
-+static constexpr FloatRegister ScratchDoubleReg = f0;
-+static constexpr FloatRegister ScratchSimd128Reg{FloatRegisters::f0,
-+                                                 FloatRegisters::Simd128};
-+
-+struct ScratchFloat32Scope : public AutoFloatRegisterScope {
-+  explicit ScratchFloat32Scope(MacroAssembler& masm)
-+      : AutoFloatRegisterScope(masm, ScratchFloat32Reg) {}
-+};
-+
-+struct ScratchDoubleScope : public AutoFloatRegisterScope {
-+  explicit ScratchDoubleScope(MacroAssembler& masm)
-+      : AutoFloatRegisterScope(masm, ScratchDoubleReg) {}
-+};
-+
-+// PPC64: ScratchSimd128Scope is a simple register wrapper, NOT a scoped
-+// acquire/release. On PPC64, ScratchSimd128Reg is v0 (VSR32; encoded as
-+// {FloatRegisters::f0, Simd128} so encoding() = 0 + 32 = 32) — distinct
-+// from ScratchDoubleReg = f0 (VSR0). It is non-allocatable and always
-+// available. Many SIMD functions call other SIMD functions that also need
-+// v0, creating nested "scopes". Using AutoFloatRegisterScope would assert
-+// on double-acquire in debug builds. Since v0 is never allocated by the
-+// register allocator, nesting is safe.
-+struct ScratchSimd128Scope : public FloatRegister {
-+  explicit ScratchSimd128Scope(MacroAssembler&)
-+      : FloatRegister(ScratchSimd128Reg) {}
-+};
-+
-+class Assembler;
-+
-+class UseScratchRegisterScope {
-+ public:
-+  explicit UseScratchRegisterScope(Assembler& assembler);
-+  explicit UseScratchRegisterScope(Assembler* assembler);
-+  ~UseScratchRegisterScope();
-+
-+  Register Acquire();
-+  void Release(const Register& reg);
-+  bool hasAvailable() const;
-+  void Include(const GeneralRegisterSet& list) {
-+    *available_ = GeneralRegisterSet::Union(*available_, list);
-+  }
-+  void Exclude(const GeneralRegisterSet& list) {
-+    *available_ = GeneralRegisterSet::Subtract(*available_, list);
-+  }
-+
-+ private:
-+  GeneralRegisterSet* available_;
-+  GeneralRegisterSet old_available_;
-+};
-+
-+static constexpr Register OsrFrameReg = r6;
-+static constexpr Register PreBarrierReg = r4;
-+static constexpr Register InterpreterPCReg = r17;
-+
-+static constexpr Register CallTempReg0 = r4;
-+static constexpr Register CallTempReg1 = r9;
-+static constexpr Register CallTempReg2 = r10;
-+static constexpr Register CallTempReg3 = r7;
-+// CallTempReg4 must NOT be JSReturnReg (r5): LMegamorphicLoadSlotPermissive
-+// uses tempFixed(CallTempReg4) for a saved obj pointer AND defineReturn
-+// (JSReturnOperand=r5) for output. If they alias, the megamorphic cache
-+// lookup clobbers the saved obj, corrupting the 'this' pointer.
-+static constexpr Register CallTempReg4 = r8;
-+static constexpr Register CallTempReg5 = r6;
-+
-+// PPC64 ELFv2 has no volatile non-arg GPRs (r3-r10 are all arg regs).
-+// Use allocatable non-volatile registers as overflow temps.
-+static constexpr Register CallTempNonArgRegs[] = {r14, r15};
-+static const uint32_t NumCallTempNonArgRegs = std::size(CallTempNonArgRegs);
-+
-+static constexpr Register IntArgReg0 = r3;
-+static constexpr Register IntArgReg1 = r4;
-+static constexpr Register IntArgReg2 = r5;
-+static constexpr Register IntArgReg3 = r6;
-+static constexpr Register IntArgReg4 = r7;
-+static constexpr Register IntArgReg5 = r8;
-+static constexpr Register IntArgReg6 = r9;
-+static constexpr Register IntArgReg7 = r10;
-+
-+// Registers used by RegExpMatcher and RegExpExecMatch stubs.
-+static constexpr Register RegExpMatcherRegExpReg = CallTempReg0;
-+static constexpr Register RegExpMatcherStringReg = CallTempReg1;
-+static constexpr Register RegExpMatcherLastIndexReg = CallTempReg2;
-+
-+// Registers used by RegExpExecTest stub (do not use ReturnReg).
-+static constexpr Register RegExpExecTestRegExpReg = CallTempReg0;
-+static constexpr Register RegExpExecTestStringReg = CallTempReg1;
-+
-+// Registers used by RegExpSearcher stub (do not use ReturnReg).
-+static constexpr Register RegExpSearcherRegExpReg = CallTempReg0;
-+static constexpr Register RegExpSearcherStringReg = CallTempReg1;
-+static constexpr Register RegExpSearcherLastIndexReg = CallTempReg2;
-+
-+static constexpr Register JSReturnReg_Type = r6;
-+static constexpr Register JSReturnReg_Data = r5;
-+static constexpr Register JSReturnReg = r5;
-+static constexpr ValueOperand JSReturnOperand = ValueOperand(JSReturnReg);
-+
-+static constexpr Register ABINonArgReg0 = r19;
-+static constexpr Register ABINonArgReg1 = r20;
-+static constexpr Register ABINonArgReg2 = r21;
-+static constexpr Register ABINonArgReg3 = r22;
-+static constexpr Register ABINonArgReturnReg0 = r29;
-+static constexpr Register ABINonArgReturnReg1 = r30;
-+static constexpr Register ABINonVolatileReg = r14;
-+static constexpr Register ABINonArgReturnVolatileReg = r11;
-+
-+static constexpr FloatRegister ABINonArgDoubleReg{FloatRegisters::f14,
-+                                                  FloatRegisters::Double};
-+
-+// Wasm instance pointer register. Preserved across wasm function calls.
-+static constexpr Register InstanceReg = r18;
-+static constexpr Register HeapReg = r24;
-+static constexpr Register GlobalReg = r23;
-+
-+// Wasm table call registers.
-+static constexpr Register WasmTableCallScratchReg0 = ABINonArgReg0;
-+static constexpr Register WasmTableCallScratchReg1 = ABINonArgReg1;
-+static constexpr Register WasmTableCallSigReg = ABINonArgReg2;
-+static constexpr Register WasmTableCallIndexReg = ABINonArgReg3;
-+
-+// Wasm ref call registers.
-+static constexpr Register WasmCallRefCallScratchReg0 = ABINonArgReg0;
-+static constexpr Register WasmCallRefCallScratchReg1 = ABINonArgReg1;
-+static constexpr Register WasmCallRefCallScratchReg2 = ABINonArgReg2;
-+static constexpr Register WasmCallRefReg = ABINonArgReg3;
-+
-+// Wasm tail call scratch registers.
-+// WasmTailCallRAScratchReg must NOT be ABINonArgReg0: the shared tail-call
-+// code (wasmReturnCallImport, wasmReturnCallIndirect, wasmReturnCallRef)
-+// stores the callee address in ABINonArgReg0, and CollapseWasmFrame*
-+// overwrites tempForRA. On architectures with a GPR link register (ARM,
-+// MIPS, LA64, RISC-V) this is ra/lr. PPC64's LR is an SPR, so we use r14
-+// (ABINonVolatileReg) which is callee-saved and not used in call setup.
-+static constexpr Register WasmTailCallInstanceScratchReg = ABINonArgReg1;
-+static constexpr Register WasmTailCallRAScratchReg = ABINonVolatileReg;
-+static constexpr Register WasmTailCallFPScratchReg = ABINonArgReg3;
-+
-+// Register used as a scratch along the return path in the fast js -> wasm stub
-+// code. Must not overlap ReturnReg, JSReturnOperand, or InstanceReg.
-+// Must be volatile.
-+static constexpr Register WasmJitEntryReturnScratch = r10;
-+
-+static constexpr uint32_t ABIStackAlignment = 16;
-+static constexpr uint32_t CodeAlignment = 16;
-+static constexpr uint32_t JitStackAlignment = 16;
-+
-+static constexpr uint32_t JitStackValueAlignment =
-+    JitStackAlignment / sizeof(Value);
-+static_assert(JitStackAlignment % sizeof(Value) == 0 &&
-+                  JitStackValueAlignment >= 1,
-+              "Stack alignment should be a non-zero multiple of sizeof(Value)");
-+
-+static constexpr uint32_t SimdMemoryAlignment = 16;
-+static_assert(
-+    CodeAlignment % SimdMemoryAlignment == 0,
-+    "Code alignment should be larger than any of the alignments "
-+    "which are used for the constant sections of the code buffer. "
-+    "Thus it should be larger than the alignment for SIMD constants.");
-+
-+static constexpr uint32_t WasmStackAlignment = SimdMemoryAlignment;
-+static const uint32_t WasmTrapInstructionLength = 4;
-+
-+static constexpr uint32_t WasmCheckedCallEntryOffset = 0u;
-+static constexpr uint32_t WasmCheckedTailEntryOffset = 32u;
-+
-+static constexpr Scale ScalePointer = TimesEight;
-+
-+class ABIArgGenerator : public ABIArgGeneratorShared {
-+ public:
-+  explicit ABIArgGenerator(ABIKind kind)
-+      : ABIArgGeneratorShared(kind),
-+        intRegIndex_(0),
-+        floatRegIndex_(0),
-+        current_() {
-+    // PPC64 ELFv2 ABI: the callee saves LR, CR, TOC into the caller's
-+    // frame (offsets 8, 16, 24 from caller SP). Reserve 32 bytes so that
-+    // callWithABIPre always allocates enough space for this link area.
-+    stackOffset_ += ShadowStackSpace;
-+  }
-+
-+  ABIArg next(MIRType argType);
-+  ABIArg& current() { return current_; }
-+
-+ protected:
-+  unsigned intRegIndex_;
-+  unsigned floatRegIndex_;
-+  ABIArg current_;
-+};
-+
-+static constexpr uint32_t NumIntArgRegs = 8;
-+static constexpr uint32_t NumFloatArgRegs = 13;
-+
-+static inline bool GetIntArgReg(uint32_t usedIntArgs, Register* out) {
-+  if (usedIntArgs < NumIntArgRegs) {
-+    *out = Register::FromCode(r3.code() + usedIntArgs);
-+    return true;
-+  }
-+  return false;
-+}
-+
-+static inline bool GetFloatArgReg(uint32_t usedFloatArgs, FloatRegister* out) {
-+  if (usedFloatArgs < NumFloatArgRegs) {
-+    *out = FloatRegister::FromCode(f1.code() + usedFloatArgs);
-+    return true;
-+  }
-+  return false;
-+}
-+
-+static inline bool GetTempRegForIntArg(uint32_t usedIntArgs,
-+                                       uint32_t usedFloatArgs, Register* out) {
-+  MOZ_ASSERT(usedFloatArgs == 0);
-+
-+  if (GetIntArgReg(usedIntArgs, out)) {
-+    return true;
-+  }
-+
-+  usedIntArgs -= NumIntArgRegs;
-+  if (usedIntArgs >= NumCallTempNonArgRegs) {
-+    return false;
-+  }
-+  *out = CallTempNonArgRegs[usedIntArgs];
-+  return true;
-+}
-+
-+// PPC64 instruction field positions.
-+// PPC uses big-endian bit numbering (bit 0 = MSB), but we store instructions
-+// in a uint32_t where bit 0 = LSB. The shifts below are in LSB-0 terms.
-+//
-+//   [0:5]  primary opcode   (OpcodeShift=26)
-+//   [6:10]  RT/RS/BF/TO     (RTShift=21, 5 bits)
-+//   [11:15] RA/BI           (RAShift=16, 5 bits)
-+//   [16:20] RB/SH           (RBShift=11, 5 bits)
-+//   [16:31] SI/UI/D         (Imm16Shift=0, 16 bits)
-+//   [21:25] subop bits      (varies)
-+//   [21:30] XO              (X-form; A/M/MD/MDS narrower)
-+//   [31]    Rc bit          (RcShift=0)
-+
-+static const uint32_t OpcodeShift = 26;
-+static const uint32_t OpcodeBits = 6;
-+
-+static const uint32_t RTShift = 21;
-+static const uint32_t RTBits = 5;
-+static const uint32_t RSShift = 21;
-+static const uint32_t RSBits = 5;
-+static const uint32_t RAShift = 16;
-+static const uint32_t RABits = 5;
-+static const uint32_t RBShift = 11;
-+static const uint32_t RBBits = 5;
-+static const uint32_t RCShift = 6;
-+static const uint32_t RCBits = 5;
-+
-+static const uint32_t BOShift = 21;
-+static const uint32_t BOBits = 5;
-+static const uint32_t BIShift = 16;
-+static const uint32_t BIBits = 5;
-+
-+static const uint32_t Imm16Shift = 0;
-+static const uint32_t Imm16Bits = 16;
-+
-+static const uint32_t RcShift = 0;
-+static const uint32_t RcBit = 1;
-+
-+static const uint32_t RTMask = ((1 << RTBits) - 1) << RTShift;
-+static const uint32_t RSMask = ((1 << RSBits) - 1) << RSShift;
-+static const uint32_t RAMask = ((1 << RABits) - 1) << RAShift;
-+static const uint32_t RBMask = ((1 << RBBits) - 1) << RBShift;
-+static const uint32_t Imm16Mask = (1 << Imm16Bits) - 1;
-+static const uint32_t RegMask = (1 << RTBits) - 1;
-+
-+static inline uint32_t RT(Register r) { return (uint32_t)r.code() << RTShift; }
-+static inline uint32_t RT(FloatRegister r) {
-+  return (uint32_t)r.code() << RTShift;
-+}
-+static inline uint32_t RS(Register r) { return (uint32_t)r.code() << RSShift; }
-+static inline uint32_t RS(FloatRegister r) {
-+  return (uint32_t)r.code() << RSShift;
-+}
-+static inline uint32_t RA(Register r) { return (uint32_t)r.code() << RAShift; }
-+static inline uint32_t RA(FloatRegister r) {
-+  return (uint32_t)r.code() << RAShift;
-+}
-+static inline uint32_t RB(Register r) { return (uint32_t)r.code() << RBShift; }
-+static inline uint32_t RB(FloatRegister r) {
-+  return (uint32_t)r.code() << RBShift;
-+}
-+
-+// SPR encoding: the SPR number is split across bits 11-15 and 16-20 in a
-+// swapped arrangement.  PPC_SPR(x) produces the value to OR into an
-+// mtspr/mfspr instruction at the RB+RA position (bits 11-20).
-+#define PPC_SPR(x) ((((int)(x) >> 5) & 0x1f) << 11 | ((int)(x) & 0x1f) << 16)
-+
-+enum PPCOpcodes {
-+  PPC_add = 0x7C000214,
-+  PPC_addc = 0x7C000014,
-+  PPC_adde = 0x7C000114,
-+  PPC_addi = 0x38000000,
-+  PPC_addis = 0x3C000000,
-+  PPC_and_ = 0x7C000038,
-+  // andi. is always record form (no non-record andi exists).
-+  PPC_andi_dot = 0x70000000,
-+  PPC_b = 0x48000000,
-+  PPC_bc = 0x40000000,
-+  // Encoded "bcl 20, lt, $+4": PC-relative branch-and-link by 4 bytes
-+  // (land at the next instruction) with BO=20 (branch always); BI=0
-+  // (=lt) is don't-care because BO=20 forces the branch. Used by
-+  // PoolLoadFPR{32,64}'s POWER8 stanza and PoolLoadSimd128's stanza to
-+  // seed LR with the current PC for the subsequent mflr+ld base
-+  // computation. Used by patch sites that write raw instruction memory
-+  // (PatchConstantPoolLoad, WriteLoad64Instructions, etc.). Named for
-+  // grep-ability and to avoid magic-number copies.
-+  PPC_bcl_always_plus4 = 0x42800005,
-+  PPC_bctr = 0x4E800420,
-+  PPC_bcctr = 0x4C000420,
-+  PPC_blr = 0x4E800020,
-+  PPC_cmpd = 0x7C200000,
-+  PPC_cmpdi = 0x2C200000,
-+  PPC_cmpld = 0x7C200040,
-+  PPC_cmpldi = 0x28200000,
-+  PPC_cmpw = 0x7C000000,
-+  PPC_cmpwi = 0x2C000000,
-+  PPC_cmplw = 0x7C000040,
-+  PPC_cmplwi = 0x28000000,
-+  PPC_cntlzd = 0x7C000074,
-+  PPC_cntlzw = 0x7C000034,
-+  PPC_cnttzd = 0x7C000474,
-+  PPC_cnttzw = 0x7C000434,
-+  PPC_crandc = 0x4C000102,
-+  PPC_cror = 0x4C000382,
-+  PPC_crorc = 0x4C000342,
-+  PPC_divd = 0x7C0003D2,
-+  PPC_divdu = 0x7C000392,
-+  PPC_divw = 0x7C0003D6,
-+  PPC_divwu = 0x7C000396,
-+  // POWER9 (ISA 3.0) modulo instructions.
-+  PPC_modsd = 0x7C000612,
-+  PPC_modsw = 0x7C000616,
-+  PPC_modud = 0x7C000212,
-+  PPC_moduw = 0x7C000216,
-+  PPC_extsb = 0x7C000774,
-+  PPC_extsh = 0x7C000734,
-+  PPC_extsw = 0x7C0007B4,
-+  PPC_fabs = 0xFC000210,
-+  PPC_fadd = 0xFC00002A,
-+  PPC_fadds = 0xEC00002A,
-+  PPC_fcpsgn = 0xFC000010,
-+  PPC_fcfid = 0xFC00069C,
-+  PPC_fcfids = 0xEC00069C,
-+  PPC_fcfidu = 0xFC00079C,
-+  PPC_fcfidus = 0xEC00079C,
-+  PPC_fcmpu = 0xFC000000,
-+  PPC_fctid = 0xFC00065C,
-+  PPC_fctidz = 0xFC00065E,
-+  PPC_fctiduz = 0xFC00075E,
-+  PPC_fctiwz = 0xFC00001E,
-+  PPC_fdiv = 0xFC000024,
-+  PPC_fdivs = 0xEC000024,
-+  PPC_fmr = 0xFC000090,
-+  PPC_fmul = 0xFC000032,
-+  PPC_fmuls = 0xEC000032,
-+  PPC_fneg = 0xFC000050,
-+  PPC_frim = 0xFC0003D0,
-+  PPC_frip = 0xFC000390,
-+  PPC_friz = 0xFC000350,
-+  PPC_frsp = 0xFC000018,
-+  PPC_fsub = 0xFC000028,
-+  PPC_fsubs = 0xEC000028,
-+  PPC_fsqrt = 0xFC00002C,
-+  PPC_fsqrts = 0xEC00002C,
-+  PPC_isel = 0x7C00001E,
-+  // POWER10 (ISA 3.1). RT = (CR[BI]==1) ? 1 : 0. XO=384 at bits 21-30.
-+  PPC_setbc = 0x7C000300,
-+  // POWER10 (ISA 3.1). RT = (CR[BI]==0) ? 1 : 0. XO=416.
-+  PPC_setbcr = 0x7C000340,
-+  PPC_lbarx = 0x7C000068,
-+  PPC_lbz = 0x88000000,
-+  PPC_lbzx = 0x7C0000AE,
-+  PPC_ld = 0xE8000000,
-+  PPC_ldarx = 0x7C0000A8,
-+  PPC_ldx = 0x7C00002A,
-+  PPC_lfd = 0xC8000000,
-+  PPC_lfdx = 0x7C0004AE,
-+  PPC_lfiwax = 0x7C0006AE,
-+  PPC_lfiwzx = 0x7C0006EE,
-+  PPC_lfs = 0xC0000000,
-+  PPC_lfsx = 0x7C00042E,
-+  PPC_lha = 0xA8000000,
-+  PPC_lharx = 0x7C0000E8,
-+  PPC_lhax = 0x7C0002AE,
-+  PPC_lhz = 0xA0000000,
-+  PPC_lhzx = 0x7C00022E,
-+  PPC_lwa = 0xE8000002,
-+  PPC_lwarx = 0x7C000028,
-+  PPC_lwz = 0x80000000,
-+  // X-form sign-extending word load (opcode 31, XO=341). Single-insn
-+  // equivalent of lwzx + extsw.
-+  PPC_lwax = 0x7C0002AA,
-+  PPC_lwzx = 0x7C00002E,
-+  PPC_mcrxrx = 0x7C000480,
-+  PPC_mcrfs = 0xFC000080,
-+  PPC_mfocrf = 0x7C100026,
-+  PPC_mffs = 0xFC00048E,
-+  PPC_mfspr = 0x7C0002A6,
-+  PPC_mfvsrd = 0x7C000066,
-+  PPC_mtcrf = 0x7C000120,
-+  PPC_mtfsb0 = 0xFC00008C,
-+  PPC_mtvsrd = 0x7C000166,
-+  // POWER8+ (ISA 2.07). VSR[XT].dw[0] = sign_ext_64(RA[32:63]).
-+  // XO=211 at bits 21-30. Combines extsw + mtvsrd into one insn.
-+  PPC_mtvsrwa = 0x7C0001A6,
-+  PPC_mtvsrws = 0x7C000326,
-+  PPC_mtvsrwz = 0x7C0001E6,
-+  PPC_mtspr = 0x7C0003A6,
-+  PPC_mulhd = 0x7C000092,
-+  PPC_mulhdu = 0x7C000012,
-+  PPC_mulhwu = 0x7C000016,
-+  PPC_mulli = 0x1C000000,
-+  PPC_mulld = 0x7C0001D2,
-+  PPC_mulldo = 0x7C0005D2,
-+  PPC_mullw = 0x7C0001D6,
-+  PPC_neg = 0x7C0000D0,
-+  PPC_nor = 0x7C0000F8,
-+  PPC_or_ = 0x7C000378,
-+  PPC_ori = 0x60000000,
-+  PPC_oris = 0x64000000,
-+  PPC_popcntb = 0x7C0000F4,
-+  PPC_popcntd = 0x7C0003F4,
-+  PPC_popcntw = 0x7C0002F4,
-+  PPC_brd = 0x7C000176,  // POWER10: byte-reverse doubleword (X-form, XO=187)
-+  PPC_brh = 0x7C0001B6,  // POWER10: byte-reverse each halfword (X-form, XO=219)
-+  PPC_brw = 0x7C000136,  // POWER10: byte-reverse each word     (X-form, XO=155)
-+  PPC_rldcl = 0x78000010,
-+  PPC_rldicl = 0x78000000,
-+  PPC_rldcr = 0x78000012,
-+  PPC_rldicr = 0x78000004,
-+  PPC_rldimi = 0x7800000C,
-+  PPC_rlwimi = 0x50000000,
-+  PPC_rlwinm = 0x54000000,
-+  PPC_rlwnm = 0x5C000000,
-+  PPC_sld = 0x7C000036,
-+  PPC_slw = 0x7C000030,
-+  PPC_srad = 0x7C000634,
-+  PPC_sradi = 0x7C000674,
-+  PPC_sraw = 0x7C000630,
-+  PPC_srawi = 0x7C000670,
-+  PPC_srd = 0x7C000436,
-+  PPC_srw = 0x7C000430,
-+  PPC_stb = 0x98000000,
-+  PPC_stbcx = 0x7C00056D,
-+  PPC_stbx = 0x7C0001AE,
-+  PPC_std = 0xF8000000,
-+  PPC_stdcx = 0x7C0001AD,
-+  PPC_stdu = 0xF8000001,
-+  PPC_stdx = 0x7C00012A,
-+  PPC_stfd = 0xD8000000,
-+  PPC_stfdu = 0xDC000000,
-+  PPC_stfdx = 0x7C0005AE,
-+  PPC_stfs = 0xD0000000,
-+  PPC_stfsu = 0xD4000000,
-+  PPC_stfsx = 0x7C00052E,
-+  PPC_sth = 0xB0000000,
-+  PPC_sthcx = 0x7C0005AD,
-+  PPC_sthx = 0x7C00032E,
-+  PPC_stw = 0x90000000,
-+  PPC_stwx = 0x7C00012E,
-+  PPC_stwbrx = 0x7C00052C,
-+  PPC_stwcx = 0x7C00012D,
-+  PPC_subf = 0x7C000050,
-+  PPC_subfc = 0x7C000010,
-+  PPC_subfe = 0x7C000110,
-+  PPC_subfic = 0x20000000,
-+  PPC_sync = 0x7C0004AC,
-+  // isync — execution synchronization. Discards prefetched instructions and
-+  // forces a refetch+reexecute of everything past the barrier; prevents
-+  // speculative bypass. Used for Spectre v1 mitigation in speculationBarrier.
-+  // Encoding: bytes `2c 01 00 4c` (LE) = 0x4C00012C.
-+  PPC_isync = 0x4C00012C,
-+  PPC_trap = 0x7FE00008,
-+  PPC_tw = 0x7C000008,
-+  PPC_xor_ = 0x7C000278,
-+  PPC_xori = 0x68000000,
-+  PPC_xoris = 0x6C000000,
-+  // VMX register load/store (X-form, opcode 31, XO=103/231).
-+  // Operate on raw VR0-31 (the lvx/stvx mnemonics predate VSX, so the
-+  // assembler exposes them with a uint8_t VR index rather than via the
-+  // VSR-namespace FloatRegister overloads used for lxvx/stxvx.)
-+  PPC_lvx = 0x7C0000CE,
-+  PPC_lxvd2x = 0x7C000698,
-+  PPC_lxvx = 0x7C000218,
-+  PPC_mfvsrld = 0x7C000266,
-+  PPC_mtvsrdd = 0x7C000366,
-+  PPC_stvx = 0x7C0001CE,
-+  PPC_stxvd2x = 0x7C000798,
-+  PPC_stxvx = 0x7C000318,
-+  PPC_vaddubm = 0x10000000,
-+  PPC_vavgub = 0x10000402,
-+  PPC_vavguh = 0x10000442,
-+  PPC_vcmpequb = 0x10000006,
-+  PPC_vcmpequh = 0x10000046,
-+  PPC_vcmpequw = 0x10000086,
-+  PPC_vcmpequd = 0x100000C7,
-+  PPC_vcmpgtsb = 0x10000306,
-+  PPC_vcmpgtsh = 0x10000346,
-+  PPC_vcmpgtsw = 0x10000386,
-+  PPC_vcmpgtsd = 0x100003C7,
-+  PPC_vcmpgtub = 0x10000206,
-+  PPC_vcmpgtuh = 0x10000246,
-+  PPC_vcmpgtuw = 0x10000286,
-+  PPC_vcmpgtud = 0x100002C7,
-+  PPC_vcmpneb = 0x10000007,  // POWER9 (ISA 3.0)
-+  PPC_vcmpneh = 0x10000047,  // POWER9
-+  PPC_vcmpnew = 0x10000087,  // POWER9
-+  PPC_vadduhm = 0x10000040,
-+  PPC_vadduwm = 0x10000080,
-+  PPC_vaddudm = 0x100000C0,
-+  PPC_vaddubs = 0x10000200,
-+  PPC_vadduhs = 0x10000240,
-+  PPC_vaddsbs = 0x10000300,
-+  PPC_vaddshs = 0x10000340,
-+  PPC_vmaxsb = 0x10000102,
-+  PPC_vmaxsh = 0x10000142,
-+  PPC_vmaxsw = 0x10000182,
-+  PPC_vmaxsd = 0x100001C2,
-+  PPC_vmaxub = 0x10000002,
-+  PPC_vmaxuh = 0x10000042,
-+  PPC_vmaxuw = 0x10000082,
-+  PPC_vmhraddshs = 0x10000021,
-+  PPC_vmrghb = 0x1000000C,
-+  PPC_vmrghh = 0x1000004C,
-+  PPC_vmrghw = 0x1000008C,
-+  PPC_vmrglb = 0x1000010C,
-+  PPC_vmrglh = 0x1000014C,
-+  PPC_vmrglw = 0x1000018C,
-+  PPC_vminsb = 0x10000302,
-+  PPC_vminsh = 0x10000342,
-+  PPC_vminsw = 0x10000382,
-+  PPC_vminub = 0x10000202,
-+  PPC_vminuh = 0x10000242,
-+  PPC_vminuw = 0x10000282,
-+  // POWER9 (ISA 3.0) per-lane integer negate. VRA field carries the subop
-+  // code: 6 for vnegw, 7 for vnegd. Base XO is 0x602.
-+  PPC_vnegw = 0x10060602,
-+  PPC_vnegd = 0x10070602,
-+  PPC_vmladduhm = 0x10000022,
-+  PPC_vmuluwm = 0x10000089,
-+  PPC_vmulld = 0x100001C9,      // POWER10 (XO=457, vector i64x2 multiply low)
-+  PPC_vmulesb = 0x10000308,
-+  PPC_vmuleub = 0x10000208,
-+  PPC_vmulesh = 0x10000348,
-+  PPC_vmuleuh = 0x10000248,
-+  PPC_vmulesw = 0x10000388,
-+  PPC_vmuleuw = 0x10000288,
-+  PPC_vmulosb = 0x10000108,
-+  PPC_vmuloub = 0x10000008,
-+  PPC_vmulosh = 0x10000148,
-+  PPC_vmulouh = 0x10000048,
-+  PPC_vmulosw = 0x10000188,
-+  PPC_vmulouw = 0x10000088,
-+  PPC_vmsumshm = 0x10000028,
-+  PPC_vmsumuhm = 0x10000026,
-+  PPC_vperm = 0x1000002B,
-+  // VX-form, opcode 4, XO=0x54C. Per-byte bit-permute of a 128-bit value;
-+  // result 16-bit bitmap lands in dw0 low 16 bits, recoverable via mfvsrd.
-+  // Available on POWER8+ (ISA 2.07).
-+  PPC_vbpermq = 0x1000054C,
-+  // POWER10 (ISA 3.1) Vector Extract Mask. VX-form, opcode 4, XO=0x642,
-+  // with UIM at bits 11..15 selecting lane width: 8=byte, 9=halfword,
-+  // 10=word, 11=doubleword. RT is a GPR (low N bits = wasm bitmask).
-+  PPC_vextractbm = 0x10080642,
-+  PPC_vextracthm = 0x10090642,
-+  PPC_vextractwm = 0x100A0642,
-+  PPC_vextractdm = 0x100B0642,
-+  // POWER10 vector insert from GPR at immediate byte offset:
-+  //   vinsw VRT, RB, UIM   VRT[UIM*8:UIM*8+31] ← RB[32:63]
-+  //   vinsd VRT, RB, UIM   VRT[UIM*8:UIM*8+63] ← RB[0:63]
-+  // VX-form, opcode 4. RB at bits 16..20, UIM at bits 11..15.
-+  PPC_vinsw = 0x100000CF,  // POWER10 (XO=207)
-+  PPC_vinsd = 0x100001CF,  // POWER10 (XO=463)
-+  // POWER10 vector insert byte/halfword from GPR with register-supplied
-+  // (right-indexed = LE-natural) byte position:
-+  //   vinsbrx VRT, RA, RB   VRT.byte[RA & 0xF]  ← RB & 0xFF
-+  //   vinshrx VRT, RA, RB   VRT.hword[(RA & 0xE)/2] ← RB & 0xFFFF
-+  // VX-form, opcode 4. RA at bits 16..20, RB at bits 11..15.
-+  PPC_vinsbrx = 0x1000030F,  // POWER10 (XO=783)
-+  PPC_vinshrx = 0x1000034F,  // POWER10 (XO=847)
-+  // POWER9 (ISA 3.0) vector insert byte/halfword from VR at immediate
-+  // byte position:
-+  //   vinsertb VRT, VRB, UIM  VRT.byte[UIM]      ← VRB.byte[7]    (BE)
-+  //   vinserth VRT, VRB, UIM  VRT.hword[UIM..+1] ← VRB.byte[6..7] (BE)
-+  // V-form, opcode 4. VRB at bits 11..15, UIM at bits 16..20. Simd128
-+  // lives in VSR32-63 (= VR0-31), so the V-form VRT field addresses our
-+  // Simd128 storage via `encoding() & 31`.
-+  PPC_vinsertb = 0x1000030D,  // POWER9 (XO=781)
-+  PPC_vinserth = 0x1000034D,  // POWER9 (XO=845)
-+  PPC_vextractub = 0x1000020D,  // POWER9 (XO=525)
-+  PPC_vextractuh = 0x1000024D,  // POWER9 (XO=589)
-+  PPC_vspltisb = 0x1000030C,    // POWER7+ (XO=780, splat 5-bit SIMM to all 16 byte lanes)
-+  PPC_vspltish = 0x1000034C,    // POWER7+ (XO=844, splat 5-bit SIMM to all 8 i16 lanes)
-+  PPC_vspltisw = 0x1000038C,    // POWER7+ (XO=908, splat 5-bit SIMM to all 4 i32 lanes)
-+  PPC_vpopcntb = 0x10000703,
-+  PPC_vslb = 0x10000104,
-+  PPC_vsld = 0x100005C4,
-+  PPC_vsldoi = 0x1000002C,
-+  PPC_vslh = 0x10000144,
-+  PPC_vslo = 0x1000040C,
-+  PPC_vslw = 0x10000184,
-+  PPC_vspltb = 0x1000020C,
-+  PPC_vsplth = 0x1000024C,
-+  PPC_vsrab = 0x10000304,
-+  PPC_vsrad = 0x100003C4,
-+  PPC_vsrah = 0x10000344,
-+  PPC_vsraw = 0x10000384,
-+  PPC_vsrb = 0x10000204,
-+  PPC_vsrd = 0x100006C4,
-+  PPC_vsrh = 0x10000244,
-+  PPC_vsro = 0x1000044C,
-+  PPC_vsrw = 0x10000284,
-+  PPC_vpkshss = 0x1000018E,
-+  PPC_vpkshus = 0x1000010E,
-+  PPC_vpkswss = 0x100001CE,
-+  PPC_vpkswus = 0x1000014E,
-+  PPC_vupkhsb = 0x1000020E,
-+  PPC_vupkhsh = 0x1000024E,
-+  PPC_vupkhsw = 0x1000064E,
-+  PPC_vupklsb = 0x1000028E,
-+  PPC_vupklsh = 0x100002CE,
-+  PPC_vupklsw = 0x100006CE,
-+  PPC_vsububm = 0x10000400,
-+  PPC_vsubuhm = 0x10000440,
-+  PPC_vsubuwm = 0x10000480,
-+  PPC_vsubudm = 0x100004C0,
-+  PPC_vsububs = 0x10000600,
-+  PPC_vsubuhs = 0x10000640,
-+  PPC_vsubsbs = 0x10000700,
-+  PPC_vsubshs = 0x10000740,
-+  PPC_xscvdpspn = 0xF000042C,
-+  PPC_xscvspdpn = 0xF000052C,
-+  // POWER9 (ISA 3.0) scalar FP16 conversions, XX2-form. The UIM
-+  // disambiguator is baked into the constant (xscvdphp=17, xscvhpdp=16).
-+  // Encodings cross-checked against binutils with `.machine power9`.
-+  PPC_xscvdphp = 0xF011056C,
-+  PPC_xscvhpdp = 0xF010056C,
-+  // POWER9 (ISA 3.0) scalar VSX extract biased exponent, XX2-form.
-+  // XT.dword[0] = (zero || biased_exp_11bit), XT.dword[1] = 0. XO=347
-+  // (shares XO with xscv{dp,hp}{hp,dp} — disambiguated by bits 16-20=0).
-+  // Encoding cross-checked against binutils with `.machine power9`.
-+  PPC_xsxexpdp = 0xF000056C,
-+  // POWER9 (ISA 3.0) scalar FP16 load/store, X-form (opcode 31).
-+  // lxsihzx zero-extends; stxsihx writes 16 bits from VSR dword 0
-+  // word 1's low halfword.
-+  PPC_lxsihzx = 0x7C00065A,
-+  PPC_stxsihx = 0x7C00075A,
-+  // POWER9 scalar VSX max/min with Java/JavaScript semantics — handles
-+  // ±0 and NaN identically to Math.max/Math.min in ECMA-262 (covers
-+  // 19 corner cases against the JS shell).
-+  // XX3-form, primary opcode 60, XO=144 (max) / XO=152 (min).
-+  PPC_xsmaxjdp = 0xF0000480,
-+  PPC_xsminjdp = 0xF00004C0,
-+  PPC_xxbrd = 0xF017076C,
-+  PPC_xvabsdp = 0xF0000764,
-+  PPC_xvabssp = 0xF0000664,
-+  PPC_xvadddp = 0xF0000300,
-+  PPC_xvaddsp = 0xF0000200,
-+  PPC_xvcmpeqdp = 0xF0000318,
-+  PPC_xvcmpeqsp = 0xF0000218,
-+  PPC_xvcmpgedp = 0xF0000398,
-+  PPC_xvcmpgesp = 0xF0000298,
-+  PPC_xvcmpgtdp = 0xF0000358,
-+  PPC_xvcmpgtsp = 0xF0000258,
-+  PPC_xvcvdpsp = 0xF0000624,
-+  PPC_xvcvdpsxws = 0xF0000360,
-+  PPC_xvcvdpuxws = 0xF0000320,
-+  PPC_xvcvspdp = 0xF0000724,
-+  PPC_xvcvspsxws = 0xF0000260,
-+  PPC_xvcvspuxws = 0xF0000220,
-+  PPC_xvcvsxwdp = 0xF00003E0,
-+  PPC_xvcvsxwsp = 0xF00002E0,
-+  PPC_xvcvuxwdp = 0xF00003A0,
-+  PPC_xvcvuxwsp = 0xF00002A0,
-+  PPC_xvdivdp = 0xF00003C0,
-+  PPC_xvdivsp = 0xF00002C0,
-+  PPC_xvmaddadp = 0xF0000308,
-+  PPC_xvmaddasp = 0xF0000208,
-+  PPC_xvmaxdp = 0xF0000700,
-+  PPC_xvmaxsp = 0xF0000600,
-+  PPC_xvmindp = 0xF0000740,
-+  PPC_xvminsp = 0xF0000640,
-+  PPC_xvmuldp = 0xF0000380,
-+  PPC_xvmulsp = 0xF0000280,
-+  PPC_xvnegdp = 0xF00007E4,
-+  PPC_xvnmsubadp = 0xF0000788,
-+  PPC_xvnmsubasp = 0xF0000688,
-+  PPC_xvnegsp = 0xF00006E4,
-+  PPC_xvrdpic = 0xF00003AC,
-+  PPC_xvrdpim = 0xF00003E4,
-+  PPC_xvrdpip = 0xF00003A4,
-+  PPC_xvrdpiz = 0xF0000364,
-+  PPC_xvrspic = 0xF00002AC,
-+  PPC_xvrspim = 0xF00002E4,
-+  PPC_xvrspip = 0xF00002A4,
-+  PPC_xvrspiz = 0xF0000264,
-+  PPC_xvsqrtdp = 0xF000032C,
-+  PPC_xvsqrtsp = 0xF000022C,
-+  PPC_xvsubdp = 0xF0000340,
-+  PPC_xvsubsp = 0xF0000240,
-+  PPC_xxextractuw = 0xF0000294,
-+  PPC_xxinsertw = 0xF00002D4,
-+  PPC_xxland = 0xF0000410,
-+  PPC_xxlandc = 0xF0000450,
-+  PPC_xxlnor = 0xF0000510,
-+  PPC_xxlor = 0xF0000490,
-+  PPC_xxlxor = 0xF00004D0,
-+  PPC_xxpermdi = 0xF0000050,
-+  PPC_xxsel = 0xF0000030,
-+  PPC_xxspltib = 0xF00002D0,  // POWER9 (ISA 3.0): XX1-form, no Rc
-+  PPC_xxspltw = 0xF0000290,
-+
-+  // Simplified mnemonics.
-+  PPC_mr = PPC_or_,
-+  PPC_not = PPC_nor,
-+  PPC_nop = PPC_ori,
-+  PPC_lwsync = PPC_sync | (1 << 21),
-+
-+  PPC_MAJOR_OPCODE_MASK = 0xFC000000
-+};
-+
-+static const uint32_t NopInst = (uint32_t)PPC_nop;
-+static const uint32_t PPC_STANZA_LENGTH = 16;
-+
-+class Instruction;
-+class InstReg;
-+class InstImm;
-+class BOffImm16;
-+class JOffImm26;
-+
-+// PPC64 base instruction type: a single 32-bit word.
-+class Instruction {
-+ protected:
-+  uint32_t data;
-+
-+ public:
-+  explicit Instruction(uint32_t data_) : data(data_) {}
-+  explicit Instruction(PPCOpcodes op) : data((uint32_t)op) {}
-+
-+  uint32_t encode() const { return data; }
-+
-+  void makeNop() { data = NopInst; }
-+  void makeOp_mtctr(Register r) {
-+    data = PPC_mtspr | ((uint32_t)r.code()) << 21 | PPC_SPR(9);
-+  }
-+  void makeOp_bctr(uint32_t linkBit = 0) { data = PPC_bctr | linkBit; }
-+
-+  void setData(uint32_t data) { this->data = data; }
-+
-+  const Instruction& operator=(const Instruction& src) {
-+    data = src.data;
-+    return *this;
-+  }
-+
-+  uint32_t extractBit(uint32_t bit) const { return (encode() >> bit) & 1; }
-+  uint32_t extractBitField(uint32_t hi, uint32_t lo) const {
-+    return (encode() >> lo) & ((2 << (hi - lo)) - 1);
-+  }
-+
-+  uint32_t extractOpcode() const { return data & PPC_MAJOR_OPCODE_MASK; }
-+  bool isOpcode(uint32_t op) const {
-+    return extractOpcode() == (op & PPC_MAJOR_OPCODE_MASK);
-+  }
-+
-+  uint32_t extractRT() const {
-+    return extractBitField(RTShift + RTBits - 1, RTShift);
-+  }
-+  uint32_t extractRA() const {
-+    return extractBitField(RAShift + RABits - 1, RAShift);
-+  }
-+  uint32_t extractRB() const {
-+    return extractBitField(RBShift + RBBits - 1, RBShift);
-+  }
-+  uint32_t extractImm16() const { return data & Imm16Mask; }
-+
-+  Instruction* next() { return this + 1; }
-+
-+  const uint32_t* raw() const { return &data; }
-+  uint32_t size() const { return 4; }
-+};
-+
-+static_assert(sizeof(Instruction) == 4);
-+
-+class InstNOP : public Instruction {
-+ public:
-+  InstNOP() : Instruction(NopInst) {}
-+};
-+
-+// Register-register-register instruction (X-form and XO-form).
-+class InstReg : public Instruction {
-+ public:
-+  explicit InstReg(PPCOpcodes op) : Instruction(op) {}
-+  InstReg(PPCOpcodes op, Register rt, Register ra, Register rb)
-+      : Instruction((uint32_t)op | RT(rt) | RA(ra) | RB(rb)) {}
-+  InstReg(PPCOpcodes op, FloatRegister frt, FloatRegister fra,
-+          FloatRegister frb)
-+      : Instruction((uint32_t)op | RT(frt) | RA(fra) | RB(frb)) {}
-+
-+  void setRT(Register r) { data = (data & ~RTMask) | RT(r); }
-+  void setRA(Register r) { data = (data & ~RAMask) | RA(r); }
-+  void setRB(Register r) { data = (data & ~RBMask) | RB(r); }
-+
-+  void setImm16(uint32_t imm) {
-+    data = (data & 0xFFFF0000) | (imm & Imm16Mask);
-+  }
-+  uint32_t extractImm16Value() const { return data & Imm16Mask; }
-+};
-+
-+// Register-immediate instruction (D-form).
-+// Bits 21-25 hold RT (loads, addi) or RS (stores, ori). Both encode identically
-+// since RT and RS occupy the same field; the caller simply passes the right
-+// register.
-+class InstImm : public Instruction {
-+ public:
-+  explicit InstImm(PPCOpcodes op) : Instruction(op) {}
-+  InstImm(PPCOpcodes op, Register rt, Register ra, uint32_t imm16)
-+      : Instruction((uint32_t)op | RT(rt) | RA(ra) | (imm16 & Imm16Mask)) {}
-+
-+  void setRT(Register r) { data = (data & ~RTMask) | RT(r); }
-+  void setRA(Register r) { data = (data & ~RAMask) | RA(r); }
-+
-+  void setImm16(uint32_t imm) {
-+    data = (data & 0xFFFF0000) | (imm & Imm16Mask);
-+  }
-+  void setLowerReg(Register rl) {
-+    data = (data & 0xFFE0FFFF) | ((uint32_t)rl.code() << 16);
-+  }
-+  uint32_t extractImm16Value() const { return data & Imm16Mask; }
-+
-+  // Extract the TrapTag from a tagged trap instruction (tw).
-+  // Defined in Assembler-ppc64.cpp. Returns a TrapTag value as uint8_t
-+  // because Assembler::TrapTag is not yet defined at this point in the header.
-+  uint8_t traptag();
-+};
-+
-+// A BOffImm16 is a 16-bit signed branch offset for conditional branches
-+// (bc-form instructions).  The offset is stored in bits 2..15 and is
-+// 4-byte aligned, giving a range of +/-32 KB.
-+class BOffImm16 {
-+  int32_t data;
-+
-+ public:
-+  uint32_t encode() const {
-+    MOZ_ASSERT(!isInvalid());
-+    return static_cast<uint32_t>(data) & 0xFFFC;
-+  }
-+  int32_t decode() const {
-+    MOZ_ASSERT(!isInvalid());
-+    return data;
-+  }
-+
-+  explicit BOffImm16(int offset) : data(offset) {
-+    MOZ_ASSERT((offset & 0x3) == 0);
-+    MOZ_ASSERT(IsInRange(offset));
-+  }
-+  static bool IsInRange(int offset) {
-+    return offset >= -32768 && offset <= 32764;
-+  }
-+
-+  static const int32_t INVALID = 0x00020000;
-+  BOffImm16() : data(INVALID) {}
-+
-+  bool isInvalid() const { return data == INVALID; }
-+
-+  Instruction* getDest(Instruction* src) const;
-+
-+  explicit BOffImm16(InstImm inst);
-+};
-+
-+// A JOffImm26 is a 26-bit signed branch offset for unconditional branches
-+// (b/bl instructions).  Bits 2..25 encode the offset, 4-byte aligned,
-+// giving a range of +/-32 MB.
-+class JOffImm26 {
-+  int32_t data;
-+
-+ public:
-+  uint32_t encode() const {
-+    MOZ_ASSERT(!isInvalid());
-+    return static_cast<uint32_t>(data) & 0x03FFFFFC;
-+  }
-+  int32_t decode() const {
-+    MOZ_ASSERT(!isInvalid());
-+    return data;
-+  }
-+
-+  explicit JOffImm26(int offset) : data(offset) {
-+    MOZ_ASSERT((offset & 0x3) == 0);
-+    MOZ_ASSERT(IsInRange(offset));
-+  }
-+  static bool IsInRange(int offset) {
-+    return offset >= -33554432 && offset <= 33554428;
-+  }
-+
-+  static const int32_t INVALID = 0x20000000;
-+  JOffImm26() : data(INVALID) {}
-+
-+  bool isInvalid() const { return data == INVALID; }
-+
-+  Instruction* getDest(Instruction* src) const;
-+};
-+
-+// A 16-bit immediate value used in D-form instructions.
-+class Imm16 {
-+  int32_t value;
-+
-+ public:
-+  Imm16();
-+  explicit Imm16(uint32_t imm) : value(imm) {}
-+  uint32_t encode() const { return static_cast<uint32_t>(value) & 0xffff; }
-+  int32_t decodeSigned() const { return value; }
-+  uint32_t decodeUnsigned() const { return value; }
-+  static bool IsInSignedRange(int32_t imm) {
-+    return imm >= INT16_MIN && imm <= INT16_MAX;
-+  }
-+  static bool IsInUnsignedRange(uint32_t imm) { return imm <= UINT16_MAX; }
-+  static Imm16 Lower(Imm32 imm) { return Imm16(imm.value & 0xffff); }
-+  static Imm16 Upper(Imm32 imm) { return Imm16((imm.value >> 16) & 0xffff); }
-+};
-+
-+class Imm8 {
-+  uint8_t value;
-+
-+ public:
-+  Imm8();
-+  explicit Imm8(uint32_t imm) : value(imm) {}
-+  uint32_t encode(uint32_t shift) const { return value << shift; }
-+  int32_t decodeSigned() const { return value; }
-+  uint32_t decodeUnsigned() const { return value; }
-+  static bool IsInSignedRange(int32_t imm) {
-+    return imm >= INT8_MIN && imm <= INT8_MAX;
-+  }
-+  static bool IsInUnsignedRange(uint32_t imm) { return imm <= UINT8_MAX; }
-+  static Imm8 Lower(Imm16 imm) { return Imm8(imm.decodeSigned() & 0xff); }
-+  static Imm8 Upper(Imm16 imm) {
-+    return Imm8((imm.decodeSigned() >> 8) & 0xff);
-+  }
-+};
-+
-+class Operand {
-+ public:
-+  enum Tag { REG, FREG, MEM };
-+
-+ private:
-+  Tag tag : 3;
-+  uint32_t reg : 5;
-+  int32_t offset;
-+
-+ public:
-+  MOZ_IMPLICIT Operand(Register reg_) : tag(REG), reg(reg_.code()) {}
-+
-+  explicit Operand(FloatRegister freg) : tag(FREG), reg(freg.code()) {}
-+
-+  Operand(Register base, Imm32 off)
-+      : tag(MEM), reg(base.code()), offset(off.value) {}
-+
-+  Operand(Register base, int32_t off)
-+      : tag(MEM), reg(base.code()), offset(off) {}
-+
-+  explicit Operand(const Address& addr)
-+      : tag(MEM), reg(addr.base.code()), offset(addr.offset) {}
-+
-+  Tag getTag() const { return tag; }
-+
-+  Register toReg() const {
-+    MOZ_ASSERT(tag == REG);
-+    return Register::FromCode(reg);
-+  }
-+
-+  FloatRegister toFReg() const {
-+    MOZ_ASSERT(tag == FREG);
-+    return FloatRegister::FromCode(reg);
-+  }
-+
-+  void toAddr(Register* r, Imm32* dest) const {
-+    MOZ_ASSERT(tag == MEM);
-+    *r = Register::FromCode(reg);
-+    *dest = Imm32(offset);
-+  }
-+  Address toAddress() const {
-+    MOZ_ASSERT(tag == MEM);
-+    return Address(Register::FromCode(reg), offset);
-+  }
-+  int32_t disp() const {
-+    MOZ_ASSERT(tag == MEM);
-+    return offset;
-+  }
-+
-+  int32_t base() const {
-+    MOZ_ASSERT(tag == MEM);
-+    return reg;
-+  }
-+  Register baseReg() const {
-+    MOZ_ASSERT(tag == MEM);
-+    return Register::FromCode(reg);
-+  }
-+};
-+
-+// Bug 2034064 collapsed the per-buffer compile-time configuration of
-+// AssemblerBufferWithConstantPools into AssemblerBufferSettings, and reduced
-+// the runtime ctor to (poolMaxOffset, nopFill). instBufferAlign and the
-+// NumShortBranchRanges template arg were dropped: PPC64 previously passed
-+// instBufferAlign=8 (unused on this backend; pool entries are 4-byte aligned)
-+// and NumShortBranchRanges=0.
-+using PPCBuffer = js::jit::AssemblerBufferWithConstantPools<
-+    Instruction, Assembler,
-+    js::jit::AssemblerBufferSettings{
-+        .instSize = 4,
-+        .guardSize = 1,
-+        .headerSize = 1,
-+        .pcBias = 0,
-+        .alignFillInst = NopInst,
-+        .nopFillInst = NopInst,
-+    }>;
-+
-+// Inherits executableCopy() and appendRawCode() from
-+// AssemblerBufferWithConstantPools, which assert pool is flushed.
-+class PPCBufferWithExecutableCopy : public PPCBuffer {
-+ public:
-+  PPCBufferWithExecutableCopy(size_t poolMaxOffset, unsigned nopFill)
-+      : PPCBuffer(poolMaxOffset, nopFill) {}
-+};
-+
-+class Assembler : public AssemblerShared {
-+ public:
-+  // Trap tags encoded in the low bits of a trap word.
-+  // FreeBSD and others may use r1 in their trap word, so bit 0 is avoided.
-+  enum TrapTag {
-+    BTag = 2,
-+    BCTag = 4,
-+    CallTag = 6,
-+    DebugTag0 = 10,
-+    DebugTag1 = 12,
-+    DebugTag2 = 14
-+  };
-+
-+  // Pool load types encoded in bits 21-22 of pool hint words.
-+  // Used by InsertIndexIntoTag / PatchConstantPoolLoad.
-+  enum PoolLoadType {
-+    PoolLoadFPR64 = 1,    // lfd fD, offset(rBase)
-+    PoolLoadSimd128 = 2,  // addi rBase, rBase, offset; lxvx vsD, 0, rBase
-+    PoolLoadFPR32 = 3     // lfs fD, offset(rBase) — auto-expands to double
-+  };
-+
-+  enum BranchBits {
-+    BranchOnClear = 0x04,
-+    BranchOnSet = 0x0c,
-+    BranchOptionMask = 0x0f,
-+    BranchOptionInvert = 0x08
-+  };
-+
-+  // PPC condition encoding. The top nybble is the offset to the CR field
-+  // (the x in BIF*4+x), and the bottom is the BO field.
-+  // Synthetic flags sit in the MSB and are masked off before use.
-+  enum Condition {
-+    ConditionUnsigned = 0x100,
-+    ConditionUnsignedHandled = 0x2ff,
-+    ConditionZero = 0x400,
-+    ConditionOnlyXER = 0x200,
-+    ConditionXERCA = 0x23c,
-+    ConditionXERNCA = 0x234,
-+    ConditionXEROV = 0x21c,
-+
-+    Equal = 0x2c,
-+    NotEqual = 0x24,
-+    GreaterThan = 0x1c,
-+    GreaterThanOrEqual = 0x04,
-+    LessThan = 0x0c,
-+    LessThanOrEqual = 0x14,
-+
-+    Above = GreaterThan | ConditionUnsigned,
-+    AboveOrEqual = GreaterThanOrEqual | ConditionUnsigned,
-+    Below = LessThan | ConditionUnsigned,
-+    BelowOrEqual = LessThanOrEqual | ConditionUnsigned,
-+
-+    Signed = LessThan | ConditionZero,
-+    NotSigned = GreaterThanOrEqual | ConditionZero,
-+    Zero = Equal | ConditionZero,
-+    NonZero = NotEqual | ConditionZero,
-+
-+    Overflow = ConditionXEROV,
-+    NotOverflow = ConditionOnlyXER | LessThanOrEqual,
-+    CarrySet = ConditionXERCA,
-+    CarryClear = ConditionXERNCA,
-+
-+    Always = 0x1f,
-+    SOBit = 0x3c,
-+    NSOBit = 0x34
-+  };
-+
-+  enum DoubleCondition {
-+    DoubleConditionUnordered = 0x100,
-+    DoubleOrdered = 0x34,
-+    DoubleEqual = 0x2c,
-+    DoubleNotEqual = 0x24,
-+    DoubleGreaterThan = 0x1c,
-+    DoubleGreaterThanOrEqual = 0x04,
-+    DoubleLessThan = 0x0c,
-+    DoubleLessThanOrEqual = 0x14,
-+    DoubleUnordered = 0x3c,
-+    DoubleEqualOrUnordered = DoubleEqual | DoubleConditionUnordered,
-+    DoubleNotEqualOrUnordered = DoubleNotEqual | DoubleConditionUnordered,
-+    DoubleGreaterThanOrUnordered = DoubleGreaterThan | DoubleConditionUnordered,
-+    DoubleGreaterThanOrEqualOrUnordered =
-+        DoubleGreaterThanOrEqual | DoubleConditionUnordered,
-+    DoubleLessThanOrUnordered = DoubleLessThan | DoubleConditionUnordered,
-+    DoubleLessThanOrEqualOrUnordered =
-+        DoubleLessThanOrEqual | DoubleConditionUnordered,
-+  };
-+
-+  enum JumpOrCall { BranchIsJump, BranchIsCall };
-+
-+  enum LinkBit {
-+    DontLinkB = 0,
-+    LinkB = 1,
-+  };
-+
-+  enum LikelyBit {
-+    NotLikelyB = 0,
-+    LikelyB = 1,
-+  };
-+
-+  enum BranchAddressType {
-+    RelativeBranch = 0,
-+    AbsoluteBranch = 2,
-+  };
-+
-+  enum FloatFormat { SingleFloat, DoubleFloat };
-+  enum FloatTestKind { TestForTrue, TestForFalse };
-+
-+  BufferOffset nextOffset() { return m_buffer.nextOffset(); }
-+
-+ protected:
-+  Instruction* editSrc(BufferOffset bo) {
-+    if (!bo.assigned()) {
-+      // Under OOM, writeInst may return an unassigned BufferOffset.
-+      // Return a dummy writable area so callers (WriteLoad64Instructions)
-+      // can proceed harmlessly; the compilation will be discarded.
-+      static uint32_t oomDummy_[8];
-+      return (Instruction*)oomDummy_;
-+    }
-+    return m_buffer.getInst(bo);
-+  }
-+
-+  struct RelativePatch {
-+    BufferOffset offset;
-+    void* target;
-+    RelocationKind kind;
-+
-+    RelativePatch(BufferOffset offset, void* target, RelocationKind kind)
-+        : offset(offset), target(target), kind(kind) {}
-+  };
-+
-+  js::Vector<RelativePatch, 8, SystemAllocPolicy> jumps_;
-+
-+  CompactBufferWriter jumpRelocations_;
-+  CompactBufferWriter dataRelocations_;
-+
-+  PPCBufferWithExecutableCopy m_buffer;
-+
-+#ifdef JS_JITSPEW
-+  Sprinter* printer;
-+#endif
-+
-+ public:
-+  // Which absolute bit number does a CR + Condition pair refer to?
-+  static uint8_t crBit(CRegisterID cr, Condition cond) {
-+    return (cr << 2) + ((cond & 0xf0) >> 4);
-+  }
-+  static uint8_t crBit(CRegisterID cr, DoubleCondition cond) {
-+    return (cr << 2) + ((cond & 0xf0) >> 4);
-+  }
-+
-+  Assembler()
-+      : m_buffer(/* poolMaxOffset */ 8192, /* nopFill */ 0),
-+#ifdef JS_JITSPEW
-+        printer(nullptr),
-+#endif
-+        isFinished(false),
-+        scratch_register_list_((1 << Registers::r11) | (1 << Registers::r12)) {
-+  }
-+
-+  void setUnlimitedBuffer() { m_buffer.setUnlimited(); }
-+
-+  // Constant pool callbacks required by AssemblerBufferWithConstantPools.
-+  static void InsertIndexIntoTag(uint8_t* load, uint32_t index);
-+  static bool PatchConstantPoolLoad(void* loadAddr, void* constPoolAddr);
-+  static void WritePoolGuard(BufferOffset branch, Instruction* inst,
-+                             BufferOffset dest);
-+  static void WritePoolHeader(uint8_t* start, js::jit::Pool* p, bool isNatural);
-+  static void PatchShortRangeBranchToVeneer(PPCBuffer*, unsigned rangeIdx,
-+                                            BufferOffset deadline,
-+                                            BufferOffset veneer);
-+
-+  static Condition InvertCondition(Condition cond);
-+  static DoubleCondition InvertCondition(DoubleCondition cond);
-+
-+  void writeRelocation(BufferOffset src) {
-+    jumpRelocations_.writeUnsigned(src.getOffset());
-+  }
-+
-+  void writeDataRelocation(ImmGCPtr ptr) {
-+    if (ptr.value) {
-+      if (gc::IsInsideNursery(ptr.value)) {
-+        embedsNurseryPointers_ = true;
-+      }
-+      dataRelocations_.writeUnsigned(nextOffset().getOffset());
-+    }
-+  }
-+  void writeDataRelocation(BufferOffset bo, ImmGCPtr ptr) {
-+    if (ptr.value) {
-+      if (gc::IsInsideNursery(ptr.value)) {
-+        embedsNurseryPointers_ = true;
-+      }
-+      dataRelocations_.writeUnsigned(bo.getOffset());
-+    }
-+  }
-+
-+  void assertNoGCThings() const {
-+#ifdef DEBUG
-+    MOZ_ASSERT(dataRelocations_.length() == 0);
-+    for (auto& j : jumps_) {
-+      MOZ_ASSERT(j.kind == RelocationKind::HARDCODED);
-+    }
-+#endif
-+  }
-+
-+  bool oom() const;
-+
-+  void setPrinter(Sprinter* sp) {
-+#ifdef JS_JITSPEW
-+    printer = sp;
-+#endif
-+  }
-+
-+#ifdef JS_JITSPEW
-+  inline void spew(const char* fmt, ...) MOZ_FORMAT_PRINTF(2, 3) {
-+    if (MOZ_UNLIKELY(printer || JitSpewEnabled(JitSpew_Codegen))) {
-+      va_list va;
-+      va_start(va, fmt);
-+      spewVA(fmt, va);
-+      va_end(va);
-+    }
-+  }
-+  MOZ_COLD void spewVA(const char* fmt, va_list va) MOZ_FORMAT_PRINTF(2, 0) {
-+    char buf[200];
-+    int i = VsprintfLiteral(buf, fmt, va);
-+    if (i > -1) {
-+      if (printer) {
-+        printer->printf("%s\n", buf);
-+      }
-+      js::jit::JitSpew(js::jit::JitSpew_Codegen, "%s", buf);
-+    }
-+  }
-+#else
-+  MOZ_ALWAYS_INLINE void spew(const char* fmt, ...) MOZ_FORMAT_PRINTF(2, 3) {}
-+#endif
-+
-+  Register getStackPointer() const { return StackPointer; }
-+
-+ protected:
-+  bool isFinished;
-+
-+ public:
-+  static uintptr_t GetPointer(uint8_t*);
-+  void flush() {
-+    MOZ_ASSERT(!isFinished);
-+    m_buffer.flushPool();
-+  }
-+  // Inhibit pool flushes for the next maxInst instructions. Mirrors the
-+  // ARM/ARM64 wrappers; lets shared code (e.g. WasmFrameIter epilogues
-+  // that need static byte distances between currentOffset() captures)
-+  // fence a small instruction window without reaching into m_buffer.
-+  void enterNoPool(size_t maxInst) { m_buffer.enterNoPool(maxInst); }
-+  void leaveNoPool() { m_buffer.leaveNoPool(); }
-+  void finish();
-+  bool appendRawCode(const uint8_t* code, size_t numBytes);
-+  bool reserve(size_t size);
-+  bool swapBuffer(wasm::Bytes& bytes);
-+  void executableCopy(void* buffer);
-+  void copyJumpRelocationTable(uint8_t* dest);
-+  void copyDataRelocationTable(uint8_t* dest);
-+
-+  size_t size() const;
-+  size_t jumpRelocationTableBytes() const;
-+  size_t dataRelocationTableBytes() const;
-+  size_t bytesNeeded() const;
-+
-+  BufferOffset writeInst(uint32_t x, uint32_t* dest = nullptr);
-+  static void WriteInstStatic(uint32_t x, uint32_t* dest);
-+
-+ public:
-+  BufferOffset haltingAlign(int alignment);
-+  BufferOffset nopAlign(int alignment);
-+  BufferOffset as_nop();
-+
-+  // --- Instruction emission (declarations only, implemented in later commits)
-+
-+  // Branch instructions.
-+  uint16_t computeConditionCode(Condition op, CRegisterID cr = cr0);
-+  uint16_t computeConditionCode(DoubleCondition cond, CRegisterID cr = cr0);
-+  BufferOffset as_b(JOffImm26 off, BranchAddressType bat = RelativeBranch,
-+                    LinkBit lb = DontLinkB);
-+  BufferOffset as_b(int32_t off, BranchAddressType bat = RelativeBranch,
-+                    LinkBit lb = DontLinkB);
-+  BufferOffset as_blr(LinkBit lb = DontLinkB);
-+  BufferOffset as_bctr(LinkBit lb = DontLinkB);
-+  BufferOffset as_bc(BOffImm16 off, Condition cond, CRegisterID cr = cr0,
-+                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
-+  BufferOffset as_bc(int16_t off, Condition cond, CRegisterID cr = cr0,
-+                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
-+  BufferOffset as_bc(BOffImm16 off, DoubleCondition cond, CRegisterID cr = cr0,
-+                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
-+  BufferOffset as_bc(int16_t off, DoubleCondition cond, CRegisterID cr = cr0,
-+                     LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
-+  BufferOffset as_bcctr(Condition cond, CRegisterID cr = cr0,
-+                        LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
-+  BufferOffset as_bcctr(DoubleCondition cond, CRegisterID cr = cr0,
-+                        LikelyBit lkb = NotLikelyB, LinkBit lb = DontLinkB);
-+  BufferOffset as_bc(int16_t off, uint16_t op, LikelyBit lkb = NotLikelyB,
-+                     LinkBit lb = DontLinkB);
-+  BufferOffset as_bcctr(uint16_t op, LikelyBit lkb = NotLikelyB,
-+                        LinkBit lb = DontLinkB);
-+
-+  // SPR operations.
-+  BufferOffset as_mtspr(SPRegisterID spr, Register ra);
-+  BufferOffset as_mfspr(Register rd, SPRegisterID spr);
-+
-+  // CR operations.
-+  BufferOffset as_crand(uint8_t t, uint8_t a, uint8_t b);
-+  BufferOffset as_crandc(uint8_t t, uint8_t a, uint8_t b);
-+  BufferOffset as_cror(uint8_t t, uint8_t a, uint8_t b);
-+  BufferOffset as_crorc(uint8_t t, uint8_t a, uint8_t b);
-+  BufferOffset as_crxor(uint8_t t, uint8_t a, uint8_t b);
-+  BufferOffset as_mtcrf(uint32_t mask, Register rs);
-+  BufferOffset as_mfocrf(Register rd, CRegisterID crfs);
-+  BufferOffset as_mcrxrx(CRegisterID crt);
-+
-+  // Compare instructions.
-+  BufferOffset as_cmpd(CRegisterID cr, Register ra, Register rb);
-+  BufferOffset as_cmpdi(CRegisterID cr, Register ra, int16_t im);
-+  BufferOffset as_cmpld(CRegisterID cr, Register ra, Register rb);
-+  BufferOffset as_cmpldi(CRegisterID cr, Register ra, int16_t im);
-+  BufferOffset as_cmpw(CRegisterID cr, Register ra, Register rb);
-+  BufferOffset as_cmpwi(CRegisterID cr, Register ra, int16_t im);
-+  BufferOffset as_cmplw(CRegisterID cr, Register ra, Register rb);
-+  BufferOffset as_cmplwi(CRegisterID cr, Register ra, int16_t im);
-+  BufferOffset as_cmpd(Register ra, Register rb);
-+  BufferOffset as_cmpdi(Register ra, int16_t im);
-+  BufferOffset as_cmpld(Register ra, Register rb);
-+  BufferOffset as_cmpldi(Register ra, int16_t im);
-+  BufferOffset as_cmpw(Register ra, Register rb);
-+  BufferOffset as_cmpwi(Register ra, int16_t im);
-+  BufferOffset as_cmplw(Register ra, Register rb);
-+  BufferOffset as_cmplwi(Register ra, int16_t im);
-+
-+  // ALU (three-register).
-+  BufferOffset as_add(Register rd, Register ra, Register rb);
-+  BufferOffset as_addc(Register rd, Register ra, Register rb);
-+  BufferOffset as_adde(Register rd, Register ra, Register rb);
-+  BufferOffset as_subf(Register rd, Register ra, Register rb);
-+  BufferOffset as_subfc(Register rd, Register ra, Register rb);
-+  BufferOffset as_subfe(Register rd, Register ra, Register rb);
-+  BufferOffset as_neg(Register rd, Register rs);
-+
-+  BufferOffset as_mulld(Register rd, Register ra, Register rb);
-+  BufferOffset as_mulhd(Register rd, Register ra, Register rb);
-+  BufferOffset as_mulhdu(Register rd, Register ra, Register rb);
-+  BufferOffset as_mulldo(Register rd, Register ra, Register rb);
-+  BufferOffset as_mullw(Register rd, Register ra, Register rb);
-+  BufferOffset as_mulhwu(Register rd, Register ra, Register rb);
-+
-+  BufferOffset as_divd(Register rd, Register ra, Register rb);
-+  BufferOffset as_divdu(Register rd, Register ra, Register rb);
-+  BufferOffset as_divw(Register rd, Register ra, Register rb);
-+  BufferOffset as_divwu(Register rd, Register ra, Register rb);
-+  // POWER9 modulo.
-+  BufferOffset as_modsd(Register rd, Register ra, Register rb);
-+  BufferOffset as_modsw(Register rd, Register ra, Register rb);
-+  BufferOffset as_modud(Register rd, Register ra, Register rb);
-+  BufferOffset as_moduw(Register rd, Register ra, Register rb);
-+
-+  // ALU immediate.
-+  BufferOffset as_addi(Register rd, Register ra, int16_t im,
-+                       bool actually_li = false);
-+  BufferOffset as_addis(Register rd, Register ra, int16_t im,
-+                        bool actually_lis = false);
-+  BufferOffset as_mulli(Register rd, Register ra, int16_t im);
-+  BufferOffset as_subfic(Register rd, Register ra, int16_t im);
-+
-+  // ALU unary/extended.
-+  BufferOffset as_cntlzw(Register rd, Register ra);
-+  BufferOffset as_cntlzd(Register rd, Register ra);
-+  BufferOffset as_cnttzd(Register rd, Register ra);
-+  BufferOffset as_cnttzw(Register rd, Register ra);
-+  BufferOffset as_popcntd(Register ra, Register rs);
-+  BufferOffset as_popcntw(Register ra, Register rs);
-+  // POWER10 byte-reverse doubleword: ra = bswap64(rs). 1 insn replacing the
-+  // POWER9 mtvsrd / xxbrd / mfvsrd round-trip in byteSwap64.
-+  BufferOffset as_brd(Register ra, Register rs);
-+  // POWER10 byte-reverse each halfword (4 halfwords) / each word (2 words)
-+  // in the 64-bit doubleword. The wasm/asm caller usually masks or
-+  // sign-extends the low halfword/word afterwards.
-+  BufferOffset as_brh(Register ra, Register rs);
-+  BufferOffset as_brw(Register ra, Register rs);
-+
-+  // Bit operations (logical, three-register).
-+  BufferOffset as_and_(Register rd, Register rs, Register rb);
-+  BufferOffset as_and__rc(Register rd, Register rs, Register rb);
-+  BufferOffset as_nor(Register rd, Register rs, Register rb);
-+  BufferOffset as_or_(Register rd, Register rs, Register rb);
-+  BufferOffset as_xor_(Register rd, Register rs, Register rb);
-+  BufferOffset as_slw(Register rd, Register rs, Register rb);
-+  BufferOffset as_srw(Register rd, Register rs, Register rb);
-+  BufferOffset as_sraw(Register rd, Register rs, Register rb);
-+  BufferOffset as_sld(Register rd, Register rs, Register rb);
-+  BufferOffset as_srd(Register rd, Register rs, Register rb);
-+  BufferOffset as_srad(Register rd, Register rs, Register rb);
-+
-+  // Bit operations (logical, immediate).
-+  BufferOffset as_ori(Register rd, Register ra, uint16_t im);
-+  BufferOffset as_oris(Register rd, Register ra, uint16_t im);
-+  BufferOffset as_xori(Register rd, Register ra, uint16_t im);
-+  BufferOffset as_xoris(Register rd, Register ra, uint16_t im);
-+  BufferOffset as_andi_rc(Register rd, Register ra, uint16_t im);
-+
-+  // Sign extension.
-+  BufferOffset as_extsb(Register rd, Register rs);
-+  BufferOffset as_extsh(Register rd, Register rs);
-+  BufferOffset as_extsw(Register rd, Register rs);
-+  BufferOffset as_extsw_rc(Register rd, Register rs);
-+
-+  // Shift/rotate with immediates.
-+  BufferOffset as_srawi(Register id, Register rs, uint8_t n);
-+  BufferOffset as_sradi(Register rd, Register rs, int n);
-+  BufferOffset as_rldcl(Register ra, Register rs, Register rb, uint8_t mb);
-+  BufferOffset as_rldicl(Register ra, Register rs, uint8_t sh, uint8_t mb);
-+  BufferOffset as_rldicl_rc(Register ra, Register rs, uint8_t sh, uint8_t mb);
-+  BufferOffset as_rldicr(Register ra, Register rs, uint8_t sh, uint8_t mb);
-+  BufferOffset as_rldicr_rc(Register ra, Register rs, uint8_t sh, uint8_t mb);
-+  BufferOffset as_rlwinm(Register rd, Register rs, uint8_t sh, uint8_t mb,
-+                         uint8_t me);
-+  BufferOffset as_rlwinm_rc(Register rd, Register rs, uint8_t sh, uint8_t mb,
-+                            uint8_t me);
-+  BufferOffset as_rlwimi(Register rd, Register rs, uint8_t sh, uint8_t mb,
-+                         uint8_t me);
-+  BufferOffset as_rldimi(Register rd, Register rs, uint8_t sh, uint8_t mb);
-+  BufferOffset as_rlwnm(Register rd, Register rs, Register rb, uint8_t mb,
-+                        uint8_t me);
-+
-+  // Integer loads (D-form).
-+  BufferOffset as_lbz(Register rd, Register rb, int16_t off);
-+  BufferOffset as_lha(Register rd, Register rb, int16_t off);
-+  BufferOffset as_lhz(Register rd, Register rb, int16_t off);
-+  BufferOffset as_lwa(Register rd, Register rb, int16_t off);
-+  BufferOffset as_lwz(Register rd, Register rb, int16_t off);
-+  BufferOffset as_ld(Register rd, Register rb, int16_t off);
-+
-+  // Integer stores (D-form).
-+  BufferOffset as_stb(Register rd, Register rb, int16_t off);
-+  BufferOffset as_sth(Register rd, Register rb, int16_t off);
-+  BufferOffset as_stw(Register rd, Register rb, int16_t off);
-+  BufferOffset as_std(Register rd, Register rb, int16_t off);
-+  BufferOffset as_stdu(Register rd, Register rb, int16_t off);
-+
-+  // Integer loads/stores (X-form, indexed).
-+  BufferOffset as_lbzx(Register rd, Register ra, Register rb);
-+  BufferOffset as_lhax(Register rd, Register ra, Register rb);
-+  BufferOffset as_lhzx(Register rd, Register ra, Register rb);
-+  BufferOffset as_lwzx(Register rd, Register ra, Register rb);
-+  // X-form sign-extending word load. Single-insn equivalent of lwzx + extsw.
-+  BufferOffset as_lwax(Register rd, Register ra, Register rb);
-+  BufferOffset as_lwarx(Register rd, Register ra, Register rb);
-+  BufferOffset as_lbarx(Register rd, Register ra, Register rb);
-+  BufferOffset as_lharx(Register rd, Register ra, Register rb);
-+  BufferOffset as_ldx(Register rd, Register ra, Register rb);
-+  BufferOffset as_ldarx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stbx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stbcx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stwx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stwbrx(Register rd, Register ra, Register rb);
-+  BufferOffset as_sthx(Register rd, Register ra, Register rb);
-+  BufferOffset as_sthcx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stdx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stdcx(Register rd, Register ra, Register rb);
-+  BufferOffset as_stwcx(Register rd, Register ra, Register rb);
-+
-+  // Integer select.
-+  // POWER10 (ISA 3.1). Set RT = 1/0 based on a CR bit.
-+  BufferOffset as_setbc(Register rt, uint16_t bc, CRegisterID cr);
-+  BufferOffset as_setbcr(Register rt, uint16_t bc, CRegisterID cr);
-+  BufferOffset as_isel(Register rt, Register ra, Register rb, uint16_t rc,
-+                       CRegisterID cr = cr0);
-+  BufferOffset as_isel0(Register rt, Register ra, Register rb, uint16_t rc,
-+                        CRegisterID cr = cr0);
-+
-+  // FP compare.
-+  BufferOffset as_fcmpu(CRegisterID cr, FloatRegister ra, FloatRegister rb);
-+  BufferOffset as_fcmpu(FloatRegister ra, FloatRegister rb);
-+
-+  // FP arithmetic (two-source).
-+  BufferOffset as_fadd(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fadds(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fsub(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fsubs(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fdiv(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fdivs(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fmul(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fmuls(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  BufferOffset as_fcpsgn(FloatRegister rd, FloatRegister ra, FloatRegister rc);
-+  // FP unary.
-+  BufferOffset as_fabs(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fneg(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fmr(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fsqrt(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fsqrts(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_frsp(FloatRegister rd, FloatRegister rs);
-+
-+  // FP conversions.
-+  BufferOffset as_fcfid(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fcfids(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fcfidu(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fcfidus(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fctid(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fctidz(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fctiduz(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_fctiwz(FloatRegister rd, FloatRegister rs);
-+
-+  // FP rounding.
-+  BufferOffset as_frim(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_frip(FloatRegister rd, FloatRegister rs);
-+  BufferOffset as_friz(FloatRegister rd, FloatRegister rs);
-+
-+  // FP loads (D-form).
-+  BufferOffset as_lfd(FloatRegister rd, Register rb, int16_t off);
-+  BufferOffset as_lfs(FloatRegister rd, Register rb, int16_t off);
-+
-+  // FP stores (D-form).
-+  BufferOffset as_stfd(FloatRegister rd, Register rb, int16_t off);
-+  BufferOffset as_stfs(FloatRegister rd, Register rb, int16_t off);
-+  BufferOffset as_stfdu(FloatRegister rd, Register rb, int16_t off);
-+  BufferOffset as_stfsu(FloatRegister rd, Register rb, int16_t off);
-+
-+  // FP loads/stores (X-form, indexed).
-+  BufferOffset as_lfdx(FloatRegister rd, Register ra, Register rb);
-+  BufferOffset as_lfsx(FloatRegister rd, Register ra, Register rb);
-+  BufferOffset as_lfiwax(FloatRegister rd, Register ra, Register rb);
-+  BufferOffset as_stfdx(FloatRegister rd, Register ra, Register rb);
-+  BufferOffset as_stfsx(FloatRegister rd, Register ra, Register rb);
-+
-+  // FPSCR operations.
-+  BufferOffset as_mtfsb0(uint8_t bt);
-+  BufferOffset as_mcrfs(CRegisterID bf, uint8_t bfa);
-+
-+  // VSX (FPR-only subset).
-+  BufferOffset as_mfvsrd(Register ra, FloatRegister xs);
-+  BufferOffset as_mtvsrd(FloatRegister xs, Register ra);
-+  // POWER8+ (ISA 2.07). Sign-extending move of RA's low 32 bits to FPR.
-+  BufferOffset as_mtvsrwa(FloatRegister xs, Register ra);
-+  BufferOffset as_mtvsrwz(FloatRegister xs, Register ra);
-+  BufferOffset as_mtvsrws(FloatRegister xs, Register ra);
-+  BufferOffset as_xxbrd(FloatRegister xt, FloatRegister xb);
-+  // POWER9 scalar VSX max/min with Java/JavaScript semantics (matches
-+  // ECMA-262 Math.max / Math.min). Operate on FPR-space (encoding 0..31).
-+  BufferOffset as_xsmaxjdp(FloatRegister xt, FloatRegister xa,
-+                           FloatRegister xb);
-+  BufferOffset as_xsminjdp(FloatRegister xt, FloatRegister xa,
-+                           FloatRegister xb);
-+  BufferOffset as_xscvdpspn(FloatRegister xt, FloatRegister xb);
-+  BufferOffset as_xscvspdpn(FloatRegister xt, FloatRegister xb);
-+  // POWER9 (ISA 3.0) scalar FP16 conversions.
-+  BufferOffset as_xscvdphp(FloatRegister xt, FloatRegister xb);
-+  BufferOffset as_xscvhpdp(FloatRegister xt, FloatRegister xb);
-+  // POWER9 (ISA 3.0) scalar extract biased exponent.
-+  BufferOffset as_xsxexpdp(FloatRegister xt, FloatRegister xb);
-+  // POWER9 (ISA 3.0) scalar FP16 load/store, X-form indexed.
-+  BufferOffset as_lxsihzx(FloatRegister xt, Register ra, Register rb);
-+  BufferOffset as_stxsihx(FloatRegister xs, Register ra, Register rb);
-+
-+  // VSX SIMD load/store (X-form, indexed).
-+  BufferOffset as_lxvx(FloatRegister xt, Register ra, Register rb);
-+  BufferOffset as_stxvx(FloatRegister xs, Register ra, Register rb);
-+  BufferOffset as_lxvd2x(FloatRegister xt, Register ra, Register rb);
-+  BufferOffset as_stxvd2x(FloatRegister xs, Register ra, Register rb);
-+
-+  // VMX SIMD load/store (X-form, indexed). Take a raw VR number (0-31)
-+  // because VR20-VR31 are outside the FloatRegister encoding (which only
-+  // covers VSR0-31 = f0-f31). Used by the JIT trampoline to save/restore
-+  // the ELFv2 callee-saved VR20-VR31. EA is force-aligned to 16 bytes
-+  // (low 4 bits of the address are ignored), so the slot's alignment
-+  // matters for layout but not for trap avoidance.
-+  BufferOffset as_lvx(uint8_t vrt, Register ra, Register rb);
-+  BufferOffset as_stvx(uint8_t vrs, Register ra, Register rb);
-+
-+  // VSX SIMD register operations (XX3-form / XX1-form / XX2-form).
-+  BufferOffset as_xxlor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
-+
-+  // VSX bitwise operations (XX3-form).
-+  BufferOffset as_xxland(FloatRegister xt, FloatRegister xa, FloatRegister xb);
-+  BufferOffset as_xxlxor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
-+  BufferOffset as_xxlnor(FloatRegister xt, FloatRegister xa, FloatRegister xb);
-+  BufferOffset as_xxlandc(FloatRegister xt, FloatRegister xa, FloatRegister xb);
-+  BufferOffset as_xxsel(FloatRegister xt, FloatRegister xa, FloatRegister xb,
-+                        FloatRegister xc);
-+
-+  // VMX integer arithmetic (VR0-31 = VSR32-63 only).
-+  // Callers must ensure operands are in VR space.
-+  BufferOffset as_vaddubm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vadduhm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vadduwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vaddudm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsububm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsubuhm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsubuwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsubudm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vaddsbs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vaddshs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vaddubs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vadduhs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsubsbs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsubshs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsububs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsubuhs(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vminsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vminsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vminsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxsd(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vminub(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vminuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vminuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxub(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmaxuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  // POWER9 (ISA 3.0): per-lane integer negate.
-+  BufferOffset as_vnegw(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vnegd(uint8_t vrt, uint8_t vrb);
-+  // POWER9 (ISA 3.0): addpcis rT, D.  Computes rT = (CIA + 4) + (D << 16).
-+  // D is a 16-bit signed immediate; DX-form splits D across three instruction
-+  // fields (d0[16..25] ∥ d1[11..15] ∥ d2[31]).  No LR clobber, no RAS hazard.
-+  BufferOffset as_addpcis(Register rt, int16_t d);
-+  // POWER10 (ISA 3.1) prefixed instructions. Each emits 8 bytes (prefix +
-+  // suffix) with a single nop inserted before iff the prefix would
-+  // straddle a 64-byte block. Caller must guarantee HasPOWER10().
-+  // imm34 is signed 34-bit; R=true selects PC-relative form (RA must be r0).
-+  // Returns the offset of the prefix word.
-+  BufferOffset as_paddi(Register rt, Register ra, int64_t imm34, bool R);
-+  BufferOffset as_pld(Register rt, Register ra, int64_t imm34, bool R);
-+  BufferOffset as_plxv(uint8_t xt, Register ra, int64_t imm34, bool R);
-+  // FP-target prefixed loads: plfd/plfs are MLS (Type=2) with suffix
-+  // opcodes 50 and 48. plfs widens single → double in the FPR
-+  // (matches non-prefixed lfs semantics).
-+  BufferOffset as_plfd(FloatRegister frt, Register ra, int64_t imm34,
-+                       bool R);
-+  BufferOffset as_plfs(FloatRegister frt, Register ra, int64_t imm34,
-+                       bool R);
-+  // Prefixed-store counterparts. Same prefix shape; suffix opcodes are
-+  // the D-form variants of std/stxv/stfd/stfs (61, 27, 54, 52).
-+  BufferOffset as_pstd(Register rs, Register ra, int64_t imm34, bool R);
-+  BufferOffset as_pstxv(uint8_t xs, Register ra, int64_t imm34, bool R);
-+  BufferOffset as_pstfd(FloatRegister frs, Register ra, int64_t imm34,
-+                        bool R);
-+  BufferOffset as_pstfs(FloatRegister frs, Register ra, int64_t imm34,
-+                        bool R);
-+
-+ private:
-+  // Emit a nop before a prefixed instruction iff the prefix would otherwise
-+  // start at offset 60 (mod 64) and the suffix would land in the next block.
-+  void ensurePrefixedAlignment();
-+
-+ public:
-+  BufferOffset as_vavgub(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vavguh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmuluwm(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulld(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  // VMX shift (VR0-31 only).
-+  BufferOffset as_vslb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vslh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vslw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsld(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrd(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrab(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrah(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsraw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsrad(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vslo(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vsro(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+
-+  // VMX integer compare (VR0-31 only).
-+  BufferOffset as_vcmpequb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpequh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpequw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpequd(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  // Record forms set CR6: LT = all-true, EQ = none-true.
-+  BufferOffset as_vcmpequb_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpequh_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpequw_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpequd_rc(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtsb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtsh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtsw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtsd(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtub(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpgtud(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  // POWER9 (ISA 3.0). NotEqual compare; no doubleword variant.
-+  BufferOffset as_vcmpneb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpneh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vcmpnew(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+
-+  // VSX float compare (XX3-form, VSR0-63).
-+  BufferOffset as_xvcmpeqsp(FloatRegister xt, FloatRegister xa,
-+                            FloatRegister xb);
-+  BufferOffset as_xvcmpgtsp(FloatRegister xt, FloatRegister xa,
-+                            FloatRegister xb);
-+  BufferOffset as_xvcmpgesp(FloatRegister xt, FloatRegister xa,
-+                            FloatRegister xb);
-+  BufferOffset as_xvcmpeqdp(FloatRegister xt, FloatRegister xa,
-+                            FloatRegister xb);
-+  BufferOffset as_xvcmpgtdp(FloatRegister xt, FloatRegister xa,
-+                            FloatRegister xb);
-+  BufferOffset as_xvcmpgedp(FloatRegister xt, FloatRegister xa,
-+                            FloatRegister xb);
-+
-+  // VSX float arithmetic (XX3-form binary, XX2-form unary).
-+  // The helper macros intentionally omit the trailing semicolon so that each
-+  // use is terminated at the call site and formats like a normal declaration.
-+#define DECL_VSX_BIN(op) \
-+  BufferOffset as_##op(FloatRegister xt, FloatRegister xa, FloatRegister xb)
-+  DECL_VSX_BIN(xvaddsp);
-+  DECL_VSX_BIN(xvadddp);
-+  DECL_VSX_BIN(xvsubsp);
-+  DECL_VSX_BIN(xvsubdp);
-+  DECL_VSX_BIN(xvmulsp);
-+  DECL_VSX_BIN(xvmuldp);
-+  DECL_VSX_BIN(xvdivsp);
-+  DECL_VSX_BIN(xvdivdp);
-+  DECL_VSX_BIN(xvminsp);
-+  DECL_VSX_BIN(xvmindp);
-+  DECL_VSX_BIN(xvmaxsp);
-+  DECL_VSX_BIN(xvmaxdp);
-+  DECL_VSX_BIN(xvmaddasp);
-+  DECL_VSX_BIN(xvmaddadp);
-+  DECL_VSX_BIN(xvnmsubasp);
-+  DECL_VSX_BIN(xvnmsubadp);
-+#undef DECL_VSX_BIN
-+#define DECL_VSX_UN(op) \
-+  BufferOffset as_##op(FloatRegister xt, FloatRegister xb)
-+  DECL_VSX_UN(xvabssp);
-+  DECL_VSX_UN(xvabsdp);
-+  DECL_VSX_UN(xvnegsp);
-+  DECL_VSX_UN(xvnegdp);
-+  DECL_VSX_UN(xvsqrtsp);
-+  DECL_VSX_UN(xvsqrtdp);
-+  DECL_VSX_UN(xvrspip);
-+  DECL_VSX_UN(xvrdpip);
-+  DECL_VSX_UN(xvrspim);
-+  DECL_VSX_UN(xvrdpim);
-+  DECL_VSX_UN(xvrspiz);
-+  DECL_VSX_UN(xvrdpiz);
-+  DECL_VSX_UN(xvrspic);
-+  DECL_VSX_UN(xvrdpic);
-+  DECL_VSX_UN(xvcvsxwsp);
-+  DECL_VSX_UN(xvcvuxwsp);
-+  DECL_VSX_UN(xvcvsxwdp);
-+  DECL_VSX_UN(xvcvuxwdp);
-+  DECL_VSX_UN(xvcvspsxws);
-+  DECL_VSX_UN(xvcvspuxws);
-+  DECL_VSX_UN(xvcvdpsxws);
-+  DECL_VSX_UN(xvcvdpuxws);
-+  DECL_VSX_UN(xvcvdpsp);
-+  DECL_VSX_UN(xvcvspdp);
-+#undef DECL_VSX_UN
-+
-+  // VMX widen/narrow/merge/pack (VR0-31 only).
-+  BufferOffset as_vupkhsb(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vupklsb(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vupkhsh(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vupklsh(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vupkhsw(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vupklsw(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vpkshss(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vpkswss(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vpkshus(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vpkswus(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmrghb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmrghh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmrghw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmrglb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmrglh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmrglw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+
-+  // VMX extended multiply (VR0-31 only).
-+  BufferOffset as_vmulesb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulosb(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmuleub(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmuloub(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulesh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulosh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmuleuh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulouh(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulesw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulosw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmuleuw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vmulouw(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  BufferOffset as_vpopcntb(uint8_t vrt, uint8_t vrb);
-+  BufferOffset as_vperm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
-+  // POWER8+ (ISA 2.07). VX-form bit-permute. See PPC_vbpermq comment.
-+  BufferOffset as_vbpermq(uint8_t vrt, uint8_t vra, uint8_t vrb);
-+  // POWER10 (ISA 3.1) Vector Extract Mask. RT is a GPR.
-+  BufferOffset as_vextractbm(Register rt, FloatRegister vrb);
-+  BufferOffset as_vextracthm(Register rt, FloatRegister vrb);
-+  BufferOffset as_vextractwm(Register rt, FloatRegister vrb);
-+  BufferOffset as_vextractdm(Register rt, FloatRegister vrb);
-+  // POWER10 (ISA 3.1) Vector Insert from GPR at immediate byte offset.
-+  // UIM range: vinsw 0..12, vinsd 0..8 (caller must enforce).
-+  BufferOffset as_vinsw(FloatRegister vrt, Register rb, uint8_t uim);
-+  BufferOffset as_vinsd(FloatRegister vrt, Register rb, uint8_t uim);
-+  // POWER10 (ISA 3.1) Vector Insert byte / halfword from GPR with the
-+  // byte position supplied by another GPR (RA & 0xF for vinsbrx,
-+  // RA & 0xE for vinshrx). "rx" = right-indexed = LE-natural.
-+  BufferOffset as_vinsbrx(FloatRegister vrt, Register ra, Register rb);
-+  BufferOffset as_vinshrx(FloatRegister vrt, Register ra, Register rb);
-+  // POWER9 (ISA 3.0) Vector Insert byte / halfword from VR at immediate
-+  // byte position. UIM range: vinsertb 0..15, vinserth 0..14
-+  // (caller must enforce; vinserth UIM is in bytes, even-aligned).
-+  BufferOffset as_vinsertb(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
-+  BufferOffset as_vinserth(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
-+  // POWER9 (ISA 3.0) Vector Extract byte / halfword from VR at immediate
-+  // BE byte position. UIM range: vextractub 0..15, vextractuh 0..14
-+  // (caller must enforce; vextractuh UIM is in bytes, even-aligned). The
-+  // extracted byte/halfword lands at BE byte 7 of VRT, with the rest
-+  // zeroed — so a subsequent mfvsrd reads it as the low byte/halfword
-+  // of the GPR with implicit zero-extension.
-+  BufferOffset as_vextractub(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
-+  BufferOffset as_vextractuh(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
-+  // VX-form with 5-bit signed immediate splat: each lane of VRT is
-+  // set to sign_extend(SIMM5) (range [-16, 15]) at byte/halfword/word granularity.
-+  BufferOffset as_vspltisb(uint8_t vrt, int8_t simm5);
-+  BufferOffset as_vspltish(uint8_t vrt, int8_t simm5);
-+  BufferOffset as_vspltisw(uint8_t vrt, int8_t simm5);
-+
-+  // VA-form ternary VMX instructions.
-+  BufferOffset as_vmladduhm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
-+  BufferOffset as_vmhraddshs(uint8_t vrt, uint8_t vra, uint8_t vrb,
-+                             uint8_t vrc);
-+  BufferOffset as_vmsumshm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
-+  BufferOffset as_vmsumuhm(uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc);
-+  BufferOffset as_xxpermdi(FloatRegister xt, FloatRegister xa, FloatRegister xb,
-+                           uint8_t dm);
-+  BufferOffset as_xxspltw(FloatRegister xt, FloatRegister xb, uint8_t uim);
-+  // POWER9 (ISA 3.0). Splat 8-bit immediate to all 16 bytes of an FPR-encoded
-+  // VSR (TX bit forced 0). XX1-form, no Rc.
-+  BufferOffset as_xxspltib(FloatRegister xt, uint8_t imm8);
-+  BufferOffset as_xxinsertw(FloatRegister xt, FloatRegister xb, uint8_t uim);
-+  BufferOffset as_xxextractuw(FloatRegister xt, FloatRegister xb, uint8_t uim);
-+  BufferOffset as_mtvsrdd(FloatRegister xt, Register ra, Register rb);
-+  BufferOffset as_mfvsrld(Register rt, FloatRegister xs);
-+
-+  // VMX vector operations.
-+  BufferOffset as_vspltb(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
-+  BufferOffset as_vsplth(FloatRegister vrt, FloatRegister vrb, uint8_t uim);
-+  BufferOffset as_vsldoi(FloatRegister vrt, FloatRegister vra,
-+                         FloatRegister vrb, uint8_t shb);
-+
-+  // Barrier and sync instructions.
-+  BufferOffset as_lwsync();
-+  BufferOffset as_sync();
-+  BufferOffset as_isync();
-+
-+  // Convenience pseudo-instructions.
-+  BufferOffset xs_trap();
-+  BufferOffset xs_trap_tagged(TrapTag tag);
-+  BufferOffset xs_mr(Register rd, Register ra);
-+  BufferOffset xs_mtctr(Register ra);
-+  BufferOffset xs_mtlr(Register ra);
-+  BufferOffset xs_mflr(Register rd);
-+  BufferOffset xs_mtcr(Register rs);
-+  BufferOffset xs_mfxer(Register ra);
-+  BufferOffset xs_mtxer(Register ra);
-+  BufferOffset xs_li(Register rd, int16_t im);
-+  BufferOffset xs_lis(Register rd, int16_t im);
-+  BufferOffset x_subi(Register rd, Register ra, int16_t im);
-+  BufferOffset x_not(Register rd, Register ra);
-+  BufferOffset x_slwi(Register rd, Register rs, int n);
-+  BufferOffset x_sldi(Register rd, Register rs, int n);
-+  BufferOffset x_srwi(Register rd, Register rs, int n);
-+  BufferOffset x_srdi(Register rd, Register rs, int n);
-+  BufferOffset x_insertbits0_15(Register rd, Register rs);
-+  BufferOffset x_bit_value(Register rd, Register rs, unsigned bit);
-+  BufferOffset x_sr_mulli(Register rd, Register ra, int16_t im);
-+
-+  // --- Label operations.
-+  void bind(Label* label) { bind(label, nextOffset()); }
-+  void bind(Label* label, BufferOffset boff);
-+  void bind(InstImm* inst, uintptr_t branch, uintptr_t target);
-+  void bind(CodeLabel* label) { label->target()->bind(currentOffset()); }
-+  uint32_t currentOffset() { return nextOffset().getOffset(); }
-+  void retarget(Label* label, Label* target);
-+  void call(Label* label);
-+  void call(void* target);
-+
-+  void as_break(uint32_t code);
-+
-+  // --- Static capability queries.
-+  static bool SupportsFloatingPoint() { return true; }
-+  static bool SupportsWasmSimd() { return true; }
-+  static bool SupportsUnalignedAccesses() { return true; }
-+  static bool SupportsFastUnalignedFPAccesses() { return true; }
-+  // POWER9 has scalar FP16 hardware (xscvdphp/xscvhpdp); POWER8 doesn't.
-+  // Runtime-gate like x86's SupportsFloat32To16 (which keys off F16C).
-+  static bool SupportsFloat64To16() { return HasPOWER9(); }
-+  static bool SupportsFloat32To16() { return HasPOWER9(); }
-+  // Reports whether |mode| can be implemented with a single rounding
-+  // instruction on this target.
-+  static bool HasRoundInstruction(RoundingMode mode) {
-+    // friz (trunc), frip (ceil) and frim (floor) all produce correct results.
-+    // frin (round-to-nearest) does NOT implement IEEE ties-to-even rounding,
-+    // so NearestTiesToEven is reported as unsupported.
-+    switch (mode) {
-+      case RoundingMode::TowardsZero:
-+      case RoundingMode::Up:
-+      case RoundingMode::Down:
-+        return true;
-+      default:
-+        return false;
-+    }
-+  }
-+
-+ protected:
-+  InstImm invertBranch(InstImm branch, BOffImm16 skipOffset);
-+  // Appends a RelativePatch describing the jump at |src| to |jumps_|; a
-+  // failed append clears |enoughMemory_|. Jumps whose kind is JITCODE also
-+  // get a relocation entry written for |src|.
-+  void addPendingJump(BufferOffset src, ImmPtr target, RelocationKind kind) {
-+    if (!jumps_.append(RelativePatch(src, target.value, kind))) {
-+      enoughMemory_ = false;
-+    }
-+    if (kind != RelocationKind::JITCODE) {
-+      return;
-+    }
-+    writeRelocation(src);
-+  }
-+  // Registers a JumpImmediate code label whose patch site is bound to the
-+  // offset of |src| and whose target is bound to the offset of |dst|.
-+  void addLongJump(BufferOffset src, BufferOffset dst) {
-+    CodeLabel longJump;
-+    longJump.setLinkMode(CodeLabel::JumpImmediate);
-+    longJump.target()->bind(dst.getOffset());
-+    longJump.patchAt()->bind(src.getOffset());
-+    addCodeLabel(std::move(longJump));
-+  }
-+
-+ public:
-+  void flushBuffer() { m_buffer.flushPool(); }
-+  void comment(const char* msg) { spew("; %s", msg); }
-+  static uint32_t NopSize() { return 4; }
-+
-+  // --- Static patching API.
-+  static uint64_t ExtractLoad64Value(Instruction* inst0);
-+  static void UpdateLoad64Value(Instruction* inst0, uint64_t value);
-+  static void WriteLoad64Instructions(Instruction* inst0, Register reg,
-+                                      uint64_t value);
-+
-+  static void PatchWrite_Imm32(CodeLocationLabel label, Imm32 imm);
-+  static uint8_t* NextInstruction(uint8_t* instruction,
-+                                  uint32_t* count = nullptr);
-+  static void ToggleToJmp(CodeLocationLabel inst_);
-+  static void ToggleToCmp(CodeLocationLabel inst_);
-+
-+  void verifyHeapAccessDisassembly(uint32_t begin, uint32_t end,
-+                                   const Disassembler::HeapAccess& ha) {}
-+
-+  // --- Public patching API (required by shared code).
-+  static void Bind(uint8_t* rawCode, const CodeLabel& label);
-+  void processCodeLabels(uint8_t* rawCode);
-+
-+  static void TraceJumpRelocations(JSTracer* trc, JitCode* code,
-+                                   CompactBufferReader& reader);
-+  static void TraceDataRelocations(JSTracer* trc, JitCode* code,
-+                                   CompactBufferReader& reader);
-+
-+  void executableCopy(uint8_t* buffer);
-+
-+  static uint32_t PatchWrite_NearCallSize();
-+  static void PatchWrite_NearCall(CodeLocationLabel start,
-+                                  CodeLocationLabel toCall);
-+  static void PatchDataWithValueCheck(CodeLocationLabel label, ImmPtr newValue,
-+                                      ImmPtr expectedValue);
-+  static void PatchDataWithValueCheck(CodeLocationLabel label,
-+                                      PatchedImmPtr newValue,
-+                                      PatchedImmPtr expectedValue);
-+  static void ToggleCall(CodeLocationLabel inst_, bool enabled);
-+
-+ private:
-+  GeneralRegisterSet scratch_register_list_;
-+
-+ public:
-+  GeneralRegisterSet* GetScratchRegisterList() {
-+    return &scratch_register_list_;
-+  }
-+};  // Assembler
-+
-+// True when |access| declares a non-zero alignment that is smaller than its
-+// byte size. An alignment of zero is never reported as unaligned.
-+inline bool IsUnaligned(const wasm::MemoryAccessDesc& access) {
-+  const auto alignment = access.align();
-+  return alignment != 0 && alignment < access.byteSize();
-+}
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+// Whether an Imm32 fits in an unsigned 16-bit immediate, i.e. the upper
-+// 16 bits of its |value| are all clear.
-+#define PPC_IMM_OK_U(x) (MOZ_LIKELY(((x).value & 0xffff0000) == 0))
-+
-+// Whether an Imm32 fits in a signed 16-bit immediate, i.e. bits 15..31 of
-+// its |value| are either all clear (non-negative) or all set (negative).
-+#define PPC_IMM_OK_S(x)                        \
-+  (MOZ_LIKELY(((x).value & 0xffff8000) == 0 || \
-+              ((x).value & 0xffff8000) == 0xffff8000))
-+
-+// Whether the offset part of an Address fits in a signed 16-bit immediate
-+// (same bit test as PPC_IMM_OK_S, applied to |offset|).
-+#define PPC_OFFS_OK(x)                          \
-+  (MOZ_LIKELY(((x).offset & 0xffff8000) == 0 || \
-+              ((x).offset & 0xffff8000) == 0xffff8000))
-+
-+// Same test applied to |offset + incr|, for callers that will also access
-+// memory |incr| bytes past the offset (for paired loads).
-+#define PPC_OFFS_INCR_OK(x, incr)                          \
-+  (MOZ_LIKELY((((x).offset + (incr)) & 0xffff8000) == 0 || \
-+              (((x).offset + (incr)) & 0xffff8000) == 0xffff8000))
-+
-+#endif /* jit_ppc64_Assembler_ppc64_h */
-diff --git a/js/src/jit/ppc64/CodeGenerator-ppc64.cpp b/js/src/jit/ppc64/CodeGenerator-ppc64.cpp
-new file mode 100644
-index 000000000000..0a436fb1201a
---- /dev/null
-+++ b/js/src/jit/ppc64/CodeGenerator-ppc64.cpp
-@@ -0,0 +1,3647 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/CodeGenerator-ppc64.h"
-+
-+#include "mozilla/MathAlgorithms.h"
-+
-+#include <bit>
-+
-+#include "builtin/Number.h"
-+#include "jit/CodeGenerator.h"
-+#include "jit/InlineScriptTree.h"
-+#include "jit/JitRuntime.h"
-+#include "jit/MIR-wasm.h"
-+#include "jit/MIR.h"
-+#include "jit/MIRGraph.h"
-+#include "vm/JSContext.h"
-+#include "vm/Realm.h"
-+#include "vm/Shape.h"
-+
-+#include "jit/shared/CodeGenerator-shared-inl.h"
-+#include "vm/JSScript-inl.h"
-+
-+using namespace js;
-+using namespace js::jit;
-+
-+using JS::GenericNaN;
-+using mozilla::NegativeInfinity;
-+
-+namespace js {
-+namespace jit {
-+
-+// Forwards all arguments unchanged to the CodeGeneratorShared constructor;
-+// no additional state is initialized here.
-+CodeGeneratorPPC64::CodeGeneratorPPC64(MIRGenerator* gen, LIRGraph* graph,
-+                                       MacroAssembler* masm,
-+                                       const wasm::CodeMetadata* codeMeta)
-+    : CodeGeneratorShared(gen, graph, masm, codeMeta) {}
-+
-+// Converts an allocation into an assembler Operand. General-purpose and
-+// floating-point register allocations become register operands; every other
-+// allocation is converted through ToAddress().
-+Operand CodeGeneratorPPC64::ToOperand(const LAllocation& a) {
-+  if (!a.isGeneralReg() && !a.isFloatReg()) {
-+    return Operand(ToAddress(a));
-+  }
-+  return a.isGeneralReg() ? Operand(a.toGeneralReg()->reg())
-+                          : Operand(a.toFloatReg()->reg());
-+}
-+
-+// Pointer convenience overload: dereferences |a| and defers to the
-+// reference overload. |a| must not be null.
-+Operand CodeGeneratorPPC64::ToOperand(const LAllocation* a) {
-+  return ToOperand(*a);
-+}
-+
-+// Builds the MoveOperand for an allocation. Registers map to register
-+// operands; anything else becomes an address operand, flagged as an
-+// effective address for stack areas and as plain memory otherwise.
-+MoveOperand CodeGeneratorPPC64::toMoveOperand(LAllocation a) const {
-+  if (a.isFloatReg()) {
-+    return MoveOperand(ToFloatRegister(a));
-+  }
-+  if (a.isGeneralReg()) {
-+    return MoveOperand(ToRegister(a));
-+  }
-+
-+  const Address address = ToAddress(a);
-+  // Non-register operands are expected to sit at 4-byte-aligned offsets.
-+  MOZ_ASSERT((address.offset & 3) == 0);
-+  if (a.isStackArea()) {
-+    return MoveOperand(address, MoveOperand::Kind::EffectiveAddress);
-+  }
-+  return MoveOperand(address, MoveOperand::Kind::Memory);
-+}
-+
-+// Routes every branch that targets |label| to a freshly created out-of-line
-+// stub which records |snapshot|'s offset on the stack and jumps to the
-+// shared deopt label. |label| must already be used and must not be bound
-+// (unless the assembler is out of memory).
-+void CodeGeneratorPPC64::bailoutFrom(Label* label, LSnapshot* snapshot) {
-+  MOZ_ASSERT_IF(!masm.oom(), label->used());
-+  MOZ_ASSERT_IF(!masm.oom(), !label->bound());
-+
-+  encode(snapshot);
-+
-+  InlineScriptTree* tree = snapshot->mir()->block()->trackedTree();
-+  auto* ool = new (alloc()) LambdaOutOfLineCode([=, this](OutOfLineCode& ool) {
-+    // Push snapshotOffset and make sure stack is aligned: a full
-+    // sizeof(Value) slot is reserved and the offset is stored at its base.
-+    masm.subPtr(Imm32(sizeof(Value)), StackPointer);
-+    masm.storePtr(ImmWord(snapshot->snapshotOffset()),
-+                  Address(StackPointer, 0));
-+    masm.jump(&deoptLabel_);
-+  });
-+  // The stub is attributed to the start of the tracked tree's script.
-+  addOutOfLineCode(ool,
-+                   new (alloc()) BytecodeSite(tree, tree->script()->code()));
-+
-+  // Redirect all pending uses of |label| to the stub's entry point.
-+  masm.retarget(label, ool->entry());
-+}
-+
-+// Unconditional bailout: emits a jump to a local label and hands that label
-+// to bailoutFrom(), which retargets it to the snapshot's out-of-line stub.
-+void CodeGeneratorPPC64::bailout(LSnapshot* snapshot) {
-+  Label label;
-+  masm.jump(&label);
-+  bailoutFrom(&label, snapshot);
-+}
-+
-+// Bails out through |snapshot| when the branchTest32(Zero, lhs, 0xFF) test
-+// succeeds, i.e. presumably when the low byte of |lhs| is zero (a false
-+// boolean) -- confirm against branchTest32's semantics.
-+void CodeGeneratorPPC64::bailoutIfFalseBool(Register lhs, LSnapshot* snapshot) {
-+  Label bail;
-+  masm.branchTest32(Assembler::Zero, lhs, Imm32(0xFF), &bail);
-+  bailoutFrom(&bail, snapshot);
-+}
-+
-+// Emits the shared out-of-line code and then, if any bailout stub jumped to
-+// |deoptLabel_|, the common deopt tail. Returns false if the shared
-+// generation failed or the assembler ran out of memory.
-+bool CodeGeneratorPPC64::generateOutOfLineCode() {
-+  if (!CodeGeneratorShared::generateOutOfLineCode()) {
-+    return false;
-+  }
-+
-+  if (deoptLabel_.used()) {
-+    masm.bind(&deoptLabel_);
-+
-+    // Frame size is stored in LR and pushed by GenerateBailoutThunk
-+    // (via PushBailoutFrame -> pushReturnAddress -> mflr).
-+    {
-+      // The scratch register is only needed to move the constant into LR.
-+      UseScratchRegisterScope temps(masm);
-+      Register scratch = temps.Acquire();
-+      masm.movePtr(ImmWord(frameSize()), scratch);
-+      masm.xs_mtlr(scratch);
-+    }
-+
-+    // Tail-jump into the runtime's generic bailout handler.
-+    TrampolinePtr handler = gen->jitRuntime()->getGenericBailoutHandler();
-+    masm.jump(handler);
-+  }
-+
-+  return !masm.oom();
-+}
-+
-+// Unconditionally jumps to |block|, skipping over any trivial blocks first.
-+void CodeGeneratorPPC64::branchToBlock(MBasicBlock* block) {
-+  masm.jump(skipTrivialBlocks(block)->lir()->label());
-+}
-+
-+// Branches to |mir| (after skipping trivial blocks) when the double
-+// comparison |lhs cond rhs| holds.
-+void CodeGeneratorPPC64::branchToBlock(Assembler::DoubleCondition cond,
-+                                       FloatRegister lhs, FloatRegister rhs,
-+                                       MBasicBlock* mir) {
-+  masm.branchDouble(cond, lhs, rhs, skipTrivialBlocks(mir)->lir()->label());
-+}
-+
-+// Format-aware variant: DoubleFloat uses a double comparison, every other
-+// format uses a single-precision comparison.
-+void CodeGeneratorPPC64::branchToBlock(Assembler::FloatFormat fmt,
-+                                       Assembler::DoubleCondition cond,
-+                                       FloatRegister lhs, FloatRegister rhs,
-+                                       MBasicBlock* mir) {
-+  Label* target = skipTrivialBlocks(mir)->lir()->label();
-+  if (fmt != Assembler::DoubleFloat) {
-+    masm.branchFloat(cond, lhs, rhs, target);
-+    return;
-+  }
-+  masm.branchDouble(cond, lhs, rhs, target);
-+}
-+
-+// Out-of-line code object for a table switch. It carries the MIR node and
-+// the CodeLabel that marks where the jump table is emitted.
-+class OutOfLineTableSwitch : public OutOfLineCodeBase<CodeGeneratorPPC64> {
-+  MTableSwitch* mir_;
-+  CodeLabel jumpLabel_;
-+
-+  // Dispatches back to the code generator, which emits the table itself.
-+  // Marked override so a signature drift against the base class is caught
-+  // at compile time.
-+  void accept(CodeGeneratorPPC64* codegen) override {
-+    codegen->visitOutOfLineTableSwitch(this);
-+  }
-+
-+ public:
-+  explicit OutOfLineTableSwitch(MTableSwitch* mir) : mir_(mir) {}
-+
-+  MTableSwitch* mir() const { return mir_; }
-+  CodeLabel* jumpLabel() { return &jumpLabel_; }
-+};
-+
-+// Emits the dispatch sequence for a table switch: rebases |index| by the
-+// switch's low bound, sends out-of-range indices to the default case, and
-+// otherwise jumps through the out-of-line jump table. |index| is modified
-+// in place when low() is non-zero; |base| is clobbered with the table
-+// address.
-+void CodeGeneratorPPC64::emitTableSwitchDispatch(MTableSwitch* mir,
-+                                                 Register index,
-+                                                 Register base) {
-+  Label* defaultcase = skipTrivialBlocks(mir->getDefault())->lir()->label();
-+
-+  // Rebase so that the first case corresponds to index 0.
-+  if (mir->low() != 0) {
-+    masm.subPtr(Imm32(mir->low()), index);
-+  }
-+
-+  // Unsigned compare: indices below low() wrap around and are also caught.
-+  int32_t cases = mir->numCases();
-+  masm.branchPtr(Assembler::AboveOrEqual, index, ImmWord(cases), defaultcase);
-+
-+  // The table itself is emitted later by visitOutOfLineTableSwitch().
-+  OutOfLineTableSwitch* ool = new (alloc()) OutOfLineTableSwitch(mir);
-+  addOutOfLineCode(ool, mir);
-+
-+  masm.mov(ool->jumpLabel(), base);
-+
-+  // Jump through the pointer-sized table entry selected by |index|.
-+  BaseIndex pointer(base, index, ScalePointer);
-+  masm.branchToComputedAddress(pointer);
-+}
-+
-+// Emits the invalidation epilogue: nop padding, the |invalidate_| entry
-+// point, a pushed return address, a patchable placeholder word, and a jump
-+// to the runtime's invalidation thunk.
-+void CodeGeneratorPPC64::generateInvalidateEpilogue() {
-+  // Pad with enough nops so that PatchWrite_NearCall on the last OSI point
-+  // cannot overlap the invalidation epilogue. The patch area is
-+  // PatchWrite_NearCallSize (40) bytes; the last OSI point could be right
-+  // before this epilogue.
-+  for (size_t i = 0; i < Assembler::PatchWrite_NearCallSize();
-+       i += Assembler::NopSize()) {
-+    masm.nop();
-+  }
-+
-+  masm.bind(&invalidate_);
-+
-+  // Push the return address (LR) onto the stack.
-+  masm.pushReturnAddress();
-+
-+  // Placeholder value (-1); the returned offset is kept in
-+  // |invalidateEpilogueData_| so the word can be patched later.
-+  invalidateEpilogueData_ = masm.pushWithPatch(ImmWord(uintptr_t(-1)));
-+
-+  TrampolinePtr thunk = gen->jitRuntime()->getInvalidationThunk();
-+  masm.jump(thunk);
-+}
-+
-+// Emits the jump table for a table switch: one code pointer per case,
-+// starting at a pointer-aligned address that |ool|'s jump label is bound to.
-+void CodeGeneratorPPC64::visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool) {
-+  MTableSwitch* mir = ool->mir();
-+
-+  // Table entries are pointers, so align the table start accordingly.
-+  masm.haltingAlign(sizeof(void*));
-+  masm.bind(ool->jumpLabel());
-+  masm.addCodeLabel(*ool->jumpLabel());
-+
-+  for (size_t i = 0; i < mir->numCases(); i++) {
-+    LBlock* caseblock = skipTrivialBlocks(mir->getCase(i))->lir();
-+    Label* caseheader = caseblock->label();
-+    uint32_t caseoffset = caseheader->offset();
-+
-+    // Write a code-pointer slot and register a label whose target is the
-+    // case block's offset.
-+    CodeLabel cl;
-+    masm.writeCodePointer(&cl);
-+    cl.target()->bind(caseoffset);
-+    masm.addCodeLabel(cl);
-+  }
-+}
-+
-+// Emits the out-of-line check for a wasm float-to-integer truncation,
-+// choosing the 32-bit or 64-bit masm helper from the destination type.
-+void CodeGeneratorPPC64::visitOutOfLineWasmTruncateCheck(
-+    OutOfLineWasmTruncateCheck* ool) {
-+  if (ool->toType() != MIRType::Int32) {
-+    // Only Int32 and Int64 destinations are expected here.
-+    MOZ_ASSERT(ool->toType() == MIRType::Int64);
-+    masm.outOfLineWasmTruncateToInt64Check(ool->input(), ool->output64(),
-+                                           ool->fromType(), ool->flags(),
-+                                           ool->rejoin(), ool->trapSiteDesc());
-+    return;
-+  }
-+
-+  masm.outOfLineWasmTruncateToInt32Check(ool->input(), ool->output(),
-+                                         ool->fromType(), ool->flags(),
-+                                         ool->rejoin(), ool->trapSiteDesc());
-+}
-+
-+// Emits a single divd computing |output| from |dividend| and |divisor|.
-+// |ins| is unused. No zero-divisor or overflow checks are emitted here --
-+// presumably the caller handles them; confirm.
-+void CodeGeneratorPPC64::emitBigIntPtrDiv(LBigIntPtrDiv* ins, Register dividend,
-+                                          Register divisor, Register output) {
-+  masm.as_divd(output, dividend, divisor);
-+}
-+
-+// Emits the remainder of |dividend| by |divisor| into |output|. |ins| is
-+// unused. POWER9 uses the dedicated modsd instruction; older CPUs fall
-+// back to a divide / multiply / subtract sequence.
-+void CodeGeneratorPPC64::emitBigIntPtrMod(LBigIntPtrMod* ins, Register dividend,
-+                                          Register divisor, Register output) {
-+  if (HasPOWER9()) {
-+    masm.as_modsd(output, dividend, divisor);
-+  } else {
-+    // remainder = dividend - (dividend / divisor) * divisor.
-+    // NOTE(review): |output| is written before |divisor| and |dividend| are
-+    // re-read, so this assumes |output| aliases neither input -- confirm
-+    // against the lowering for LBigIntPtrMod.
-+    masm.as_divd(output, dividend, divisor);
-+    masm.as_mulld(output, output, divisor);
-+    masm.as_subf(output, output, dividend);
-+  }
-+}
-+
-+// ===============================================================
-+// Visitors: Box/Unbox
-+
-+// Boxes the typed payload held in operand 0 into the instruction's output
-+// Value register.
-+void CodeGenerator::visitBox(LBox* box) {
-+  const LAllocation* payload = box->getOperand(0);
-+  TypedOrValueRegister source(box->type(), ToAnyRegister(payload));
-+  masm.moveValue(source, ToOutValue(box));
-+}
-+
-+// Unboxes a Value into |result| according to the MIR type. Fallible
-+// unboxes verify the tag and bail out on mismatch; infallible ones extract
-+// the payload directly from either a register or a memory operand.
-+void CodeGenerator::visitUnbox(LUnbox* unbox) {
-+  MUnbox* mir = unbox->mir();
-+  Register result = ToRegister(unbox->output());
-+
-+  if (mir->fallible()) {
-+    ValueOperand value = ToValue(unbox->input());
-+    Label bail;
-+    switch (mir->type()) {
-+      case MIRType::Int32:
-+        masm.fallibleUnboxInt32(value, result, &bail);
-+        break;
-+      case MIRType::Boolean:
-+        masm.fallibleUnboxBoolean(value, result, &bail);
-+        break;
-+      case MIRType::Object:
-+        masm.fallibleUnboxObject(value, result, &bail);
-+        break;
-+      case MIRType::String:
-+        masm.fallibleUnboxString(value, result, &bail);
-+        break;
-+      case MIRType::Symbol:
-+        masm.fallibleUnboxSymbol(value, result, &bail);
-+        break;
-+      case MIRType::BigInt:
-+        masm.fallibleUnboxBigInt(value, result, &bail);
-+        break;
-+      default:
-+        MOZ_CRASH("Given MIRType cannot be unboxed.");
-+    }
-+    bailoutFrom(&bail, unbox->snapshot());
-+    return;
-+  }
-+
-+  // Infallible path. The type dispatch is the same for register and memory
-+  // inputs; only the source operand type differs, so it is shared here.
-+  auto emitInfallibleUnbox = [&](const auto& source) {
-+    switch (mir->type()) {
-+      case MIRType::Int32:
-+        masm.unboxInt32(source, result);
-+        break;
-+      case MIRType::Boolean:
-+        masm.unboxBoolean(source, result);
-+        break;
-+      case MIRType::Object:
-+        masm.unboxObject(source, result);
-+        break;
-+      case MIRType::String:
-+        masm.unboxString(source, result);
-+        break;
-+      case MIRType::Symbol:
-+        masm.unboxSymbol(source, result);
-+        break;
-+      case MIRType::BigInt:
-+        masm.unboxBigInt(source, result);
-+        break;
-+      default:
-+        MOZ_CRASH("Given MIRType cannot be unboxed.");
-+    }
-+  };
-+
-+  LAllocation* input = unbox->getOperand(LUnbox::Input);
-+  if (input->isGeneralReg()) {
-+    emitInfallibleUnbox(ValueOperand(ToRegister(input)));
-+  } else {
-+    emitInfallibleUnbox(ToAddress(input));
-+  }
-+}
-+
-+// ===============================================================
-+// Visitors: Integer Arithmetic
-+
-+// Emits a 32-bit integer add of operands 0 and 1 into def 0. When the
-+// instruction carries a snapshot, a 32-bit overflow bails out through it;
-+// otherwise the result is simply truncated/sign-extended to 32 bits.
-+void CodeGenerator::visitAddI(LAddI* ins) {
-+  LAllocation* lhs = ins->getOperand(0);
-+  LAllocation* rhs = ins->getOperand(1);
-+  Register dest = ToRegister(ins->getDef(0));
-+
-+  if (rhs->isConstant()) {
-+    Imm32 imm(ToInt32(rhs));
-+    if (ins->snapshot()) {
-+      // Copy lhs first so the checked add can work in place on dest.
-+      masm.move32(ToRegister(lhs), dest);
-+      Label overflow;
-+      masm.branchAdd32(Assembler::Overflow, imm, dest, &overflow);
-+      bailoutFrom(&overflow, ins->snapshot());
-+    } else {
-+      masm.add32(imm, ToRegister(lhs), dest);
-+    }
-+  } else {
-+    Register rhsReg = ToRegister(rhs);
-+    if (ins->snapshot()) {
-+      // Use 3-operand add to avoid clobbering rhs when rhs == dest.
-+      // NOTE(review): the check below assumes both inputs hold sign-extended
-+      // 32-bit values, so the 64-bit sum is exact -- confirm.
-+      masm.as_add(dest, ToRegister(lhs), rhsReg);
-+      // Check 32-bit overflow: sign-extend lower 32 and compare.
-+      masm.as_extsw(SecondScratchReg, dest);
-+      Label overflow;
-+      masm.as_cmpd(dest, SecondScratchReg);
-+      masm.ma_b(Assembler::NotEqual, &overflow);
-+      masm.as_extsw(dest, dest);
-+      bailoutFrom(&overflow, ins->snapshot());
-+    } else {
-+      // No overflow check requested: add, then sign-extend the low 32 bits.
-+      masm.as_add(dest, ToRegister(lhs), rhsReg);
-+      masm.as_extsw(dest, dest);
-+    }
-+  }
-+}
-+
-+// Pointer-width add of operands 0 and 1 into def 0; no overflow check.
-+void CodeGenerator::visitAddIntPtr(LAddIntPtr* ins) {
-+  const LAllocation* rhs = ins->getOperand(1);
-+  Register lhs = ToRegister(ins->getOperand(0));
-+  Register dest = ToRegister(ins->getDef(0));
-+
-+  if (!rhs->isConstant()) {
-+    masm.as_add(dest, lhs, ToRegister(rhs));
-+    return;
-+  }
-+
-+  // Constant rhs: the immediate add works in place on dest.
-+  if (dest != lhs) {
-+    masm.movePtr(lhs, dest);
-+  }
-+  masm.addPtr(ImmWord(ToIntPtr(rhs)), dest);
-+}
-+
-+// 64-bit add of operands 0 and 1 into def 0; no overflow check.
-+void CodeGenerator::visitAddI64(LAddI64* lir) {
-+  const LAllocation* rhs = lir->getOperand(1);
-+  Register lhs = ToRegister(lir->getOperand(0));
-+  Register dest = ToRegister(lir->getDef(0));
-+
-+  if (!rhs->isConstant()) {
-+    masm.as_add(dest, lhs, ToRegister(rhs));
-+    return;
-+  }
-+
-+  // Constant rhs: the immediate add works in place on dest.
-+  if (dest != lhs) {
-+    masm.movePtr(lhs, dest);
-+  }
-+  masm.addPtr(ImmWord(ToInt64(rhs)), dest);
-+}
-+
-+// Emits a 32-bit integer subtract (operand 0 minus operand 1) into def 0.
-+// When the instruction carries a snapshot, a 32-bit overflow bails out
-+// through it; otherwise the result is truncated/sign-extended to 32 bits.
-+void CodeGenerator::visitSubI(LSubI* ins) {
-+  LAllocation* lhs = ins->getOperand(0);
-+  LAllocation* rhs = ins->getOperand(1);
-+  Register dest = ToRegister(ins->getDef(0));
-+
-+  if (rhs->isConstant()) {
-+    Imm32 imm(ToInt32(rhs));
-+    if (ins->snapshot()) {
-+      // Copy lhs first so the checked subtract can work in place on dest.
-+      masm.move32(ToRegister(lhs), dest);
-+      Label overflow;
-+      masm.branchSub32(Assembler::Overflow, imm, dest, &overflow);
-+      bailoutFrom(&overflow, ins->snapshot());
-+    } else {
-+      masm.move32(ToRegister(lhs), dest);
-+      masm.sub32(imm, dest);
-+    }
-+  } else {
-+    Register rhsReg = ToRegister(rhs);
-+    if (ins->snapshot()) {
-+      // as_subf(d, a, b) computes d = b - a, so
-+      // subf(dest, rhs, lhs) yields lhs - rhs.
-+      masm.as_subf(dest, rhsReg, ToRegister(lhs));
-+      // Overflow check: the 64-bit difference must equal its own
-+      // sign-extended low 32 bits.
-+      masm.as_extsw(SecondScratchReg, dest);
-+      Label overflow;
-+      masm.as_cmpd(dest, SecondScratchReg);
-+      masm.ma_b(Assembler::NotEqual, &overflow);
-+      masm.as_extsw(dest, dest);
-+      bailoutFrom(&overflow, ins->snapshot());
-+    } else {
-+      // No overflow check requested: subtract, then sign-extend.
-+      masm.as_subf(dest, rhsReg, ToRegister(lhs));
-+      masm.as_extsw(dest, dest);
-+    }
-+  }
-+}
-+
-+// Emits pointer-width subtraction for LSubIntPtr: output = operand0 - operand1.
-+void CodeGenerator::visitSubIntPtr(LSubIntPtr* ins) {
-+  Register dest = ToRegister(ins->getDef(0));
-+  Register lhs = ToRegister(ins->getOperand(0));
-+  const LAllocation* rhs = ins->getOperand(1);
-+
-+  if (rhs->isConstant()) {
-+    if (lhs != dest) {
-+      masm.movePtr(lhs, dest);
-+    }
-+    // The constant is pointer-sized; wrapping it in Imm32 would silently drop
-+    // its upper 32 bits. Add its two's-complement negation as a full-width
-+    // ImmWord instead, as visitAddIntPtr does for its constant. The negation
-+    // is done in unsigned arithmetic so that INTPTR_MIN stays well-defined.
-+    masm.addPtr(ImmWord(uintptr_t(0) - uintptr_t(ToIntPtr(rhs))), dest);
-+  } else {
-+    // as_subf(d, a, b) = b - a
-+    masm.as_subf(dest, ToRegister(rhs), lhs);
-+  }
-+}
-+
-+// Emits 64-bit subtraction for LSubI64: output = operand0 - operand1.
-+void CodeGenerator::visitSubI64(LSubI64* lir) {
-+  const LAllocation* subtrahend = lir->getOperand(1);
-+  Register minuend = ToRegister(lir->getOperand(0));
-+  Register result = ToRegister(lir->getDef(0));
-+
-+  if (!subtrahend->isConstant()) {
-+    // as_subf(d, a, b) = b - a
-+    masm.as_subf(result, ToRegister(subtrahend), minuend);
-+    return;
-+  }
-+
-+  // Constant subtrahend: sub64 works in place on the output register.
-+  if (result != minuend) {
-+    masm.movePtr(minuend, result);
-+  }
-+  masm.sub64(Imm64(ToInt64(subtrahend)), Register64(result));
-+}
-+
-+// Emits int32 multiplication for LMulI: output = operand0 * operand1.
-+//
-+// A constant right-hand side is strength-reduced where possible (-1, 0, 1, 2
-+// and power-of-two magnitudes); everything else uses a multiply. Depending on
-+// the MMul flags, overflow and a negative-zero result lead to a bailout.
-+void CodeGenerator::visitMulI(LMulI* ins) {
-+  Register dest = ToRegister(ins->getDef(0));
-+  Register lhs = ToRegister(ins->getOperand(0));
-+  const LAllocation* rhs = ins->getOperand(1);
-+  MMul* mul = ins->mir();
-+
-+  if (rhs->isConstant()) {
-+    int32_t constant = ToInt32(rhs);
-+    Register src = lhs;
-+
-+    // Bailout on -0.0 before the special-case handling below, since cases
-+    // like -1 and 0 return early and would skip the check.
-+    // With constant == 0 the product is -0 when src is negative; with a
-+    // negative constant it is -0 when src is zero.
-+    if (mul->canBeNegativeZero() && constant <= 0) {
-+      Assembler::Condition cond =
-+          (constant == 0) ? Assembler::Signed : Assembler::Equal;
-+      bailoutCmp32(cond, src, Imm32(0), ins->snapshot());
-+    }
-+
-+    switch (constant) {
-+      case -1:
-+        // Negating INT32_MIN is the only way this case can overflow.
-+        if (mul->canOverflow()) {
-+          Label ok;
-+          masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &ok);
-+          bailout(ins->snapshot());
-+          masm.bind(&ok);
-+        }
-+        masm.as_neg(dest, src);
-+        masm.as_extsw(dest, dest);
-+        return;
-+      case 0:
-+        masm.move32(Imm32(0), dest);
-+        return;
-+      case 1:
-+        masm.move32(src, dest);
-+        return;
-+      case 2:
-+        // x * 2 is emitted as x + x.
-+        if (mul->canOverflow()) {
-+          masm.move32(src, dest);
-+          Label overflow;
-+          masm.branchAdd32(Assembler::Overflow, dest, dest, &overflow);
-+          bailoutFrom(&overflow, ins->snapshot());
-+        } else {
-+          masm.move32(src, dest);
-+          masm.add32(dest, dest);
-+        }
-+        return;
-+      default:
-+        break;
-+    }
-+
-+    // Power-of-two magnitude (either sign): when overflow need not be
-+    // detected, shift instead of multiplying and negate for a negative
-+    // constant.
-+    uint32_t absCst = mozilla::Abs(constant);
-+    if (absCst > 0 && (absCst & (absCst - 1)) == 0 && !mul->canOverflow()) {
-+      uint32_t shift = mozilla::FloorLog2(absCst);
-+      masm.x_slwi(dest, src, shift);
-+      if (constant < 0) {
-+        masm.as_neg(dest, dest);
-+      }
-+      masm.as_extsw(dest, dest);
-+      return;
-+    }
-+
-+    // General case.
-+    if (mul->canOverflow()) {
-+      masm.move32(src, dest);
-+      Label overflow;
-+      masm.branchMul32(Assembler::Overflow, Imm32(constant), dest, &overflow);
-+      bailoutFrom(&overflow, ins->snapshot());
-+    } else {
-+      masm.move32(src, dest);
-+      masm.mul32(Imm32(constant), dest);
-+    }
-+
-+    // Check for negative zero (for constants not handled above).
-+    // NOTE(review): for constant < 0 the src == 0 case already bailed out at
-+    // the top of this branch, and src has been copied over by move32 if it
-+    // shares a register with dest -- confirm this check is still needed.
-+    if (mul->canBeNegativeZero() && constant < 0) {
-+      Label ok;
-+      masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &ok);
-+      bailoutCmp32(Assembler::Signed, src, src, ins->snapshot());
-+      masm.bind(&ok);
-+    }
-+    return;
-+  }
-+
-+  Register rhsReg = ToRegister(rhs);
-+
-+  if (mul->canOverflow()) {
-+    // Use 64-bit multiply so the full result is deterministic, then check
-+    // whether truncating to 32 bits changes the value. Match the
-+    // visitAddI/visitSubI ordering: branch first, truncate only on the
-+    // success path (the bailout discards dest anyway). extsw is
-+    // non-recording (ISA v3.0B) so it doesn't disturb CR0
-+    // either way; the choice is for consistency.
-+    masm.as_mulld(dest, lhs, rhsReg);
-+    masm.as_extsw(SecondScratchReg, dest);
-+    Label overflow;
-+    masm.as_cmpd(dest, SecondScratchReg);
-+    masm.ma_b(Assembler::NotEqual, &overflow);
-+    masm.as_extsw(dest, dest);
-+    bailoutFrom(&overflow, ins->snapshot());
-+  } else {
-+    masm.as_mullw(dest, lhs, rhsReg);
-+    masm.as_extsw(dest, dest);
-+  }
-+
-+  if (mul->canBeNegativeZero()) {
-+    Label done;
-+    masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &done);
-+    // Result is 0. Check if lhs|rhs was negative.
-+    // NOTE(review): this reads lhs and rhs after dest was written, so it
-+    // assumes dest shares a register with neither operand -- confirm against
-+    // the lowering.
-+    {
-+      UseScratchRegisterScope temps(masm);
-+      Register scratch = temps.Acquire();
-+      masm.as_or_(scratch, lhs, rhsReg);
-+      bailoutCmp32(Assembler::Signed, scratch, scratch, ins->snapshot());
-+    }
-+    masm.bind(&done);
-+  }
-+}
-+
-+// Emits pointer-width multiplication for LMulIntPtr:
-+// output = operand0 * operand1.
-+void CodeGenerator::visitMulIntPtr(LMulIntPtr* ins) {
-+  const LAllocation* factor = ins->getOperand(1);
-+  Register multiplicand = ToRegister(ins->getOperand(0));
-+  Register product = ToRegister(ins->getDef(0));
-+
-+  if (!factor->isConstant()) {
-+    // Register factor: one 64-bit multiply.
-+    masm.as_mulld(product, multiplicand, ToRegister(factor));
-+    return;
-+  }
-+
-+  // Constant factor: mulPtr works in place on the output register.
-+  if (product != multiplicand) {
-+    masm.movePtr(multiplicand, product);
-+  }
-+  masm.mulPtr(ImmWord(ToIntPtr(factor)), product);
-+}
-+
-+// Emits 64-bit multiplication for LMulI64: output = operand0 * operand1.
-+void CodeGenerator::visitMulI64(LMulI64* lir) {
-+  const LAllocation* factor = lir->getOperand(1);
-+  Register multiplicand = ToRegister(lir->getOperand(0));
-+  Register product = ToRegister(lir->getDef(0));
-+
-+  if (!factor->isConstant()) {
-+    // Register factor: one 64-bit multiply.
-+    masm.as_mulld(product, multiplicand, ToRegister(factor));
-+    return;
-+  }
-+
-+  // Constant factor: mulPtr works in place on the output register.
-+  if (product != multiplicand) {
-+    masm.movePtr(multiplicand, product);
-+  }
-+  masm.mulPtr(ImmWord(ToInt64(factor)), product);
-+}
-+
-+// Emits signed int32 division for LDivI: output = lhs / rhs.
-+//
-+// Before dividing, the code handles rhs == 0, INT32_MIN / -1 and a
-+// negative-zero result (0 / negative). Each is resolved by a wasm trap, a
-+// fixed truncated result, or a bailout, as selected by the MDiv flags. After
-+// dividing, a non-zero remainder bails out unless it may be truncated.
-+void CodeGenerator::visitDivI(LDivI* ins) {
-+  Register lhs = ToRegister(ins->lhs());
-+  Register rhs = ToRegister(ins->rhs());
-+  Register dest = ToRegister(ins->output());
-+  Register temp = ToRegister(ins->temp0());
-+  MDiv* mir = ins->mir();
-+
-+  Label done;
-+
-+  // Handle divide by zero.
-+  if (mir->canBeDivideByZero()) {
-+    if (mir->trapOnError()) {
-+      Label nonZero;
-+      masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
-+      masm.wasmTrap(wasm::Trap::IntegerDivideByZero, mir->trapSiteDesc());
-+      masm.bind(&nonZero);
-+    } else if (mir->canTruncateInfinities()) {
-+      // The truncated result of a division by zero is integer zero.
-+      Label nonZero;
-+      masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
-+      masm.move32(Imm32(0), dest);
-+      masm.jump(&done);
-+      masm.bind(&nonZero);
-+    } else {
-+      MOZ_ASSERT(mir->fallible());
-+      bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
-+    }
-+  }
-+
-+  // Handle INT32_MIN / -1 overflow.
-+  // NOTE(review): these are 64-bit compares against sign-extended
-+  // constants, so they assume int32 values are held sign-extended in
-+  // registers -- confirm.
-+  if (mir->canBeNegativeOverflow()) {
-+    Label notMinInt;
-+    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &notMinInt);
-+    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinInt);
-+
-+    if (mir->trapOnError()) {
-+      masm.wasmTrap(wasm::Trap::IntegerOverflow, mir->trapSiteDesc());
-+    } else if (mir->canTruncateOverflow()) {
-+      masm.move32(Imm32(INT32_MIN), dest);
-+      masm.jump(&done);
-+    } else {
-+      MOZ_ASSERT(mir->fallible());
-+      bailout(ins->snapshot());
-+    }
-+    masm.bind(&notMinInt);
-+  }
-+
-+  // Handle negative zero.
-+  // A zero dividend with a negative divisor would produce -0.
-+  if (!mir->canTruncateNegativeZero() && mir->canBeNegativeZero()) {
-+    Label ok;
-+    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(0), &ok);
-+    bailoutCmp32(Assembler::LessThan, rhs, Imm32(0), ins->snapshot());
-+    masm.bind(&ok);
-+  }
-+
-+  // Perform the division.
-+  masm.as_divw(dest, lhs, rhs);
-+  masm.as_extsw(dest, dest);
-+
-+  // Check remainder if not truncatable.
-+  if (!mir->canTruncateRemainder()) {
-+    // Compute remainder: temp = lhs - (dest * rhs)
-+    masm.as_mullw(temp, dest, rhs);
-+    masm.as_subf(temp, temp, lhs);  // temp = lhs - temp
-+    bailoutCmp32(Assembler::NotEqual, temp, Imm32(0), ins->snapshot());
-+  }
-+
-+  masm.bind(&done);
-+}
-+
-+// Emits signed int32 division by the constant 2^shift for LDivPowTwoI using
-+// an arithmetic right shift. A shift of zero just copies the numerator.
-+void CodeGenerator::visitDivPowTwoI(LDivPowTwoI* ins) {
-+  Register lhs = ToRegister(ins->numerator());
-+  Register dest = ToRegister(ins->output());
-+  UseScratchRegisterScope temps(masm);
-+  Register tmp = temps.Acquire();
-+  int32_t shift = ins->shift();
-+
-+  if (shift != 0) {
-+    MDiv* mir = ins->mir();
-+
-+    if (!mir->isTruncated()) {
-+      // If remainder != 0, bailout (check lower 'shift' bits).
-+      // Shifting left by 32 - shift leaves only those bits in the 32-bit
-+      // value that is compared against zero.
-+      masm.x_slwi(tmp, lhs, 32 - shift);
-+      bailoutCmp32(Assembler::NotEqual, tmp, Imm32(0), ins->snapshot());
-+    }
-+
-+    if (!mir->canBeNegativeDividend()) {
-+      // Non-negative dividend: simple right shift.
-+      masm.as_srawi(dest, lhs, shift);
-+    } else {
-+      // Need rounding adjustment for negative numbers.
-+      // Add (1 << shift) - 1 if lhs is negative.
-+      // Without this bias the arithmetic shift would round toward negative
-+      // infinity instead of toward zero.
-+      if (shift > 1) {
-+        // All-ones for a negative lhs, then keep only the low 'shift' bits.
-+        masm.as_srawi(tmp, lhs, 31);
-+        masm.as_rlwinm(tmp, tmp, 0, 32 - shift, 31);
-+      } else {
-+        // shift == 1: extract sign bit into bit 31
-+        masm.as_rlwinm(tmp, lhs, 1, 31, 31);
-+      }
-+      masm.add32(lhs, tmp);
-+      masm.as_srawi(dest, tmp, shift);
-+    }
-+  } else {
-+    masm.move32(lhs, dest);
-+  }
-+}
-+
-+// Emits signed int32 modulo for LModI: output = lhs % rhs.
-+//
-+// Special cases are resolved before the hardware operation:
-+//   - rhs == 0: wasm trap, integer zero (truncated), or bailout.
-+//   - INT32_MIN % -1: integer zero when truncated, bailout when the -0
-+//     result must be observable.
-+// After the operation, a zero result with a negative dividend (-0) bails out
-+// unless the result is truncated.
-+void CodeGenerator::visitModI(LModI* ins) {
-+  Register lhs = ToRegister(ins->lhs());
-+  Register rhs = ToRegister(ins->rhs());
-+  Register dest = ToRegister(ins->output());
-+  UseScratchRegisterScope temps(masm);
-+  Register temp = temps.Acquire();
-+  MMod* mir = ins->mir();
-+  Label done;
-+
-+  // Handle divide by zero.
-+  if (mir->canBeDivideByZero()) {
-+    if (mir->isTruncated()) {
-+      if (mir->trapOnError()) {
-+        Label nonZero;
-+        masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
-+        masm.wasmTrap(wasm::Trap::IntegerDivideByZero, mir->trapSiteDesc());
-+        masm.bind(&nonZero);
-+      } else {
-+        // Truncated modulo by zero yields integer zero. Write dest only on
-+        // the zero path: writing it before the branch would corrupt lhs
-+        // whenever dest and lhs are assigned the same register. This matches
-+        // the zero handling in visitDivI and visitUDivOrMod.
-+        Label nonZero;
-+        masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
-+        masm.move32(Imm32(0), dest);
-+        masm.jump(&done);
-+        masm.bind(&nonZero);
-+      }
-+    } else {
-+      MOZ_ASSERT(mir->fallible());
-+      bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
-+    }
-+  }
-+
-+  // Handle INT32_MIN % -1.
-+  // PPC64 divw is undefined for INT32_MIN / -1 (quotient overflows), so the
-+  // case must be resolved explicitly. The wasm spec defines
-+  // rem_s(MIN, -1) = 0.
-+  if (!mir->isUnsigned()) {
-+    Label notMinOverflow;
-+    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN),
-+                   &notMinOverflow);
-+    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinOverflow);
-+    if (!mir->isTruncated() && mir->canBeNegativeDividend()) {
-+      // In JS the result is -0, which an int32 cannot represent. Jumping to
-+      // |done| with 0 would skip the negative-zero check below, so bail out
-+      // here instead.
-+      MOZ_ASSERT(mir->fallible());
-+      bailout(ins->snapshot());
-+    } else {
-+      masm.move32(Imm32(0), dest);
-+      masm.jump(&done);
-+    }
-+    masm.bind(&notMinOverflow);
-+  }
-+
-+  if (HasPOWER9()) {
-+    masm.as_modsw(dest, lhs, rhs);
-+  } else {
-+    // No modulo instruction: dest = lhs - (lhs / rhs) * rhs.
-+    masm.as_divw(temp, lhs, rhs);
-+    masm.as_mullw(temp, temp, rhs);
-+    masm.as_subf(dest, temp, lhs);
-+  }
-+  masm.as_extsw(dest, dest);
-+
-+  // If X%Y == 0 and X < 0, the result is -0, and we need to bail out.
-+  if (mir->canBeNegativeDividend() && !mir->isTruncated()) {
-+    MOZ_ASSERT(mir->fallible());
-+    Label ok;
-+    masm.branchPtr(Assembler::NotEqual, dest, ImmWord(0), &ok);
-+    bailoutCmp32(Assembler::Signed, lhs, Imm32(0), ins->snapshot());
-+    masm.bind(&ok);
-+  }
-+
-+  masm.bind(&done);
-+}
-+
-+// Emits signed int32 modulo by the constant 2^shift for LModPowTwoI.
-+//
-+// A non-negative input is masked directly. A negative input is negated,
-+// masked and negated back so the result keeps the dividend's sign; if that
-+// result is zero it stands for -0 and bails out unless the result is
-+// truncated.
-+void CodeGenerator::visitModPowTwoI(LModPowTwoI* ins) {
-+  Register in = ToRegister(ins->getOperand(0));
-+  Register out = ToRegister(ins->getDef(0));
-+  MMod* mir = ins->mir();
-+  int32_t shift = ins->shift();
-+  uint32_t mask = (uint32_t(1) << shift) - 1;
-+
-+  // No up-front negative-zero guard is emitted: a zero input is never
-+  // negative, so a sign test on it could not trigger a bailout. The only -0
-+  // case (negative input, zero remainder) is caught on the negative path.
-+
-+  Label negative, done;
-+  masm.branch32(Assembler::Signed, in, in, &negative);
-+
-+  // Positive case: just mask.
-+  masm.and32(Imm32(mask), in, out);
-+  masm.jump(&done);
-+
-+  // Negative case: negate, mask, negate back.
-+  masm.bind(&negative);
-+  masm.as_neg(out, in);
-+  masm.and32(Imm32(mask), out);
-+  masm.as_neg(out, out);
-+  masm.as_extsw(out, out);
-+
-+  if (!mir->isTruncated() && mir->canBeNegativeDividend()) {
-+    // Negative dividend with a zero remainder: the JS result is -0.
-+    Label ok;
-+    masm.branchPtr(Assembler::NotEqual, out, ImmWord(0), &ok);
-+    bailout(ins->snapshot());
-+    masm.bind(&ok);
-+  }
-+
-+  masm.bind(&done);
-+}
-+
-+// Emits LModMaskI by delegating to ma_mod_mask. A bailout label is supplied
-+// only when the result is not truncated and the dividend may be negative.
-+void CodeGenerator::visitModMaskI(LModMaskI* ins) {
-+  Register input = ToRegister(ins->input());
-+  Register output = ToRegister(ins->output());
-+  Register scratch0 = ToRegister(ins->temp0());
-+  Register scratch1 = ToRegister(ins->temp1());
-+  MMod* mod = ins->mir();
-+
-+  const bool needsBailoutPath =
-+      !mod->isTruncated() && mod->canBeNegativeDividend();
-+  if (!needsBailoutPath) {
-+    masm.ma_mod_mask(input, output, scratch0, scratch1, ins->shift(), nullptr);
-+    return;
-+  }
-+
-+  MOZ_ASSERT(mod->fallible());
-+
-+  Label bail;
-+  masm.ma_mod_mask(input, output, scratch0, scratch1, ins->shift(), &bail);
-+  bailoutFrom(&bail, ins->snapshot());
-+}
-+
-+// Emits int32 negation for LNegI; the result is then sign-extended from its
-+// low 32 bits with extsw.
-+void CodeGenerator::visitNegI(LNegI* ins) {
-+  Register result = ToRegister(ins->output());
-+  Register operand = ToRegister(ins->input());
-+  masm.as_neg(result, operand);
-+  masm.as_extsw(result, result);
-+}
-+
-+// Emits 64-bit negation for LNegI64 with a single neg instruction.
-+void CodeGenerator::visitNegI64(LNegI64* ins) {
-+  Register result = ToOutRegister64(ins).reg;
-+  Register operand = ToRegister64(ins->input()).reg;
-+  masm.as_neg(result, operand);
-+}
-+
-+// Emits unsigned int32 division or modulo for LUDivOrMod.
-+//
-+// rhs == 0 is resolved by a wasm trap, integer zero (truncated) or a
-+// bailout. For a division whose remainder may not be truncated, a non-zero
-+// remainder bails out. A non-truncated result that does not fit in a signed
-+// int32 also bails out.
-+void CodeGenerator::visitUDivOrMod(LUDivOrMod* ins) {
-+  Register lhs = ToRegister(ins->lhs());
-+  Register rhs = ToRegister(ins->rhs());
-+  Register output = ToRegister(ins->output());
-+  UseScratchRegisterScope temps(masm);
-+  Register temp = temps.Acquire();
-+  Label done;
-+
-+  // Division by zero check.
-+  if (ins->canBeDivideByZero()) {
-+    if (ins->mir()->isTruncated()) {
-+      if (ins->trapOnError()) {
-+        Label nonZero;
-+        masm.branch32(Assembler::NotEqual, rhs, Imm32(0), &nonZero);
-+        masm.wasmTrap(wasm::Trap::IntegerDivideByZero, ins->trapSiteDesc());
-+        masm.bind(&nonZero);
-+      } else {
-+        Label nonZero;
-+        masm.branch32(Assembler::NotEqual, rhs, Imm32(0), &nonZero);
-+        masm.move32(Imm32(0), output);
-+        masm.jump(&done);
-+        masm.bind(&nonZero);
-+      }
-+    } else {
-+      bailoutCmp32(Assembler::Equal, rhs, Imm32(0), ins->snapshot());
-+    }
-+  }
-+
-+  // The operands are deliberately not zero-extended in place: divwu, moduw
-+  // and mullw read only the low 32 bits of their sources, and rewriting lhs
-+  // or rhs would clobber input registers that may still be live (and would
-+  // leave them in a non-sign-extended form).
-+
-+  if (ins->mir()->isDiv()) {
-+    // Division path: compute quotient. Check remainder if needed.
-+    if (!ins->mir()->toDiv()->canTruncateRemainder()) {
-+      if (HasPOWER9()) {
-+        masm.as_moduw(temp, lhs, rhs);
-+      } else {
-+        // No modulo instruction: temp = lhs - (lhs / rhs) * rhs.
-+        masm.as_divwu(temp, lhs, rhs);
-+        masm.as_mullw(temp, temp, rhs);
-+        masm.as_subf(temp, temp, lhs);
-+      }
-+      bailoutCmp32(Assembler::NotEqual, temp, Imm32(0), ins->snapshot());
-+    }
-+    masm.as_divwu(output, lhs, rhs);
-+  } else {
-+    // Modulo path.
-+    if (HasPOWER9()) {
-+      masm.as_moduw(output, lhs, rhs);
-+    } else {
-+      masm.as_divwu(temp, lhs, rhs);
-+      masm.as_mullw(temp, temp, rhs);
-+      masm.as_subf(output, temp, lhs);
-+    }
-+  }
-+
-+  // Normalize the 32-bit result to its sign-extended register form.
-+  masm.as_extsw(output, output);
-+
-+  if (!ins->mir()->isTruncated()) {
-+    // An unsigned result with the top bit set reads as negative here and
-+    // cannot be represented as a signed int32.
-+    bailoutCmp32(Assembler::LessThan, output, Imm32(0), ins->snapshot());
-+  }
-+
-+  masm.bind(&done);
-+}
-+
-+// Emits signed 64-bit division or modulo for LDivOrModI64.
-+//
-+// rhs == 0 traps. INT64_MIN with rhs == -1 traps for division and yields 0
-+// for modulo.
-+void CodeGenerator::visitDivOrModI64(LDivOrModI64* lir) {
-+  Register lhs = ToRegister(lir->getOperand(0));
-+  Register rhs = ToRegister(lir->getOperand(1));
-+  Register output = ToRegister(lir->output());
-+
-+  Label done;
-+
-+  // Division by zero trap.
-+  if (lir->canBeDivideByZero()) {
-+    Label nonZero;
-+    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
-+    masm.wasmTrap(wasm::Trap::IntegerDivideByZero, lir->trapSiteDesc());
-+    masm.bind(&nonZero);
-+  }
-+
-+  // INT64_MIN / -1 overflow trap (for div only).
-+  if (lir->canBeNegativeOverflow()) {
-+    Label notMinInt;
-+    masm.branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), &notMinInt);
-+    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &notMinInt);
-+    if (lir->mir()->isDiv()) {
-+      masm.wasmTrap(wasm::Trap::IntegerOverflow, lir->trapSiteDesc());
-+    } else {
-+      // The remainder of INT64_MIN by -1 is zero.
-+      masm.movePtr(ImmWord(0), output);
-+      masm.jump(&done);
-+    }
-+    masm.bind(&notMinInt);
-+  }
-+
-+  if (lir->mir()->isDiv()) {
-+    masm.as_divd(output, lhs, rhs);
-+  } else if (HasPOWER9()) {
-+    masm.as_modsd(output, lhs, rhs);
-+  } else {
-+    // No modulo instruction: output = lhs - (lhs / rhs) * rhs.
-+    // NOTE(review): output is written before lhs and rhs are read again, so
-+    // this assumes output shares a register with neither operand -- confirm
-+    // against the lowering.
-+    masm.as_divd(output, lhs, rhs);
-+    masm.as_mulld(output, output, rhs);
-+    masm.as_subf(output, output, lhs);
-+  }
-+
-+  masm.bind(&done);
-+}
-+
-+// Emits unsigned 64-bit division or modulo for LUDivOrModI64. rhs == 0 traps.
-+void CodeGenerator::visitUDivOrModI64(LUDivOrModI64* lir) {
-+  Register lhs = ToRegister(lir->getOperand(0));
-+  Register rhs = ToRegister(lir->getOperand(1));
-+  Register output = ToRegister(lir->output());
-+
-+  // Division by zero trap.
-+  if (lir->canBeDivideByZero()) {
-+    Label nonZero;
-+    masm.branchPtr(Assembler::NotEqual, rhs, ImmWord(0), &nonZero);
-+    masm.wasmTrap(wasm::Trap::IntegerDivideByZero, lir->trapSiteDesc());
-+    masm.bind(&nonZero);
-+  }
-+
-+  if (lir->mir()->isDiv()) {
-+    masm.as_divdu(output, lhs, rhs);
-+  } else if (HasPOWER9()) {
-+    masm.as_modud(output, lhs, rhs);
-+  } else {
-+    // No modulo instruction: output = lhs - (lhs / rhs) * rhs.
-+    // NOTE(review): output is written before lhs and rhs are read again, so
-+    // this assumes output shares a register with neither operand -- confirm
-+    // against the lowering.
-+    masm.as_divdu(output, lhs, rhs);
-+    masm.as_mulld(output, output, rhs);
-+    masm.as_subf(output, output, lhs);
-+  }
-+}
-+
-+// ===============================================================
-+// Visitors: Bitwise
-+
-+// Emits int32 bitwise NOT for LBitNotI as nor(x, x), then sign-extends the
-+// low 32 bits of the result.
-+void CodeGenerator::visitBitNotI(LBitNotI* ins) {
-+  Register result = ToRegister(ins->output());
-+  Register operand = ToRegister(ins->input());
-+  masm.as_nor(result, operand, operand);
-+  masm.as_extsw(result, result);
-+}
-+
-+// Emits 64-bit bitwise NOT for LBitNotI64 as nor(x, x).
-+void CodeGenerator::visitBitNotI64(LBitNotI64* ins) {
-+  Register result = ToOutRegister64(ins).reg;
-+  Register operand = ToRegister64(ins->input()).reg;
-+  masm.as_nor(result, operand, operand);
-+}
-+
-+// Emits int32 bitwise OR / XOR / AND for LBitOpI. A constant right operand
-+// uses the immediate macro-assembler forms; a register operand uses the raw
-+// instruction followed by a sign-extension of the low 32 bits.
-+void CodeGenerator::visitBitOpI(LBitOpI* ins) {
-+  Register result = ToRegister(ins->getDef(0));
-+  Register left = ToRegister(ins->getOperand(0));
-+  const LAllocation* right = ins->getOperand(1);
-+
-+  if (right->isConstant()) {
-+    switch (ins->bitop()) {
-+      case JSOp::BitOr:
-+        masm.or32(Imm32(ToInt32(right)), left, result);
-+        return;
-+      case JSOp::BitXor:
-+        masm.xor32(Imm32(ToInt32(right)), left, result);
-+        return;
-+      case JSOp::BitAnd:
-+        masm.and32(Imm32(ToInt32(right)), left, result);
-+        return;
-+      default:
-+        MOZ_CRASH("unexpected binary opcode");
-+    }
-+  }
-+
-+  switch (ins->bitop()) {
-+    case JSOp::BitOr:
-+      masm.as_or_(result, left, ToRegister(right));
-+      break;
-+    case JSOp::BitXor:
-+      masm.as_xor_(result, left, ToRegister(right));
-+      break;
-+    case JSOp::BitAnd:
-+      masm.as_and_(result, left, ToRegister(right));
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected binary opcode");
-+  }
-+  // Shared tail of every register-operand case.
-+  masm.as_extsw(result, result);
-+}
-+
-+// Emits 64-bit bitwise OR / XOR / AND for LBitOpI64. A constant right operand
-+// is applied in place on the output register; a register operand uses the
-+// three-operand instruction directly.
-+void CodeGenerator::visitBitOpI64(LBitOpI64* lir) {
-+  Register result = ToRegister(lir->getDef(0));
-+  Register left = ToRegister(lir->getOperand(0));
-+  const LAllocation* right = lir->getOperand(1);
-+
-+  // The in-place immediate forms need the left operand in the output first.
-+  auto seedResult = [&] {
-+    if (left != result) {
-+      masm.movePtr(left, result);
-+    }
-+  };
-+
-+  if (right->isConstant()) {
-+    switch (lir->bitop()) {
-+      case JSOp::BitOr:
-+        seedResult();
-+        masm.or64(Imm64(ToInt64(right)), Register64(result));
-+        return;
-+      case JSOp::BitXor:
-+        seedResult();
-+        masm.xor64(Imm64(ToInt64(right)), Register64(result));
-+        return;
-+      case JSOp::BitAnd:
-+        seedResult();
-+        masm.and64(Imm64(ToInt64(right)), Register64(result));
-+        return;
-+      default:
-+        MOZ_CRASH("unexpected binary opcode");
-+    }
-+  }
-+
-+  switch (lir->bitop()) {
-+    case JSOp::BitOr:
-+      masm.as_or_(result, left, ToRegister(right));
-+      break;
-+    case JSOp::BitXor:
-+      masm.as_xor_(result, left, ToRegister(right));
-+      break;
-+    case JSOp::BitAnd:
-+      masm.as_and_(result, left, ToRegister(right));
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected binary opcode");
-+  }
-+}
-+
-+// Emits int32 shifts (<<, >>, >>>) for LShiftI. The shift count is reduced
-+// modulo 32 in both the constant and the register case. A fallible >>>
-+// bails out when the result reads as negative as a signed int32.
-+void CodeGenerator::visitShiftI(LShiftI* ins) {
-+  Register lhs = ToRegister(ins->lhs());
-+  const LAllocation* rhs = ins->rhs();
-+  Register dest = ToRegister(ins->output());
-+
-+  if (rhs->isConstant()) {
-+    // Constant count: a count of zero degenerates to a plain move.
-+    int32_t shift = ToInt32(rhs) & 0x1f;
-+    switch (ins->bitop()) {
-+      case JSOp::Lsh:
-+        if (shift) {
-+          masm.lshift32(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.move32(lhs, dest);
-+        }
-+        break;
-+      case JSOp::Rsh:
-+        if (shift) {
-+          masm.rshift32Arithmetic(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.move32(lhs, dest);
-+        }
-+        break;
-+      case JSOp::Ursh:
-+        if (shift) {
-+          masm.rshift32(Imm32(shift), lhs, dest);
-+        } else {
-+          // x >>> 0 can produce values that need to be treated as unsigned.
-+          masm.move32(lhs, dest);
-+        }
-+        if (ins->mir()->toUrsh()->fallible()) {
-+          // x >>> 0 can produce values that don't fit in signed int32.
-+          bailoutCmp32(Assembler::LessThan, dest, Imm32(0), ins->snapshot());
-+        }
-+        break;
-+      default:
-+        MOZ_CRASH("unexpected shift opcode");
-+    }
-+  } else {
-+    Register shiftReg = ToRegister(rhs);
-+    // PPC slw/srw/sraw use 6 bits of shift amount; JS requires mod 32.
-+    // Keep only the low five bits of the count in a scratch register.
-+    UseScratchRegisterScope temps(masm);
-+    Register masked = temps.Acquire();
-+    masm.as_rlwinm(masked, shiftReg, 0, 27, 31);
-+    switch (ins->bitop()) {
-+      case JSOp::Lsh:
-+        masm.as_slw(dest, lhs, masked);
-+        masm.as_extsw(dest, dest);
-+        break;
-+      case JSOp::Rsh:
-+        masm.as_sraw(dest, lhs, masked);
-+        break;
-+      case JSOp::Ursh:
-+        masm.as_srw(dest, lhs, masked);
-+        masm.as_extsw(dest, dest);
-+        if (ins->mir()->toUrsh()->fallible()) {
-+          bailoutCmp32(Assembler::LessThan, dest, Imm32(0), ins->snapshot());
-+        }
-+        break;
-+      default:
-+        MOZ_CRASH("unexpected shift opcode");
-+    }
-+  }
-+}
-+
-+// Emits pointer-width shifts (<<, >>, >>>) for LShiftIntPtr. The shift count
-+// is reduced modulo 64 in both the constant and the register case.
-+void CodeGenerator::visitShiftIntPtr(LShiftIntPtr* ins) {
-+  Register lhs = ToRegister(ins->lhs());
-+  Register dest = ToRegister(ins->output());
-+
-+  if (ins->rhs()->isConstant()) {
-+    // ShiftIntPtr's RHS constant is IntPtr- or Int32-typed, not Int64. Use
-+    // ToIntPtr() which dispatches on the underlying MIRType (the previous
-+    // MConstant::toInt64() call asserted when the constant wasn't Int64).
-+    int32_t shift = int32_t(ToIntPtr(ins->rhs())) & 0x3f;
-+    // A count of zero degenerates to a plain move.
-+    switch (ins->bitop()) {
-+      case JSOp::Lsh:
-+        if (shift) {
-+          masm.lshiftPtr(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.movePtr(lhs, dest);
-+        }
-+        break;
-+      case JSOp::Rsh:
-+        if (shift) {
-+          masm.rshiftPtrArithmetic(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.movePtr(lhs, dest);
-+        }
-+        break;
-+      case JSOp::Ursh:
-+        if (shift) {
-+          masm.rshiftPtr(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.movePtr(lhs, dest);
-+        }
-+        break;
-+      default:
-+        MOZ_CRASH("unexpected shift opcode");
-+    }
-+  } else {
-+    Register shiftReg = ToRegister(ins->rhs());
-+    // sld/srd/srad use the low 7 bits of the shift count: counts >= 64
-+    // produce 0 (sign-fill for srad). Mask to 6 bits for mod-64 semantics.
-+    UseScratchRegisterScope temps(masm);
-+    Register masked = temps.Acquire();
-+    masm.as_rldicl(masked, shiftReg, 0, 58);
-+    switch (ins->bitop()) {
-+      case JSOp::Lsh:
-+        masm.as_sld(dest, lhs, masked);
-+        break;
-+      case JSOp::Rsh:
-+        masm.as_srad(dest, lhs, masked);
-+        break;
-+      case JSOp::Ursh:
-+        masm.as_srd(dest, lhs, masked);
-+        break;
-+      default:
-+        MOZ_CRASH("unexpected shift opcode");
-+    }
-+  }
-+}
-+
-+// Emits 64-bit shifts (<<, >>, >>>) for LShiftI64. The shift count is reduced
-+// modulo 64 in both the constant and the register case.
-+void CodeGenerator::visitShiftI64(LShiftI64* lir) {
-+  Register lhs = ToRegister64(lir->lhs()).reg;
-+  Register dest = ToOutRegister64(lir).reg;
-+  const LAllocation* rhs = lir->rhs();
-+
-+  if (rhs->isConstant()) {
-+    // Constant count: a count of zero degenerates to a plain move.
-+    int32_t shift = int32_t(rhs->toConstant()->toInt64()) & 0x3f;
-+    switch (lir->bitop()) {
-+      case JSOp::Lsh:
-+        if (shift) {
-+          masm.lshiftPtr(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.movePtr(lhs, dest);
-+        }
-+        break;
-+      case JSOp::Rsh:
-+        if (shift) {
-+          masm.rshiftPtrArithmetic(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.movePtr(lhs, dest);
-+        }
-+        break;
-+      case JSOp::Ursh:
-+        if (shift) {
-+          masm.rshiftPtr(Imm32(shift), lhs, dest);
-+        } else {
-+          masm.movePtr(lhs, dest);
-+        }
-+        break;
-+      default:
-+        MOZ_CRASH("unexpected shift opcode");
-+    }
-+  } else {
-+    Register shiftReg = ToRegister(rhs);
-+    // Wasm i64 shifts require shift count modulo 64. PPC64 sld/srd/srad
-+    // use a 7-bit shift field, so shifts >= 64 produce 0 (or sign-fill
-+    // for srad). Mask to 6 bits first.
-+    UseScratchRegisterScope temps(masm);
-+    Register masked = temps.Acquire();
-+    masm.as_rldicl(masked, shiftReg, 0, 58);  // clrldi: keep low 6 bits
-+    switch (lir->bitop()) {
-+      case JSOp::Lsh:
-+        masm.as_sld(dest, lhs, masked);
-+        break;
-+      case JSOp::Rsh:
-+        masm.as_srad(dest, lhs, masked);
-+        break;
-+      case JSOp::Ursh:
-+        masm.as_srd(dest, lhs, masked);
-+        break;
-+      default:
-+        MOZ_CRASH("unexpected shift opcode");
-+    }
-+  }
-+}
-+
-+// Emits LUrshD: an unsigned int32 right shift computed in a temp register,
-+// with the result converted to a double as an unsigned 32-bit value.
-+void CodeGenerator::visitUrshD(LUrshD* ins) {
-+  Register value = ToRegister(ins->lhs());
-+  const LAllocation* count = ins->rhs();
-+  FloatRegister result = ToFloatRegister(ins->output());
-+
-+  Register shifted = ToRegister(ins->temp0());
-+
-+  if (!count->isConstant()) {
-+    // Register count: shift in place on a copy of the value.
-+    masm.move32(value, shifted);
-+    masm.rshift32(ToRegister(count), shifted);
-+  } else {
-+    const int32_t amount = ToInt32(count) & 0x1f;
-+    if (amount == 0) {
-+      masm.move32(value, shifted);
-+    } else {
-+      masm.rshift32(Imm32(amount), value, shifted);
-+    }
-+  }
-+
-+  masm.convertUInt32ToDouble(shifted, result);
-+}
-+
-+// ===============================================================
-+// Visitors: Floating-point arithmetic
-+
-+// Emits double-precision add / sub / mul / div for LMathD.
-+void CodeGenerator::visitMathD(LMathD* math) {
-+  FloatRegister left = ToFloatRegister(math->lhs());
-+  FloatRegister right = ToFloatRegister(math->rhs());
-+  FloatRegister result = ToFloatRegister(math->output());
-+
-+  const auto op = math->jsop();
-+  if (op == JSOp::Add) {
-+    masm.as_fadd(result, left, right);
-+  } else if (op == JSOp::Sub) {
-+    masm.as_fsub(result, left, right);
-+  } else if (op == JSOp::Mul) {
-+    masm.as_fmul(result, left, right);
-+  } else if (op == JSOp::Div) {
-+    masm.as_fdiv(result, left, right);
-+  } else {
-+    MOZ_CRASH("unexpected double opcode");
-+  }
-+}
-+
-+// Emits single-precision add / sub / mul / div for LMathF.
-+void CodeGenerator::visitMathF(LMathF* math) {
-+  FloatRegister left = ToFloatRegister(math->lhs());
-+  FloatRegister right = ToFloatRegister(math->rhs());
-+  FloatRegister result = ToFloatRegister(math->output());
-+
-+  const auto op = math->jsop();
-+  if (op == JSOp::Add) {
-+    masm.as_fadds(result, left, right);
-+  } else if (op == JSOp::Sub) {
-+    masm.as_fsubs(result, left, right);
-+  } else if (op == JSOp::Mul) {
-+    masm.as_fmuls(result, left, right);
-+  } else if (op == JSOp::Div) {
-+    masm.as_fdivs(result, left, right);
-+  } else {
-+    MOZ_CRASH("unexpected float32 opcode");
-+  }
-+}
-+
-+// Emits double min/max for LMinMaxD. The first operand doubles as the
-+// output register (asserted below) and is updated in place.
-+void CodeGenerator::visitMinMaxD(LMinMaxD* ins) {
-+  FloatRegister accum = ToFloatRegister(ins->first());
-+  FloatRegister other = ToFloatRegister(ins->second());
-+  mozilla::DebugOnly<FloatRegister> out = ToFloatRegister(ins->output());
-+
-+  MOZ_ASSERT(accum == out);
-+  if (!ins->mir()->isMax()) {
-+    masm.minDouble(other, accum, /* handleNaN = */ true);
-+    return;
-+  }
-+  masm.maxDouble(other, accum, /* handleNaN = */ true);
-+}
-+
-+// Emits float32 min/max for LMinMaxF. The first operand doubles as the
-+// output register (asserted below) and is updated in place.
-+void CodeGenerator::visitMinMaxF(LMinMaxF* ins) {
-+  FloatRegister accum = ToFloatRegister(ins->first());
-+  FloatRegister other = ToFloatRegister(ins->second());
-+  mozilla::DebugOnly<FloatRegister> out = ToFloatRegister(ins->output());
-+
-+  MOZ_ASSERT(accum == out);
-+  if (!ins->mir()->isMax()) {
-+    masm.minFloat32(other, accum, /* handleNaN = */ true);
-+    return;
-+  }
-+  masm.maxFloat32(other, accum, /* handleNaN = */ true);
-+}
-+
-+// Emits double negation for LNegD with a single fneg.
-+void CodeGenerator::visitNegD(LNegD* ins) {
-+  FloatRegister result = ToFloatRegister(ins->output());
-+  FloatRegister operand = ToFloatRegister(ins->input());
-+  masm.as_fneg(result, operand);
-+}
-+
-+// Emits float32 negation for LNegF with a single fneg.
-+void CodeGenerator::visitNegF(LNegF* ins) {
-+  FloatRegister result = ToFloatRegister(ins->output());
-+  FloatRegister operand = ToFloatRegister(ins->input());
-+  masm.as_fneg(result, operand);
-+}
-+
-+// Emits LPowHalfD (x to the power 0.5). An input of -Infinity produces
-+// +Infinity; every other input is computed as sqrt(x + 0.0).
-+void CodeGenerator::visitPowHalfD(LPowHalfD* ins) {
-+  FloatRegister value = ToFloatRegister(ins->input());
-+  FloatRegister result = ToFloatRegister(ins->output());
-+
-+  Label finished, notNegInf;
-+
-+  // Special case: the input equals -Infinity.
-+  masm.loadConstantDouble(NegativeInfinity<double>(), ScratchDoubleReg);
-+  masm.branchDouble(Assembler::DoubleNotEqualOrUnordered, value,
-+                    ScratchDoubleReg, &notNegInf);
-+  masm.loadConstantDouble(std::numeric_limits<double>::infinity(), result);
-+  masm.jump(&finished);
-+
-+  masm.bind(&notNegInf);
-+  // Adding 0.0 first handles a -0 input before the square root.
-+  masm.loadConstantDouble(0.0, ScratchDoubleReg);
-+  masm.as_fadd(result, value, ScratchDoubleReg);
-+  masm.as_fsqrt(result, result);
-+
-+  masm.bind(&finished);
-+}
-+
-+// Emits logical NOT of a double for LNotD: the output is set from a compare
-+// of the input against 0.0 under the DoubleEqualOrUnordered condition.
-+void CodeGenerator::visitNotD(LNotD* ins) {
-+  Register result = ToRegister(ins->output());
-+  FloatRegister value = ToFloatRegister(ins->input());
-+
-+  // The zero constant lives in the scratch double register.
-+  masm.loadConstantDouble(0.0, ScratchDoubleReg);
-+  masm.as_fcmpu(value, ScratchDoubleReg);
-+  masm.ma_cmp_set_dbl(result, Assembler::DoubleEqualOrUnordered);
-+}
-+
-+// Emits logical NOT of a float32 for LNotF: the output is set from a compare
-+// of the input against 0.0f under the DoubleEqualOrUnordered condition.
-+void CodeGenerator::visitNotF(LNotF* ins) {
-+  Register result = ToRegister(ins->output());
-+  FloatRegister value = ToFloatRegister(ins->input());
-+
-+  // The zero constant lives in the scratch float32 register.
-+  masm.loadConstantFloat32(0.0f, ScratchFloat32Reg);
-+  masm.as_fcmpu(value, ScratchFloat32Reg);
-+  masm.ma_cmp_set_dbl(result, Assembler::DoubleEqualOrUnordered);
-+}
-+
-+// ===============================================================
-+// Visitors: FP comparisons and branches
-+
-+// Emits a double comparison for LCompareD and materializes the boolean
-+// result in the output register.
-+void CodeGenerator::visitCompareD(LCompareD* comp) {
-+  FloatRegister left = ToFloatRegister(comp->left());
-+  FloatRegister right = ToFloatRegister(comp->right());
-+  Register result = ToRegister(comp->output());
-+
-+  // Strict (in)equality is mapped here; every other operator goes through
-+  // JSOpToDoubleCondition.
-+  Assembler::DoubleCondition cond;
-+  if (comp->mir()->jsop() == JSOp::StrictEq) {
-+    cond = Assembler::DoubleEqual;
-+  } else if (comp->mir()->jsop() == JSOp::StrictNe) {
-+    cond = Assembler::DoubleNotEqualOrUnordered;
-+  } else {
-+    cond = JSOpToDoubleCondition(comp->mir()->jsop());
-+  }
-+
-+  masm.as_fcmpu(left, right);
-+  masm.ma_cmp_set_dbl(result, cond);
-+}
-+
-+void CodeGenerator::visitCompareF(LCompareF* comp) {
-+  FloatRegister lhs = ToFloatRegister(comp->left());
-+  FloatRegister rhs = ToFloatRegister(comp->right());
-+  Register dest = ToRegister(comp->output());
-+  Assembler::DoubleCondition cond =
-+      comp->mir()->jsop() == JSOp::StrictEq ? Assembler::DoubleEqual
-+      : comp->mir()->jsop() == JSOp::StrictNe
-+          ? Assembler::DoubleNotEqualOrUnordered
-+          : JSOpToDoubleCondition(comp->mir()->jsop());
-+
-+  masm.as_fcmpu(lhs, rhs);
-+  masm.ma_cmp_set_dbl(dest, cond);
-+}
-+
-+void CodeGenerator::visitCompareDAndBranch(LCompareDAndBranch* comp) {
-+  FloatRegister lhs = ToFloatRegister(comp->left());
-+  FloatRegister rhs = ToFloatRegister(comp->right());
-+
-+  Assembler::DoubleCondition cond =
-+      JSOpToDoubleCondition(comp->cmpMir()->jsop());
-+  MBasicBlock* ifTrue = comp->ifTrue();
-+  MBasicBlock* ifFalse = comp->ifFalse();
-+
-+  if (isNextBlock(ifFalse->lir())) {
-+    branchToBlock(Assembler::DoubleFloat, cond, lhs, rhs, ifTrue);
-+  } else {
-+    branchToBlock(Assembler::DoubleFloat, Assembler::InvertCondition(cond), lhs,
-+                  rhs, ifFalse);
-+    jumpToBlock(ifTrue);
-+  }
-+}
-+
-+void CodeGenerator::visitCompareFAndBranch(LCompareFAndBranch* comp) {
-+  FloatRegister lhs = ToFloatRegister(comp->left());
-+  FloatRegister rhs = ToFloatRegister(comp->right());
-+
-+  Assembler::DoubleCondition cond =
-+      JSOpToDoubleCondition(comp->cmpMir()->jsop());
-+  MBasicBlock* ifTrue = comp->ifTrue();
-+  MBasicBlock* ifFalse = comp->ifFalse();
-+
-+  if (isNextBlock(ifFalse->lir())) {
-+    branchToBlock(Assembler::SingleFloat, cond, lhs, rhs, ifTrue);
-+  } else {
-+    branchToBlock(Assembler::SingleFloat, Assembler::InvertCondition(cond), lhs,
-+                  rhs, ifFalse);
-+    jumpToBlock(ifTrue);
-+  }
-+}
-+
-+void CodeGenerator::visitTestDAndBranch(LTestDAndBranch* test) {
-+  FloatRegister input = ToFloatRegister(test->input());
-+
-+  MBasicBlock* ifTrue = test->ifTrue();
-+  MBasicBlock* ifFalse = test->ifFalse();
-+
-+  masm.loadConstantDouble(0.0, ScratchDoubleReg);
-+
-+  if (isNextBlock(ifFalse->lir())) {
-+    branchToBlock(Assembler::DoubleFloat, Assembler::DoubleNotEqual, input,
-+                  ScratchDoubleReg, ifTrue);
-+  } else {
-+    branchToBlock(Assembler::DoubleFloat, Assembler::DoubleEqualOrUnordered,
-+                  input, ScratchDoubleReg, ifFalse);
-+    jumpToBlock(ifTrue);
-+  }
-+}
-+
-+void CodeGenerator::visitTestFAndBranch(LTestFAndBranch* test) {
-+  FloatRegister input = ToFloatRegister(test->input());
-+
-+  MBasicBlock* ifTrue = test->ifTrue();
-+  MBasicBlock* ifFalse = test->ifFalse();
-+
-+  masm.loadConstantFloat32(0.0f, ScratchFloat32Reg);
-+
-+  if (isNextBlock(ifFalse->lir())) {
-+    branchToBlock(Assembler::SingleFloat, Assembler::DoubleNotEqual, input,
-+                  ScratchFloat32Reg, ifTrue);
-+  } else {
-+    branchToBlock(Assembler::SingleFloat, Assembler::DoubleEqualOrUnordered,
-+                  input, ScratchFloat32Reg, ifFalse);
-+    jumpToBlock(ifTrue);
-+  }
-+}
-+
-+// ===============================================================
-+// Visitors: Truncation
-+
-+void CodeGenerator::visitTruncateDToInt32(LTruncateDToInt32* ins) {
-+  emitTruncateDouble(ToFloatRegister(ins->input()), ToRegister(ins->output()),
-+                     ins->mir());
-+}
-+
-+void CodeGenerator::visitTruncateFToInt32(LTruncateFToInt32* ins) {
-+  emitTruncateFloat32(ToFloatRegister(ins->input()), ToRegister(ins->output()),
-+                      ins->mir());
-+}
-+
-+// ===============================================================
-+// Visitors: Int64 / Wasm type conversions
-+
-+void CodeGenerator::visitExtendInt32ToInt64(LExtendInt32ToInt64* lir) {
-+  Register input = ToRegister(lir->input());
-+  Register output = ToRegister(lir->output());
-+
-+  if (lir->mir()->isUnsigned()) {
-+    masm.move32To64ZeroExtend(input, Register64(output));
-+  } else {
-+    masm.as_extsw(output, input);
-+  }
-+}
-+
-+void CodeGenerator::visitWrapInt64ToInt32(LWrapInt64ToInt32* lir) {
-+  const LInt64Allocation input = lir->input();
-+  Register output = ToRegister(lir->output());
-+
-+  if (lir->mir()->bottomHalf()) {
-+    if (input.value().isMemory()) {
-+      masm.load32(ToAddress(input), output);
-+    } else {
-+      masm.move64To32(ToRegister64(input), output);
-+    }
-+  } else {
-+    // The only producer of `bottomHalf=false` MWrapInt64ToInt32 in the
-+    // current MIR pipeline is the GPR-pair argument splitter in
-+    // WasmIonCompile.cpp, which is gated on JS_CODEGEN_REGISTER_PAIR
-+    // (32-bit ARM only). PPC64 is 64-bit and never reaches this path.
-+    // Matches the same defensive crash in x64 / ARM64 backends.
-+    MOZ_CRASH("Not implemented.");
-+  }
-+}
-+
-+void CodeGenerator::visitSignExtendInt64(LSignExtendInt64* lir) {
-+  Register64 input = ToRegister64(lir->input());
-+  Register64 output = ToOutRegister64(lir);
-+
-+  switch (lir->mir()->mode()) {
-+    case MSignExtendInt64::Byte:
-+      masm.as_extsb(output.reg, input.reg);
-+      break;
-+    case MSignExtendInt64::Half:
-+      masm.as_extsh(output.reg, input.reg);
-+      break;
-+    case MSignExtendInt64::Word:
-+      masm.as_extsw(output.reg, input.reg);
-+      break;
-+  }
-+}
-+
-+void CodeGenerator::visitWasmExtendU32Index(LWasmExtendU32Index* lir) {
-+  Register input = ToRegister(lir->input());
-+  Register output = ToRegister(lir->output());
-+  masm.move32To64ZeroExtend(input, Register64(output));
-+}
-+
-+void CodeGenerator::visitWasmWrapU32Index(LWasmWrapU32Index* lir) {
-+  Register input = ToRegister(lir->input());
-+  Register output = ToRegister(lir->output());
-+  masm.move32(input, output);
-+}
-+
-+void CodeGenerator::visitWasmTruncateToInt32(LWasmTruncateToInt32* lir) {
-+  auto input = ToFloatRegister(lir->input());
-+  auto output = ToRegister(lir->output());
-+
-+  MWasmTruncateToInt32* mir = lir->mir();
-+  MIRType fromType = mir->input()->type();
-+
-+  MOZ_ASSERT(fromType == MIRType::Double || fromType == MIRType::Float32);
-+
-+  auto* ool = new (alloc()) OutOfLineWasmTruncateCheck(mir, input, output);
-+  addOutOfLineCode(ool, mir);
-+
-+  Label* oolEntry = ool->entry();
-+  if (mir->isUnsigned()) {
-+    if (fromType == MIRType::Double) {
-+      masm.wasmTruncateDoubleToUInt32(input, output, mir->isSaturating(),
-+                                      oolEntry);
-+    } else if (fromType == MIRType::Float32) {
-+      masm.wasmTruncateFloat32ToUInt32(input, output, mir->isSaturating(),
-+                                       oolEntry);
-+    } else {
-+      MOZ_CRASH("unexpected type");
-+    }
-+
-+    masm.bind(ool->rejoin());
-+    return;
-+  }
-+
-+  if (fromType == MIRType::Double) {
-+    masm.wasmTruncateDoubleToInt32(input, output, mir->isSaturating(),
-+                                   oolEntry);
-+  } else if (fromType == MIRType::Float32) {
-+    masm.wasmTruncateFloat32ToInt32(input, output, mir->isSaturating(),
-+                                    oolEntry);
-+  } else {
-+    MOZ_CRASH("unexpected type");
-+  }
-+
-+  masm.bind(ool->rejoin());
-+}
-+
-+void CodeGenerator::visitWasmTruncateToInt64(LWasmTruncateToInt64* lir) {
-+  FloatRegister input = ToFloatRegister(lir->input());
-+  Register64 output = ToOutRegister64(lir);
-+
-+  MWasmTruncateToInt64* mir = lir->mir();
-+  MIRType fromType = mir->input()->type();
-+
-+  MOZ_ASSERT(fromType == MIRType::Double || fromType == MIRType::Float32);
-+
-+  auto* ool = new (alloc()) OutOfLineWasmTruncateCheck(mir, input, output);
-+  addOutOfLineCode(ool, mir);
-+
-+  Label* oolEntry = ool->entry();
-+  Label* oolRejoin = ool->rejoin();
-+  bool isSaturating = mir->isSaturating();
-+
-+  if (fromType == MIRType::Double) {
-+    if (mir->isUnsigned()) {
-+      masm.wasmTruncateDoubleToUInt64(input, output, isSaturating, oolEntry,
-+                                      oolRejoin, InvalidFloatReg);
-+    } else {
-+      masm.wasmTruncateDoubleToInt64(input, output, isSaturating, oolEntry,
-+                                     oolRejoin, InvalidFloatReg);
-+    }
-+  } else {
-+    if (mir->isUnsigned()) {
-+      masm.wasmTruncateFloat32ToUInt64(input, output, isSaturating, oolEntry,
-+                                       oolRejoin, InvalidFloatReg);
-+    } else {
-+      masm.wasmTruncateFloat32ToInt64(input, output, isSaturating, oolEntry,
-+                                      oolRejoin, InvalidFloatReg);
-+    }
-+  }
-+}
-+
-+void CodeGenerator::visitInt64ToFloatingPoint(LInt64ToFloatingPoint* lir) {
-+  Register64 input = ToRegister64(lir->input());
-+  FloatRegister output = ToFloatRegister(lir->output());
-+  MIRType outputType = lir->mir()->type();
-+
-+  if (outputType == MIRType::Double) {
-+    if (lir->mir()->isUnsigned()) {
-+      masm.convertUInt64ToDouble(input, output, Register::Invalid());
-+    } else {
-+      masm.convertInt64ToDouble(input, output);
-+    }
-+  } else {
-+    if (lir->mir()->isUnsigned()) {
-+      masm.convertUInt64ToFloat32(input, output, Register::Invalid());
-+    } else {
-+      masm.convertInt64ToFloat32(input, output);
-+    }
-+  }
-+}
-+
-+void CodeGenerator::visitWasmUint32ToDouble(LWasmUint32ToDouble* lir) {
-+  Register input = ToRegister(lir->input());
-+  FloatRegister output = ToFloatRegister(lir->output());
-+  masm.convertUInt32ToDouble(input, output);
-+}
-+
-+void CodeGenerator::visitWasmUint32ToFloat32(LWasmUint32ToFloat32* lir) {
-+  Register input = ToRegister(lir->input());
-+  FloatRegister output = ToFloatRegister(lir->output());
-+  masm.convertUInt32ToFloat32(input, output);
-+}
-+
-+void CodeGenerator::visitWasmBuiltinTruncateDToInt32(
-+    LWasmBuiltinTruncateDToInt32* lir) {
-+  emitTruncateDouble(ToFloatRegister(lir->getOperand(0)),
-+                     ToRegister(lir->getDef(0)), lir->mir());
-+}
-+
-+void CodeGenerator::visitWasmBuiltinTruncateFToInt32(
-+    LWasmBuiltinTruncateFToInt32* lir) {
-+  emitTruncateFloat32(ToFloatRegister(lir->getOperand(0)),
-+                      ToRegister(lir->getDef(0)), lir->mir());
-+}
-+
-+// ===============================================================
-+// Visitors: Wasm load/store
-+
-+template <typename T>
-+void CodeGeneratorPPC64::emitWasmLoad(T* lir) {
-+  const MWasmLoad* mir = lir->mir();
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+
-+  Register memoryBase = ToRegister(lir->memoryBase());
-+  Register ptr = ToRegister(lir->ptr());
-+  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
-+
-+  if (mir->base()->type() == MIRType::Int32) {
-+    masm.move32To64ZeroExtend(ptr, Register64(scratch));
-+    ptr = scratch;
-+    ptrScratch = ptrScratch != InvalidReg ? scratch : InvalidReg;
-+  }
-+
-+  masm.wasmLoad(mir->access(), memoryBase, ptr, ptrScratch,
-+                ToAnyRegister(lir->output()));
-+}
-+
-+template <typename T>
-+void CodeGeneratorPPC64::emitWasmStore(T* lir) {
-+  const MWasmStore* mir = lir->mir();
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+
-+  Register memoryBase = ToRegister(lir->memoryBase());
-+  Register ptr = ToRegister(lir->ptr());
-+  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
-+
-+  if (mir->base()->type() == MIRType::Int32) {
-+    masm.move32To64ZeroExtend(ptr, Register64(scratch));
-+    ptr = scratch;
-+    ptrScratch = ptrScratch != InvalidReg ? scratch : InvalidReg;
-+  }
-+
-+  masm.wasmStore(mir->access(), ToAnyRegister(lir->value()), memoryBase, ptr,
-+                 ptrScratch);
-+}
-+
-+void CodeGenerator::visitWasmLoad(LWasmLoad* lir) { emitWasmLoad(lir); }
-+
-+void CodeGenerator::visitWasmStore(LWasmStore* lir) { emitWasmStore(lir); }
-+
-+void CodeGenerator::visitWasmLoadI64(LWasmLoadI64* lir) {
-+  const MWasmLoad* mir = lir->mir();
-+
-+  Register memoryBase = ToRegister(lir->memoryBase());
-+  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
-+
-+  Register ptrReg = ToRegister(lir->ptr());
-+  if (mir->base()->type() == MIRType::Int32) {
-+    masm.move32ZeroExtendToPtr(ptrReg, ptrReg);
-+  }
-+
-+  masm.wasmLoadI64(mir->access(), memoryBase, ptrReg, ptrScratch,
-+                   ToOutRegister64(lir));
-+}
-+
-+void CodeGenerator::visitWasmStoreI64(LWasmStoreI64* lir) {
-+  const MWasmStore* mir = lir->mir();
-+
-+  Register memoryBase = ToRegister(lir->memoryBase());
-+  Register ptrScratch = ToTempRegisterOrInvalid(lir->temp0());
-+
-+  Register ptrReg = ToRegister(lir->ptr());
-+  if (mir->base()->type() == MIRType::Int32) {
-+    masm.move32ZeroExtendToPtr(ptrReg, ptrReg);
-+  }
-+
-+  masm.wasmStoreI64(mir->access(), ToRegister64(lir->value()), memoryBase,
-+                    ptrReg, ptrScratch);
-+}
-+
-+void CodeGenerator::visitAsmJSLoadHeap(LAsmJSLoadHeap* ins) {
-+  const MAsmJSLoadHeap* mir = ins->mir();
-+  MOZ_ASSERT(!mir->hasMemoryBase());
-+
-+  const LAllocation* ptr = ins->ptr();
-+  const LDefinition* output = ins->output();
-+  const LAllocation* boundsCheckLimit = ins->boundsCheckLimit();
-+
-+  Register ptrReg = ToRegister(ptr);
-+  Scalar::Type accessType = mir->accessType();
-+  bool isFloat = accessType == Scalar::Float32 || accessType == Scalar::Float64;
-+  Label done;
-+
-+  if (mir->needsBoundsCheck()) {
-+    Label boundsCheckPassed;
-+    Register boundsCheckLimitReg = ToRegister(boundsCheckLimit);
-+    masm.wasmBoundsCheck32(Assembler::Below, ptrReg, boundsCheckLimitReg,
-+                           &boundsCheckPassed);
-+    if (isFloat) {
-+      if (accessType == Scalar::Float32) {
-+        masm.loadConstantFloat32(GenericNaN(), ToFloatRegister(output));
-+      } else {
-+        masm.loadConstantDouble(GenericNaN(), ToFloatRegister(output));
-+      }
-+    } else {
-+      masm.movePtr(ImmWord(0), ToRegister(output));
-+    }
-+    masm.jump(&done);
-+    masm.bind(&boundsCheckPassed);
-+  }
-+
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+  masm.move32To64ZeroExtend(ptrReg, Register64(scratch));
-+
-+  switch (accessType) {
-+    case Scalar::Int8:
-+      masm.as_lbzx(ToRegister(output), HeapReg, scratch);
-+      masm.as_extsb(ToRegister(output), ToRegister(output));
-+      break;
-+    case Scalar::Uint8:
-+      masm.as_lbzx(ToRegister(output), HeapReg, scratch);
-+      break;
-+    case Scalar::Int16:
-+      masm.as_lhax(ToRegister(output), HeapReg, scratch);
-+      break;
-+    case Scalar::Uint16:
-+      masm.as_lhzx(ToRegister(output), HeapReg, scratch);
-+      break;
-+    case Scalar::Int32:
-+      masm.as_lwzx(ToRegister(output), HeapReg, scratch);
-+      masm.as_extsw(ToRegister(output), ToRegister(output));
-+      break;
-+    case Scalar::Uint32:
-+      masm.as_lwzx(ToRegister(output), HeapReg, scratch);
-+      break;
-+    case Scalar::Float64:
-+      masm.as_lfdx(ToFloatRegister(output), HeapReg, scratch);
-+      break;
-+    case Scalar::Float32:
-+      masm.as_lfsx(ToFloatRegister(output), HeapReg, scratch);
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected array type");
-+  }
-+
-+  if (done.used()) {
-+    masm.bind(&done);
-+  }
-+}
-+
-+void CodeGenerator::visitAsmJSStoreHeap(LAsmJSStoreHeap* ins) {
-+  const MAsmJSStoreHeap* mir = ins->mir();
-+  MOZ_ASSERT(!mir->hasMemoryBase());
-+
-+  const LAllocation* value = ins->value();
-+  const LAllocation* ptr = ins->ptr();
-+  const LAllocation* boundsCheckLimit = ins->boundsCheckLimit();
-+
-+  Register ptrReg = ToRegister(ptr);
-+
-+  Label done;
-+  if (mir->needsBoundsCheck()) {
-+    Register boundsCheckLimitReg = ToRegister(boundsCheckLimit);
-+    masm.wasmBoundsCheck32(Assembler::AboveOrEqual, ptrReg, boundsCheckLimitReg,
-+                           &done);
-+  }
-+
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+  masm.move32To64ZeroExtend(ptrReg, Register64(scratch));
-+
-+  switch (mir->accessType()) {
-+    case Scalar::Int8:
-+    case Scalar::Uint8:
-+      masm.as_stbx(ToRegister(value), HeapReg, scratch);
-+      break;
-+    case Scalar::Int16:
-+    case Scalar::Uint16:
-+      masm.as_sthx(ToRegister(value), HeapReg, scratch);
-+      break;
-+    case Scalar::Int32:
-+    case Scalar::Uint32:
-+      masm.as_stwx(ToRegister(value), HeapReg, scratch);
-+      break;
-+    case Scalar::Float64:
-+      masm.as_stfdx(ToFloatRegister(value), HeapReg, scratch);
-+      break;
-+    case Scalar::Float32:
-+      masm.as_stfsx(ToFloatRegister(value), HeapReg, scratch);
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected array type");
-+  }
-+
-+  if (done.used()) {
-+    masm.bind(&done);
-+  }
-+}
-+
-+void CodeGenerator::visitWasmStackArg(LWasmStackArg* ins) {
-+  const MWasmStackArg* mir = ins->mir();
-+  if (ins->arg()->isConstant()) {
-+    masm.storePtr(ImmWord(ToInt32(ins->arg())),
-+                  Address(StackPointer, mir->spOffset()));
-+  } else {
-+    if (ins->arg()->isGeneralReg()) {
-+      masm.storePtr(ToRegister(ins->arg()),
-+                    Address(StackPointer, mir->spOffset()));
-+    } else if (mir->input()->type() == MIRType::Double) {
-+      masm.storeDouble(ToFloatRegister(ins->arg()),
-+                       Address(StackPointer, mir->spOffset()));
-+#ifdef ENABLE_WASM_SIMD
-+    } else if (mir->input()->type() == MIRType::Simd128) {
-+      masm.storeUnalignedSimd128(ToFloatRegister(ins->arg()),
-+                                 Address(StackPointer, mir->spOffset()));
-+#endif
-+    } else {
-+      masm.storeFloat32(ToFloatRegister(ins->arg()),
-+                        Address(StackPointer, mir->spOffset()));
-+    }
-+  }
-+}
-+
-+void CodeGenerator::visitWasmStackArgI64(LWasmStackArgI64* ins) {
-+  const MWasmStackArg* mir = ins->mir();
-+  Address dst(StackPointer, mir->spOffset());
-+  if (IsConstant(ins->arg())) {
-+    masm.store64(Imm64(ToInt64(ins->arg())), dst);
-+  } else {
-+    masm.store64(ToRegister64(ins->arg()), dst);
-+  }
-+}
-+
-+void CodeGenerator::visitWasmSelect(LWasmSelect* ins) {
-+  MIRType mirType = ins->mir()->type();
-+
-+  Register cond = ToRegister(ins->condExpr());
-+  const LAllocation* falseExpr = ins->falseExpr();
-+
-+  if (mirType == MIRType::Int32 || mirType == MIRType::WasmAnyRef) {
-+    Register out = ToRegister(ins->output());
-+    MOZ_ASSERT(ToRegister(ins->trueExpr()) == out,
-+               "true expr input is reused for output");
-+    if (falseExpr->isGeneralReg()) {
-+      masm.moveIfZero(out, ToRegister(falseExpr), cond);
-+    } else {
-+      masm.cmp32Load32(Assembler::Zero, cond, cond, ToAddress(falseExpr), out);
-+    }
-+    return;
-+  }
-+
-+  FloatRegister out = ToFloatRegister(ins->output());
-+  MOZ_ASSERT(ToFloatRegister(ins->trueExpr()) == out,
-+             "true expr input is reused for output");
-+
-+  if (falseExpr->isFloatReg()) {
-+    Label done;
-+    // The select condition is a 32-bit value; test 32 bits so high-bit garbage
-+    // does not make a zero condition read as non-zero.
-+    masm.branchTest32(Assembler::NonZero, cond, cond, &done);
-+    if (mirType == MIRType::Float32) {
-+      masm.moveFloat32(ToFloatRegister(falseExpr), out);
-+    } else if (mirType == MIRType::Double) {
-+      masm.moveDouble(ToFloatRegister(falseExpr), out);
-+    } else if (mirType == MIRType::Simd128) {
-+      masm.moveSimd128(ToFloatRegister(falseExpr), out);
-+    } else {
-+      MOZ_CRASH("unhandled type in visitWasmSelect!");
-+    }
-+    masm.bind(&done);
-+  } else {
-+    Label done;
-+    // The select condition is a 32-bit value; test 32 bits so high-bit garbage
-+    // does not make a zero condition read as non-zero.
-+    masm.branchTest32(Assembler::NonZero, cond, cond, &done);
-+
-+    if (mirType == MIRType::Float32) {
-+      masm.loadFloat32(ToAddress(falseExpr), out);
-+    } else if (mirType == MIRType::Double) {
-+      masm.loadDouble(ToAddress(falseExpr), out);
-+    } else if (mirType == MIRType::Simd128) {
-+      masm.loadUnalignedSimd128(ToAddress(falseExpr), out);
-+    } else {
-+      MOZ_CRASH("unhandled type in visitWasmSelect!");
-+    }
-+
-+    masm.bind(&done);
-+  }
-+}
-+
-+void CodeGenerator::visitWasmSelectI64(LWasmSelectI64* lir) {
-+  MOZ_ASSERT(lir->mir()->type() == MIRType::Int64);
-+
-+  Register cond = ToRegister(lir->condExpr());
-+  LInt64Allocation falseExpr = lir->falseExpr();
-+
-+  Register64 out = ToOutRegister64(lir);
-+  MOZ_ASSERT(ToRegister64(lir->trueExpr()) == out,
-+             "true expr is reused for input");
-+
-+  if (falseExpr.value().isGeneralReg()) {
-+    masm.moveIfZero(out.reg, ToRegister(falseExpr.value()), cond);
-+  } else {
-+    Label done;
-+    // The select condition is a 32-bit value; test 32 bits so high-bit garbage
-+    // does not make a zero condition read as non-zero.
-+    masm.branchTest32(Assembler::NonZero, cond, cond, &done);
-+    masm.loadPtr(ToAddress(falseExpr.value()), out.reg);
-+    masm.bind(&done);
-+  }
-+}
-+
-+void CodeGenerator::visitWasmCompareAndSelect(LWasmCompareAndSelect* ins) {
-+  MCompare::CompareType compTy = ins->compareType();
-+  MIRType insTy = ins->mir()->type();
-+  const bool cmpIs32 = compTy == MCompare::Compare_Int32 ||
-+                       compTy == MCompare::Compare_UInt32;
-+  const bool cmpIs64 = compTy == MCompare::Compare_Int64 ||
-+                       compTy == MCompare::Compare_UInt64;
-+  const bool selIsInt = insTy == MIRType::Int32 || insTy == MIRType::Int64;
-+
-+  MOZ_RELEASE_ASSERT(
-+      (cmpIs32 || cmpIs64) && selIsInt,
-+      "CodeGenerator::visitWasmCompareAndSelect: unexpected types");
-+
-+  Register trueExprAndDest = ToRegister(ins->output());
-+  MOZ_ASSERT(ToRegister(ins->ifTrueExpr()) == trueExprAndDest,
-+             "true expr input is reused for output");
-+
-+  Assembler::Condition cond =
-+      Assembler::InvertCondition(JSOpToCondition(compTy, ins->jsop()));
-+  Register lhs = ToRegister(ins->leftExpr());
-+  Register rhs = ToRegister(ins->rightExpr());
-+  Register falseExpr = ToRegister(ins->ifFalseExpr());
-+
-+  // isel operates on the whole 64-bit GPR regardless of compare width; only
-+  // the compare instruction differs (cmpw/cmplw vs cmpd/cmpld).
-+  if (cmpIs32) {
-+    masm.cmp32Move32(cond, lhs, rhs, falseExpr, trueExprAndDest);
-+  } else {
-+    masm.cmpPtrMovePtr(cond, lhs, rhs, falseExpr, trueExprAndDest);
-+  }
-+}
-+
-+void CodeGenerator::visitWasmAddOffset(LWasmAddOffset* lir) {
-+  MWasmAddOffset* mir = lir->mir();
-+  Register base = ToRegister(lir->base());
-+  Register out = ToRegister(lir->output());
-+
-+  Label ok;
-+  masm.ma_add32TestCarry(Assembler::CarryClear, out, base, Imm32(mir->offset()),
-+                         &ok);
-+  masm.wasmTrap(wasm::Trap::OutOfBounds, mir->trapSiteDesc());
-+  masm.bind(&ok);
-+}
-+
-+void CodeGenerator::visitWasmAddOffset64(LWasmAddOffset64* lir) {
-+  MWasmAddOffset* mir = lir->mir();
-+  Register64 base = ToRegister64(lir->base());
-+  Register64 out = ToOutRegister64(lir);
-+
-+  Label ok;
-+  masm.ma_addPtrTestCarry(Assembler::CarryClear, out.reg, base.reg,
-+                          ImmWord(mir->offset()), &ok);
-+  masm.wasmTrap(wasm::Trap::OutOfBounds, mir->trapSiteDesc());
-+  masm.bind(&ok);
-+}
-+
-+// ===============================================================
-+// Visitors: Effective Address
-+
-+void CodeGenerator::visitEffectiveAddress2(LEffectiveAddress2* ins) {
-+  const MEffectiveAddress2* mir = ins->mir();
-+  Register output = ToRegister(ins->output());
-+
-+  // EA = index * scale + displacement (no base register)
-+  masm.movePtr(ImmWord(0), output);
-+  BaseIndex addr(output, ToRegister(ins->index()), mir->scale(),
-+                 mir->displacement());
-+  masm.computeEffectiveAddress(addr, output);
-+  // Sign-extend to 32-bit
-+  masm.as_extsw(output, output);
-+}
-+
-+void CodeGenerator::visitEffectiveAddress3(LEffectiveAddress3* ins) {
-+  const MEffectiveAddress3* mir = ins->mir();
-+  Register output = ToRegister(ins->output());
-+
-+  BaseIndex addr(ToRegister(ins->base()), ToRegister(ins->index()),
-+                 mir->scale(), mir->displacement());
-+  masm.computeEffectiveAddress(addr, output);
-+  // Sign-extend to 32-bit
-+  masm.as_extsw(output, output);
-+}
-+
-+void CodeGenerator::visitWasmMulI64WideHI64(LWasmMulI64WideHI64* ins) {
-+  Register lhs = ToRegister(ins->lhs());
-+  Register rhs = ToRegister(ins->rhs());
-+  Register output = ToRegister(ins->output());
-+
-+  if (ins->isSigned()) {
-+    masm.as_mulhd(output, lhs, rhs);
-+  } else {
-+    masm.as_mulhdu(output, lhs, rhs);
-+  }
-+}
-+
-+// ===============================================================
-+// Visitors: Typed Array Atomics
-+
-+void CodeGenerator::visitCompareExchangeTypedArrayElement(
-+    LCompareExchangeTypedArrayElement* lir) {
-+  Register elements = ToRegister(lir->elements());
-+  AnyRegister output = ToAnyRegister(lir->output());
-+  Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
-+
-+  Register oldval = ToRegister(lir->oldval());
-+  Register newval = ToRegister(lir->newval());
-+  Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
-+  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
-+  Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+
-+  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  dest.match([&](const auto& dest) {
-+    masm.compareExchangeJS(arrayType, Synchronization::Full(), dest, oldval,
-+                           newval, valueTemp, offsetTemp, maskTemp, outTemp,
-+                           output);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicExchangeTypedArrayElement(
-+    LAtomicExchangeTypedArrayElement* lir) {
-+  Register elements = ToRegister(lir->elements());
-+  AnyRegister output = ToAnyRegister(lir->output());
-+  Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
-+
-+  Register value = ToRegister(lir->value());
-+  Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
-+  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
-+  Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+
-+  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  dest.match([&](const auto& dest) {
-+    masm.atomicExchangeJS(arrayType, Synchronization::Full(), dest, value,
-+                          valueTemp, offsetTemp, maskTemp, outTemp, output);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicTypedArrayElementBinop(
-+    LAtomicTypedArrayElementBinop* lir) {
-+  MOZ_ASSERT(!lir->mir()->isForEffect());
-+
-+  AnyRegister output = ToAnyRegister(lir->output());
-+  Register elements = ToRegister(lir->elements());
-+  Register outTemp = ToTempRegisterOrInvalid(lir->temp0());
-+  Register valueTemp = ToTempRegisterOrInvalid(lir->temp1());
-+  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp2());
-+  Register maskTemp = ToTempRegisterOrInvalid(lir->temp3());
-+  Register value = ToRegister(lir->value());
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+
-+  auto mem = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  mem.match([&](const auto& mem) {
-+    masm.atomicFetchOpJS(arrayType, Synchronization::Full(),
-+                         lir->mir()->operation(), value, mem, valueTemp,
-+                         offsetTemp, maskTemp, outTemp, output);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicTypedArrayElementBinopForEffect(
-+    LAtomicTypedArrayElementBinopForEffect* lir) {
-+  MOZ_ASSERT(lir->mir()->isForEffect());
-+
-+  Register elements = ToRegister(lir->elements());
-+  Register valueTemp = ToTempRegisterOrInvalid(lir->temp0());
-+  Register offsetTemp = ToTempRegisterOrInvalid(lir->temp1());
-+  Register maskTemp = ToTempRegisterOrInvalid(lir->temp2());
-+  Register value = ToRegister(lir->value());
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+
-+  auto mem = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  mem.match([&](const auto& mem) {
-+    masm.atomicEffectOpJS(arrayType, Synchronization::Full(),
-+                          lir->mir()->operation(), value, mem, valueTemp,
-+                          offsetTemp, maskTemp);
-+  });
-+}
-+
-+void CodeGenerator::visitCompareExchangeTypedArrayElement64(
-+    LCompareExchangeTypedArrayElement64* lir) {
-+  Register elements = ToRegister(lir->elements());
-+  Register64 oldval = ToRegister64(lir->oldval());
-+  Register64 newval = ToRegister64(lir->newval());
-+  Register64 out = ToOutRegister64(lir);
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+
-+  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  dest.match([&](const auto& dest) {
-+    masm.compareExchange64(Synchronization::Full(), dest, oldval, newval, out);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicExchangeTypedArrayElement64(
-+    LAtomicExchangeTypedArrayElement64* lir) {
-+  Register elements = ToRegister(lir->elements());
-+  Register64 value = ToRegister64(lir->value());
-+  Register64 out = ToOutRegister64(lir);
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+
-+  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  dest.match([&](const auto& dest) {
-+    masm.atomicExchange64(Synchronization::Full(), dest, value, out);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicTypedArrayElementBinop64(
-+    LAtomicTypedArrayElementBinop64* lir) {
-+  MOZ_ASSERT(lir->mir()->hasUses());
-+
-+  Register elements = ToRegister(lir->elements());
-+  Register64 value = ToRegister64(lir->value());
-+  Register64 temp = ToRegister64(lir->temp0());
-+  Register64 out = ToOutRegister64(lir);
-+
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+  AtomicOp atomicOp = lir->mir()->operation();
-+
-+  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  dest.match([&](const auto& dest) {
-+    masm.atomicFetchOp64(Synchronization::Full(), atomicOp, value, dest, temp,
-+                         out);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicTypedArrayElementBinopForEffect64(
-+    LAtomicTypedArrayElementBinopForEffect64* lir) {
-+  MOZ_ASSERT(!lir->mir()->hasUses());
-+
-+  Register elements = ToRegister(lir->elements());
-+  Register64 value = ToRegister64(lir->value());
-+  Register64 temp = ToRegister64(lir->temp0());
-+
-+  Scalar::Type arrayType = lir->mir()->arrayType();
-+  AtomicOp atomicOp = lir->mir()->operation();
-+
-+  auto dest = ToAddressOrBaseIndex(elements, lir->index(), arrayType);
-+
-+  dest.match([&](const auto& dest) {
-+    masm.atomicEffectOp64(Synchronization::Full(), atomicOp, value, dest, temp);
-+  });
-+}
-+
-+void CodeGenerator::visitAtomicLoad64(LAtomicLoad64* lir) {
-+  Register elements = ToRegister(lir->elements());
-+  Register64 out = ToOutRegister64(lir);
-+  Scalar::Type storageType = lir->mir()->storageType();
-+
-+  auto source = ToAddressOrBaseIndex(elements, lir->index(), storageType);
-+
-+  auto sync = Synchronization::Load();
-+  masm.memoryBarrierBefore(sync);
-+  source.match([&](const auto& source) { masm.load64(source, out); });
-+  masm.memoryBarrierAfter(sync);
-+}
-+
-+// Emits a 64-bit store to a typed-array element, bracketed by the memory
-+// barriers that Synchronization::Store() calls for.
-+void CodeGenerator::visitAtomicStore64(LAtomicStore64* lir) {
-+  Scalar::Type elemType = lir->mir()->writeType();
-+  Register base = ToRegister(lir->elements());
-+  Register64 payload = ToRegister64(lir->value());
-+
-+  auto location = ToAddressOrBaseIndex(base, lir->index(), elemType);
-+  auto barrier = Synchronization::Store();
-+
-+  masm.memoryBarrierBefore(barrier);
-+  location.match([&](const auto& addr) { masm.store64(payload, addr); });
-+  masm.memoryBarrierAfter(barrier);
-+}
-+
-+// Wasm Atomics
-+// Emits a wasm heap compare-exchange on the location
-+// memoryBase + ptr + access offset. The three temps are read with
-+// ToTempRegisterOrInvalid and forwarded to the masm helper as-is.
-+void CodeGenerator::visitWasmCompareExchangeHeap(
-+    LWasmCompareExchangeHeap* ins) {
-+  MWasmCompareExchangeHeap* mir = ins->mir();
-+
-+  Register expected = ToRegister(ins->oldValue());
-+  Register replacement = ToRegister(ins->newValue());
-+  Register result = ToRegister(ins->output());
-+  Register tmpValue = ToTempRegisterOrInvalid(ins->temp0());
-+  Register tmpOffset = ToTempRegisterOrInvalid(ins->temp1());
-+  Register tmpMask = ToTempRegisterOrInvalid(ins->temp2());
-+
-+  BaseIndex heapAddr(ToRegister(ins->memoryBase()), ToRegister(ins->ptr()),
-+                     TimesOne, mir->access().offset32());
-+  masm.wasmCompareExchange(mir->access(), heapAddr, expected, replacement,
-+                           tmpValue, tmpOffset, tmpMask, result);
-+}
-+
-+// Emits a wasm heap atomic exchange on the location
-+// memoryBase + ptr + access offset. The three temps are read with
-+// ToTempRegisterOrInvalid and forwarded to the masm helper as-is.
-+void CodeGenerator::visitWasmAtomicExchangeHeap(LWasmAtomicExchangeHeap* ins) {
-+  MWasmAtomicExchangeHeap* mir = ins->mir();
-+
-+  Register incoming = ToRegister(ins->value());
-+  Register result = ToRegister(ins->output());
-+  Register tmpValue = ToTempRegisterOrInvalid(ins->temp0());
-+  Register tmpOffset = ToTempRegisterOrInvalid(ins->temp1());
-+  Register tmpMask = ToTempRegisterOrInvalid(ins->temp2());
-+
-+  BaseIndex heapAddr(ToRegister(ins->memoryBase()), ToRegister(ins->ptr()),
-+                     TimesOne, mir->access().offset32());
-+  masm.wasmAtomicExchange(mir->access(), heapAddr, incoming, tmpValue,
-+                          tmpOffset, tmpMask, result);
-+}
-+
-+// Emits a wasm heap atomic fetch-and-op whose old value is consumed (the MIR
-+// node is asserted to have uses); the result goes to the output register.
-+void CodeGenerator::visitWasmAtomicBinopHeap(LWasmAtomicBinopHeap* ins) {
-+  MWasmAtomicBinopHeap* mir = ins->mir();
-+  MOZ_ASSERT(mir->hasUses());
-+
-+  Register operand = ToRegister(ins->value());
-+  Register result = ToRegister(ins->output());
-+  Register tmpValue = ToTempRegisterOrInvalid(ins->temp0());
-+  Register tmpOffset = ToTempRegisterOrInvalid(ins->temp1());
-+  Register tmpMask = ToTempRegisterOrInvalid(ins->temp2());
-+
-+  BaseIndex heapAddr(ToRegister(ins->memoryBase()), ToRegister(ins->ptr()),
-+                     TimesOne, mir->access().offset32());
-+  masm.wasmAtomicFetchOp(mir->access(), mir->operation(), operand, heapAddr,
-+                         tmpValue, tmpOffset, tmpMask, result);
-+}
-+
-+// Emits a wasm heap atomic read-modify-write whose result is unused (the MIR
-+// node is asserted to have no uses), so no output register is involved.
-+void CodeGenerator::visitWasmAtomicBinopHeapForEffect(
-+    LWasmAtomicBinopHeapForEffect* ins) {
-+  MWasmAtomicBinopHeap* mir = ins->mir();
-+  MOZ_ASSERT(!mir->hasUses());
-+
-+  Register operand = ToRegister(ins->value());
-+  Register tmpValue = ToTempRegisterOrInvalid(ins->temp0());
-+  Register tmpOffset = ToTempRegisterOrInvalid(ins->temp1());
-+  Register tmpMask = ToTempRegisterOrInvalid(ins->temp2());
-+
-+  BaseIndex heapAddr(ToRegister(ins->memoryBase()), ToRegister(ins->ptr()),
-+                     TimesOne, mir->access().offset32());
-+  masm.wasmAtomicEffectOp(mir->access(), mir->operation(), operand, heapAddr,
-+                          tmpValue, tmpOffset, tmpMask);
-+}
-+
-+// Emits a 64-bit wasm heap compare-exchange on the location
-+// memoryBase + ptr + access offset, producing the result in the 64-bit
-+// output register.
-+void CodeGenerator::visitWasmCompareExchangeI64(LWasmCompareExchangeI64* lir) {
-+  uint32_t disp = lir->mir()->access().offset32();
-+  BaseIndex heapAddr(ToRegister(lir->memoryBase()), ToRegister(lir->ptr()),
-+                     TimesOne, disp);
-+
-+  Register64 expected = ToRegister64(lir->oldValue());
-+  Register64 replacement = ToRegister64(lir->newValue());
-+  Register64 result = ToOutRegister64(lir);
-+
-+  masm.wasmCompareExchange64(lir->mir()->access(), heapAddr, expected,
-+                             replacement, result);
-+}
-+
-+// Emits a 64-bit wasm heap atomic exchange on the location
-+// memoryBase + ptr + access offset, producing the result in the 64-bit
-+// output register.
-+void CodeGenerator::visitWasmAtomicExchangeI64(LWasmAtomicExchangeI64* lir) {
-+  uint32_t disp = lir->mir()->access().offset32();
-+  BaseIndex heapAddr(ToRegister(lir->memoryBase()), ToRegister(lir->ptr()),
-+                     TimesOne, disp);
-+
-+  Register64 incoming = ToRegister64(lir->value());
-+  Register64 result = ToOutRegister64(lir);
-+
-+  masm.wasmAtomicExchange64(lir->mir()->access(), heapAddr, incoming, result);
-+}
-+
-+// Emits a 64-bit wasm heap atomic fetch-and-op on the location
-+// memoryBase + ptr + access offset, using one 64-bit temp and producing the
-+// result in the 64-bit output register.
-+void CodeGenerator::visitWasmAtomicBinopI64(LWasmAtomicBinopI64* lir) {
-+  uint32_t disp = lir->mir()->access().offset32();
-+  BaseIndex heapAddr(ToRegister(lir->memoryBase()), ToRegister(lir->ptr()),
-+                     TimesOne, disp);
-+
-+  Register64 operand = ToRegister64(lir->value());
-+  Register64 scratch = ToRegister64(lir->temp0());
-+  Register64 result = ToOutRegister64(lir);
-+
-+  masm.wasmAtomicFetchOp64(lir->mir()->access(), lir->mir()->operation(),
-+                           operand, heapAddr, scratch, result);
-+}
-+
-+// SIMD code generators.
-+// Loads the instruction's 128-bit SIMD constant into its output register.
-+void CodeGenerator::visitSimd128(LSimd128* ins) {
-+  masm.loadConstantSimd128(ins->simd128(), ToFloatRegister(ins->output()));
-+}
-+// Dispatches a three-operand wasm SIMD op to the matching masm helper.
-+// Bitselect and the relaxed lane-selects are emitted as a single xxsel; the
-+// relaxed dot-add and fused multiply-add ops assert that the output register
-+// is v2's register before calling their helper. Any other op crashes.
-+void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
-+  FloatRegister first = ToFloatRegister(ins->v0());
-+  FloatRegister second = ToFloatRegister(ins->v1());
-+  FloatRegister third = ToFloatRegister(ins->v2());
-+  FloatRegister out = ToFloatRegister(ins->output());
-+  switch (ins->simdOp()) {
-+    // bitselect(v0, v1, v2): result = (v0 & v2) | (v1 & ~v2)
-+    // xxsel: XC=0→XA, XC=1→XB → (XA & ~XC) | (XB & XC)
-+    // Hence XA=v1, XB=v0, XC=v2. The relaxed lane-selects, with v2 as the
-+    // mask, are emitted the same way as bitselect.
-+    case wasm::SimdOp::V128Bitselect:
-+    case wasm::SimdOp::I8x16RelaxedLaneSelect:
-+    case wasm::SimdOp::I16x8RelaxedLaneSelect:
-+    case wasm::SimdOp::I32x4RelaxedLaneSelect:
-+    case wasm::SimdOp::I64x2RelaxedLaneSelect:
-+      masm.as_xxsel(out, second, first, third);
-+      break;
-+    // Lowering uses defineReuseInput on V2Index for ternary ops, so the
-+    // allocator is required to give the output v2's register. Each case
-+    // below asserts that; the FMA/dot helpers write their result through
-+    // v2 in place, so no trailing moveSimd128 is needed.
-+    case wasm::SimdOp::I32x4RelaxedDotI8x16I7x16AddS:
-+      MOZ_ASSERT(out == third);
-+      masm.dotInt8x16Int7x16ThenAdd(first, second, third,
-+                                    ToFloatRegister(ins->temp0()));
-+      break;
-+    case wasm::SimdOp::F32x4RelaxedMadd:
-+      MOZ_ASSERT(out == third);
-+      masm.fmaFloat32x4(first, second, third);
-+      break;
-+    case wasm::SimdOp::F64x2RelaxedMadd:
-+      MOZ_ASSERT(out == third);
-+      masm.fmaFloat64x2(first, second, third);
-+      break;
-+    case wasm::SimdOp::F32x4RelaxedNmadd:
-+      MOZ_ASSERT(out == third);
-+      masm.fnmaFloat32x4(first, second, third);
-+      break;
-+    case wasm::SimdOp::F64x2RelaxedNmadd:
-+      MOZ_ASSERT(out == third);
-+      masm.fnmaFloat64x2(first, second, third);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD ternary op");
-+  }
-+}
-+// Dispatches a two-operand wasm SIMD op to the matching masm helper.
-+// `lhs`, `rhs` and `dest` are the instruction's two inputs and its output,
-+// all as FloatRegisters. Most arms forward (lhs, rhs, dest) unchanged; the
-+// float min/max and I64x2Mul arms additionally pass two temps, and the
-+// relaxed float min/max arms use a two-register helper followed by a move.
-+// Any op without an arm crashes.
-+void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
-+  FloatRegister lhs = ToFloatRegister(ins->lhs());
-+  FloatRegister rhs = ToFloatRegister(ins->rhs());
-+  FloatRegister dest = ToFloatRegister(ins->output());
-+  switch (ins->simdOp()) {
-+    // Bitwise
-+    case wasm::SimdOp::V128And:
-+      masm.bitwiseAndSimd128(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::V128Or:
-+      masm.bitwiseOrSimd128(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::V128Xor:
-+      masm.bitwiseXorSimd128(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::V128AndNot:
-+      masm.bitwiseAndNotSimd128(lhs, rhs, dest);
-+      break;
-+    // Integer add
-+    case wasm::SimdOp::I8x16Add:
-+      masm.addInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8Add:
-+      masm.addInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Add:
-+      masm.addInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2Add:
-+      masm.addInt64x2(lhs, rhs, dest);
-+      break;
-+    // Integer sub
-+    case wasm::SimdOp::I8x16Sub:
-+      masm.subInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8Sub:
-+      masm.subInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Sub:
-+      masm.subInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2Sub:
-+      masm.subInt64x2(lhs, rhs, dest);
-+      break;
-+    // Saturating add
-+    case wasm::SimdOp::I8x16AddSatS:
-+      masm.addSatInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16AddSatU:
-+      masm.unsignedAddSatInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8AddSatS:
-+      masm.addSatInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8AddSatU:
-+      masm.unsignedAddSatInt16x8(lhs, rhs, dest);
-+      break;
-+    // Saturating sub
-+    case wasm::SimdOp::I8x16SubSatS:
-+      masm.subSatInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16SubSatU:
-+      masm.unsignedSubSatInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8SubSatS:
-+      masm.subSatInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8SubSatU:
-+      masm.unsignedSubSatInt16x8(lhs, rhs, dest);
-+      break;
-+    // Integer multiply
-+    case wasm::SimdOp::I16x8Mul:
-+      masm.mulInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Mul:
-+      masm.mulInt32x4(lhs, rhs, dest);
-+      break;
-+    // Integer min/max signed
-+    case wasm::SimdOp::I8x16MinS:
-+      masm.minInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16MaxS:
-+      masm.maxInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8MinS:
-+      masm.minInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8MaxS:
-+      masm.maxInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4MinS:
-+      masm.minInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4MaxS:
-+      masm.maxInt32x4(lhs, rhs, dest);
-+      break;
-+    // Integer min/max unsigned
-+    case wasm::SimdOp::I8x16MinU:
-+      masm.unsignedMinInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16MaxU:
-+      masm.unsignedMaxInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8MinU:
-+      masm.unsignedMinInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8MaxU:
-+      masm.unsignedMaxInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4MinU:
-+      masm.unsignedMinInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4MaxU:
-+      masm.unsignedMaxInt32x4(lhs, rhs, dest);
-+      break;
-+    // Average unsigned
-+    case wasm::SimdOp::I8x16AvgrU:
-+      masm.unsignedAverageInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8AvgrU:
-+      masm.unsignedAverageInt16x8(lhs, rhs, dest);
-+      break;
-+    // Q15 multiply
-+    case wasm::SimdOp::I16x8Q15MulrSatS:
-+      masm.q15MulrSatInt16x8(lhs, rhs, dest);
-+      break;
-+    // Integer compare
-+    case wasm::SimdOp::I8x16Eq:
-+      masm.compareInt8x16(Assembler::Equal, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16Ne:
-+      masm.compareInt8x16(Assembler::NotEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16LtS:
-+      masm.compareInt8x16(Assembler::LessThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16GtS:
-+      masm.compareInt8x16(Assembler::GreaterThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16LeS:
-+      masm.compareInt8x16(Assembler::LessThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16GeS:
-+      masm.compareInt8x16(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16LtU:
-+      masm.compareInt8x16(Assembler::Below, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16GtU:
-+      masm.compareInt8x16(Assembler::Above, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16LeU:
-+      masm.compareInt8x16(Assembler::BelowOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16GeU:
-+      masm.compareInt8x16(Assembler::AboveOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8Eq:
-+      masm.compareInt16x8(Assembler::Equal, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8Ne:
-+      masm.compareInt16x8(Assembler::NotEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8LtS:
-+      masm.compareInt16x8(Assembler::LessThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8GtS:
-+      masm.compareInt16x8(Assembler::GreaterThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8LeS:
-+      masm.compareInt16x8(Assembler::LessThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8GeS:
-+      masm.compareInt16x8(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8LtU:
-+      masm.compareInt16x8(Assembler::Below, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8GtU:
-+      masm.compareInt16x8(Assembler::Above, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8LeU:
-+      masm.compareInt16x8(Assembler::BelowOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8GeU:
-+      masm.compareInt16x8(Assembler::AboveOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Eq:
-+      masm.compareInt32x4(Assembler::Equal, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Ne:
-+      masm.compareInt32x4(Assembler::NotEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4LtS:
-+      masm.compareInt32x4(Assembler::LessThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4GtS:
-+      masm.compareInt32x4(Assembler::GreaterThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4LeS:
-+      masm.compareInt32x4(Assembler::LessThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4GeS:
-+      masm.compareInt32x4(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4LtU:
-+      masm.compareInt32x4(Assembler::Below, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4GtU:
-+      masm.compareInt32x4(Assembler::Above, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4LeU:
-+      masm.compareInt32x4(Assembler::BelowOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4GeU:
-+      masm.compareInt32x4(Assembler::AboveOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2Eq:
-+      masm.compareInt64x2(Assembler::Equal, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2Ne:
-+      masm.compareInt64x2(Assembler::NotEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2LtS:
-+      masm.compareInt64x2(Assembler::LessThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2GtS:
-+      masm.compareInt64x2(Assembler::GreaterThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2LeS:
-+      masm.compareInt64x2(Assembler::LessThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2GeS:
-+      masm.compareInt64x2(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
-+      break;
-+    // Float compare
-+    case wasm::SimdOp::F32x4Eq:
-+      masm.compareFloat32x4(Assembler::Equal, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Ne:
-+      masm.compareFloat32x4(Assembler::NotEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Lt:
-+      masm.compareFloat32x4(Assembler::LessThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Gt:
-+      masm.compareFloat32x4(Assembler::GreaterThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Le:
-+      masm.compareFloat32x4(Assembler::LessThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Ge:
-+      masm.compareFloat32x4(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Eq:
-+      masm.compareFloat64x2(Assembler::Equal, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Ne:
-+      masm.compareFloat64x2(Assembler::NotEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Lt:
-+      masm.compareFloat64x2(Assembler::LessThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Gt:
-+      masm.compareFloat64x2(Assembler::GreaterThan, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Le:
-+      masm.compareFloat64x2(Assembler::LessThanOrEqual, lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Ge:
-+      masm.compareFloat64x2(Assembler::GreaterThanOrEqual, lhs, rhs, dest);
-+      break;
-+    // Float arithmetic
-+    case wasm::SimdOp::F32x4Add:
-+      masm.addFloat32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Sub:
-+      masm.subFloat32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Mul:
-+      masm.mulFloat32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Div:
-+      masm.divFloat32x4(lhs, rhs, dest);
-+      break;
-+    // Non-relaxed float min/max take two extra temps, fetched by index.
-+    case wasm::SimdOp::F32x4Min:
-+      masm.minFloat32x4(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
-+                         ToFloatRegister(ins->getTemp(1)));
-+      break;
-+    case wasm::SimdOp::F32x4Max:
-+      masm.maxFloat32x4(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
-+                         ToFloatRegister(ins->getTemp(1)));
-+      break;
-+    case wasm::SimdOp::F32x4PMin:
-+      masm.pseudoMinFloat32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4PMax:
-+      masm.pseudoMaxFloat32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Add:
-+      masm.addFloat64x2(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Sub:
-+      masm.subFloat64x2(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Mul:
-+      masm.mulFloat64x2(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Div:
-+      masm.divFloat64x2(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Min:
-+      masm.minFloat64x2(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
-+                         ToFloatRegister(ins->getTemp(1)));
-+      break;
-+    case wasm::SimdOp::F64x2Max:
-+      masm.maxFloat64x2(lhs, rhs, dest, ToFloatRegister(ins->getTemp(0)),
-+                         ToFloatRegister(ins->getTemp(1)));
-+      break;
-+    case wasm::SimdOp::F64x2PMin:
-+      masm.pseudoMinFloat64x2(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2PMax:
-+      masm.pseudoMaxFloat64x2(lhs, rhs, dest);
-+      break;
-+    // Narrow
-+    case wasm::SimdOp::I8x16NarrowI16x8S:
-+      masm.narrowInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16NarrowI16x8U:
-+      masm.unsignedNarrowInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8NarrowI32x4S:
-+      masm.narrowInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8NarrowI32x4U:
-+      masm.unsignedNarrowInt32x4(lhs, rhs, dest);
-+      break;
-+    // i64 multiply: passes two temps, read with the OrInvalid accessor.
-+    case wasm::SimdOp::I64x2Mul: {
-+      FloatRegister temp0 = ToTempFloatRegisterOrInvalid(ins->temp0());
-+      FloatRegister temp1f = ToTempFloatRegisterOrInvalid(ins->temp1());
-+      masm.mulInt64x2(lhs, rhs, dest, temp0, temp1f);
-+      break;
-+    }
-+    // Extended multiply
-+    case wasm::SimdOp::I16x8ExtmulLowI8x16S:
-+      masm.extMulLowInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtmulHighI8x16S:
-+      masm.extMulHighInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtmulLowI8x16U:
-+      masm.unsignedExtMulLowInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtmulHighI8x16U:
-+      masm.unsignedExtMulHighInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtmulLowI16x8S:
-+      masm.extMulLowInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtmulHighI16x8S:
-+      masm.extMulHighInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtmulLowI16x8U:
-+      masm.unsignedExtMulLowInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtmulHighI16x8U:
-+      masm.unsignedExtMulHighInt16x8(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtmulLowI32x4S:
-+      masm.extMulLowInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtmulHighI32x4S:
-+      masm.extMulHighInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtmulLowI32x4U:
-+      masm.unsignedExtMulLowInt32x4(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtmulHighI32x4U:
-+      masm.unsignedExtMulHighInt32x4(lhs, rhs, dest);
-+      break;
-+    // Dot product
-+    case wasm::SimdOp::I32x4DotI16x8S:
-+      masm.widenDotInt16x8(lhs, rhs, dest);
-+      break;
-+    // Relaxed binary ops
-+    // NOTE(review): the two-register Relaxed min/max helpers are called as
-+    // (rhs, lhs) and `lhs` is then copied to `dest` when the two differ, so
-+    // they presumably write their result into `lhs`. Confirm that lowering
-+    // makes clobbering `lhs` safe when dest != lhs.
-+    case wasm::SimdOp::F32x4RelaxedMin:
-+      masm.minFloat32x4Relaxed(rhs, lhs);
-+      if (dest != lhs) masm.moveSimd128(lhs, dest);
-+      break;
-+    case wasm::SimdOp::F32x4RelaxedMax:
-+      masm.maxFloat32x4Relaxed(rhs, lhs);
-+      if (dest != lhs) masm.moveSimd128(lhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2RelaxedMin:
-+      masm.minFloat64x2Relaxed(rhs, lhs);
-+      if (dest != lhs) masm.moveSimd128(lhs, dest);
-+      break;
-+    case wasm::SimdOp::F64x2RelaxedMax:
-+      masm.maxFloat64x2Relaxed(rhs, lhs);
-+      if (dest != lhs) masm.moveSimd128(lhs, dest);
-+      break;
-+    case wasm::SimdOp::I8x16RelaxedSwizzle:
-+      masm.swizzleInt8x16Relaxed(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8RelaxedQ15MulrS:
-+      masm.q15MulrInt16x8Relaxed(lhs, rhs, dest);
-+      break;
-+    // Swizzle
-+    case wasm::SimdOp::I8x16Swizzle:
-+      masm.swizzleInt8x16(lhs, rhs, dest);
-+      break;
-+    case wasm::SimdOp::I16x8RelaxedDotI8x16I7x16S:
-+      masm.dotInt8x16Int7x16(lhs, rhs, dest);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD binary op");
-+  }
-+}
-+// Emits a wasm SIMD binary op whose right-hand operand is a constant. The
-+// constant is loaded into the scratch SIMD register first, after which the
-+// ordinary two-register masm helper is used. Ops without an arm crash.
-+void CodeGenerator::visitWasmBinarySimd128WithConstant(
-+    LWasmBinarySimd128WithConstant* ins) {
-+  FloatRegister src = ToFloatRegister(ins->lhs());
-+  FloatRegister out = ToFloatRegister(ins->output());
-+  SimdConstant imm = ins->rhs();
-+  // Materialize the constant operand before dispatching on the op.
-+  ScratchSimd128Scope constReg(masm);
-+  masm.loadConstantSimd128(imm, constReg);
-+  switch (ins->mir()->simdOp()) {
-+    // Bitwise
-+    case wasm::SimdOp::V128And:
-+      masm.bitwiseAndSimd128(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::V128Or:
-+      masm.bitwiseOrSimd128(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::V128Xor:
-+      masm.bitwiseXorSimd128(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::V128AndNot:
-+      masm.bitwiseAndNotSimd128(src, constReg, out);
-+      break;
-+    // Integer add
-+    case wasm::SimdOp::I8x16Add:
-+      masm.addInt8x16(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I16x8Add:
-+      masm.addInt16x8(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I32x4Add:
-+      masm.addInt32x4(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I64x2Add:
-+      masm.addInt64x2(src, constReg, out);
-+      break;
-+    // Integer sub
-+    case wasm::SimdOp::I8x16Sub:
-+      masm.subInt8x16(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I16x8Sub:
-+      masm.subInt16x8(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I32x4Sub:
-+      masm.subInt32x4(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I64x2Sub:
-+      masm.subInt64x2(src, constReg, out);
-+      break;
-+    // Integer multiply (16-/32-bit lanes only; the I64x2 arm just crashes)
-+    case wasm::SimdOp::I16x8Mul:
-+      masm.mulInt16x8(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I32x4Mul:
-+      masm.mulInt32x4(src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I64x2Mul:
-+      // Not expected to be reached on PPC64:
-+      // MWasmBinarySimd128::specializeForConstantRhs returns false in
-+      // Lowering-ppc64.cpp, so MIR pairing I64x2Mul with a constant rhs is
-+      // never created for this backend.
-+      //
-+      // An earlier in-place implementation here was wrong in three ways:
-+      // its hard-coded VR0/VR1 staging assumed an ordering the surrounding
-+      // code did not provide; a dead `mfvsrd(a, f0)` clobbered `a` right
-+      // before the next mfvsrd; and the final
-+      // `xxpermdi(dest, scratch, dest, 0)` with DM=0 put lane 0 in the
-+      // wrong half. Instead of keeping dead-but-broken code, crash loudly
-+      // should this ever become reachable; whoever enables it must write a
-+      // correct lowering (e.g. via masm.mulInt64x2 with explicit temps).
-+      MOZ_CRASH("PPC64: I64x2Mul with constant rhs unimplemented "
-+                "(specializeForConstantRhs returns false)");
-+    // Compare
-+    case wasm::SimdOp::I8x16Eq:
-+      masm.compareInt8x16(Assembler::Equal, src, constReg, out);
-+      break;
-+    case wasm::SimdOp::I8x16Ne:
-+      masm.compareInt8x16(Assembler::NotEqual, src, constReg, out);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD binary-with-constant op");
-+  }
-+}
-+// Emits a wasm SIMD shift whose shift count lives in a general-purpose
-+// register, dispatching on lane width and shift kind. Other ops crash.
-+void CodeGenerator::visitWasmVariableShiftSimd128(
-+    LWasmVariableShiftSimd128* ins) {
-+  FloatRegister vec = ToFloatRegister(ins->lhs());
-+  Register count = ToRegister(ins->rhs());
-+  FloatRegister out = ToFloatRegister(ins->output());
-+  switch (ins->mir()->simdOp()) {
-+    // 8-bit lanes
-+    case wasm::SimdOp::I8x16Shl:
-+      masm.leftShiftInt8x16(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I8x16ShrS:
-+      masm.rightShiftInt8x16(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I8x16ShrU:
-+      masm.unsignedRightShiftInt8x16(vec, count, out);
-+      break;
-+    // 16-bit lanes
-+    case wasm::SimdOp::I16x8Shl:
-+      masm.leftShiftInt16x8(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I16x8ShrS:
-+      masm.rightShiftInt16x8(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I16x8ShrU:
-+      masm.unsignedRightShiftInt16x8(vec, count, out);
-+      break;
-+    // 32-bit lanes
-+    case wasm::SimdOp::I32x4Shl:
-+      masm.leftShiftInt32x4(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I32x4ShrS:
-+      masm.rightShiftInt32x4(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I32x4ShrU:
-+      masm.unsignedRightShiftInt32x4(vec, count, out);
-+      break;
-+    // 64-bit lanes
-+    case wasm::SimdOp::I64x2Shl:
-+      masm.leftShiftInt64x2(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I64x2ShrS:
-+      masm.rightShiftInt64x2(vec, count, out);
-+      break;
-+    case wasm::SimdOp::I64x2ShrU:
-+      masm.unsignedRightShiftInt64x2(vec, count, out);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD variable shift op");
-+  }
-+}
-+// Emits a wasm SIMD shift by a compile-time constant count, dispatching on
-+// lane width and shift kind. Other ops crash.
-+void CodeGenerator::visitWasmConstantShiftSimd128(
-+    LWasmConstantShiftSimd128* ins) {
-+  FloatRegister vec = ToFloatRegister(ins->src());
-+  FloatRegister out = ToFloatRegister(ins->output());
-+  int32_t bits = ins->shift();
-+  Imm32 amount(bits);
-+  switch (ins->mir()->simdOp()) {
-+    // 8-bit lanes
-+    case wasm::SimdOp::I8x16Shl:
-+      masm.leftShiftInt8x16(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I8x16ShrS:
-+      masm.rightShiftInt8x16(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I8x16ShrU:
-+      masm.unsignedRightShiftInt8x16(amount, vec, out);
-+      break;
-+    // 16-bit lanes
-+    case wasm::SimdOp::I16x8Shl:
-+      masm.leftShiftInt16x8(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I16x8ShrS:
-+      masm.rightShiftInt16x8(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I16x8ShrU:
-+      masm.unsignedRightShiftInt16x8(amount, vec, out);
-+      break;
-+    // 32-bit lanes
-+    case wasm::SimdOp::I32x4Shl:
-+      masm.leftShiftInt32x4(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I32x4ShrS:
-+      masm.rightShiftInt32x4(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I32x4ShrU:
-+      masm.unsignedRightShiftInt32x4(amount, vec, out);
-+      break;
-+    // 64-bit lanes
-+    case wasm::SimdOp::I64x2Shl:
-+      masm.leftShiftInt64x2(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I64x2ShrS:
-+      masm.rightShiftInt64x2(amount, vec, out);
-+      break;
-+    case wasm::SimdOp::I64x2ShrU:
-+      masm.unsignedRightShiftInt64x2(amount, vec, out);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD constant shift op");
-+  }
-+}
-+// Replicates each lane's sign bit across the whole lane by emitting an
-+// arithmetic right shift of (lane width - 1) bits. Only the signed
-+// right-shift ops are accepted; anything else crashes.
-+void CodeGenerator::visitWasmSignReplicationSimd128(
-+    LWasmSignReplicationSimd128* ins) {
-+  FloatRegister vec = ToFloatRegister(ins->src());
-+  FloatRegister out = ToFloatRegister(ins->output());
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I8x16ShrS:
-+      masm.rightShiftInt8x16(Imm32(7), vec, out);
-+      break;
-+    case wasm::SimdOp::I16x8ShrS:
-+      masm.rightShiftInt16x8(Imm32(15), vec, out);
-+      break;
-+    case wasm::SimdOp::I32x4ShrS:
-+      masm.rightShiftInt32x4(Imm32(31), vec, out);
-+      break;
-+    case wasm::SimdOp::I64x2ShrS:
-+      masm.rightShiftInt64x2(Imm32(63), vec, out);
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected sign replication op");
-+  }
-+}
-+// Emits a two-input byte shuffle: the instruction's control constant is
-+// viewed as a byte array and handed to masm.shuffleInt8x16 together with
-+// both inputs and the output register.
-+void CodeGenerator::visitWasmShuffleSimd128(LWasmShuffleSimd128* ins) {
-+  SimdConstant control = ins->control();
-+  const uint8_t* laneBytes = reinterpret_cast<const uint8_t*>(control.bytes());
-+  masm.shuffleInt8x16(laneBytes, ToFloatRegister(ins->lhs()),
-+                      ToFloatRegister(ins->rhs()),
-+                      ToFloatRegister(ins->output()));
-+}
-+void CodeGenerator::visitWasmPermuteSimd128(LWasmPermuteSimd128* ins) {
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  FloatRegister dest = ToFloatRegister(ins->output());
-+  // PPC64: the shuffle analysis transforms control bytes into specialized
-+  // formats. Reconstruct raw Wasm byte indices for our vperm implementation.
-+  SimdConstant ctrl = ins->control();
-+  uint8_t rawLanes[16];
-+  switch (ins->op()) {
-+    case SimdPermuteOp::MOVE:
-+      masm.moveSimd128(src, dest);
-+      return;
-+    case SimdPermuteOp::PERMUTE_32x4: {
-+      const int32_t* words = reinterpret_cast<const int32_t*>(ctrl.bytes());
-+      for (int i = 0; i < 4; i++)
-+        for (int j = 0; j < 4; j++)
-+          rawLanes[i * 4 + j] = words[i] * 4 + j;
-+      break;
-+    }
-+    case SimdPermuteOp::PERMUTE_16x8: {
-+      // control has int16 halfword indices. High byte of halfs[0] may have
-+      // platform-specific flags (Perm16x8Action). Mask to get the index only.
-+      const int16_t* halfs = reinterpret_cast<const int16_t*>(ctrl.bytes());
-+      for (int i = 0; i < 8; i++) {
-+        int hwIdx = halfs[i] & 0x7;
-+        rawLanes[i * 2] = hwIdx * 2;
-+        rawLanes[i * 2 + 1] = hwIdx * 2 + 1;
-+      }
-+      break;
-+    }
-+    case SimdPermuteOp::BROADCAST_8x16: {
-+      uint8_t lane = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
-+      for (int i = 0; i < 16; i++) rawLanes[i] = lane;
-+      break;
-+    }
-+    case SimdPermuteOp::BROADCAST_16x8: {
-+      uint8_t lane = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
-+      for (int i = 0; i < 8; i++) {
-+        rawLanes[i * 2] = lane * 2;
-+        rawLanes[i * 2 + 1] = lane * 2 + 1;
-+      }
-+      break;
-+    }
-+    case SimdPermuteOp::ROTATE_RIGHT_8x16: {
-+      uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
-+      for (int i = 0; i < 16; i++) rawLanes[i] = (i + shift) % 16;
-+      break;
-+    }
-+    case SimdPermuteOp::SHIFT_LEFT_8x16: {
-+      // Shifted-out positions must be zero. Use index 16+ to pick from zero.
-+      uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
-+      for (int i = 0; i < 16; i++)
-+        rawLanes[i] = (i >= shift) ? (i - shift) : (16 + i);
-+      goto needsZeroRhs;
-+    }
-+    case SimdPermuteOp::SHIFT_RIGHT_8x16: {
-+      uint8_t shift = reinterpret_cast<const int8_t*>(ctrl.bytes())[0];
-+      for (int i = 0; i < 16; i++)
-+        rawLanes[i] = (i + shift < 16) ? (i + shift) : (16 + i);
-+      goto needsZeroRhs;
-+    }
-+    case SimdPermuteOp::REVERSE_16x8: {
-+      // Reverse bytes within each 16-bit lane: [1,0,3,2,5,4,...]
-+      for (int i = 0; i < 8; i++) {
-+        rawLanes[i * 2] = i * 2 + 1;
-+        rawLanes[i * 2 + 1] = i * 2;
-+      }
-+      break;
-+    }
-+    case SimdPermuteOp::REVERSE_32x4: {
-+      // Reverse bytes within each 32-bit lane: [3,2,1,0,7,6,5,4,...]
-+      for (int i = 0; i < 4; i++)
-+        for (int j = 0; j < 4; j++)
-+          rawLanes[i * 4 + j] = i * 4 + (3 - j);
-+      break;
-+    }
-+    case SimdPermuteOp::REVERSE_64x2: {
-+      // Reverse bytes within each 64-bit lane: [7,6,5,4,3,2,1,0,15,...]
-+      for (int i = 0; i < 2; i++)
-+        for (int j = 0; j < 8; j++)
-+          rawLanes[i * 8 + j] = i * 8 + (7 - j);
-+      break;
-+    }
-+    case SimdPermuteOp::ZERO_EXTEND_8x16_TO_16x8:
-+    case SimdPermuteOp::ZERO_EXTEND_8x16_TO_32x4:
-+    case SimdPermuteOp::ZERO_EXTEND_8x16_TO_64x2:
-+    case SimdPermuteOp::ZERO_EXTEND_16x8_TO_32x4:
-+    case SimdPermuteOp::ZERO_EXTEND_16x8_TO_64x2:
-+    case SimdPermuteOp::ZERO_EXTEND_32x4_TO_64x2: {
-+      const int8_t* bytes = reinterpret_cast<const int8_t*>(ctrl.bytes());
-+      for (int i = 0; i < 16; i++) rawLanes[i] = bytes[i];
-+      goto needsZeroRhs;
-+    }
-+    default: {
-+      // PERMUTE_8x16 and others: control has raw byte indices.
-+      const int8_t* bytes = reinterpret_cast<const int8_t*>(ctrl.bytes());
-+      for (int i = 0; i < 16; i++) rawLanes[i] = bytes[i];
-+      break;
-+    }
-+  }
-+  masm.shuffleInt8x16(rawLanes, src, src, dest);
-+  return;
-+
-+  needsZeroRhs: {
-+    // Wasm convention: rawLanes[i] in 0..15 selects src.LE_byte[idx], and
-+    // rawLanes[i] >= 16 means "zero". Without spilling, we can't satisfy
-+    // vperm's three-input constraint AND keep src alive when dest == src.
-+    // Strategy: vperm src with itself (any valid byte for the "zero"
-+    // positions, bytes get masked out below), then AND with a mask that
-+    // zeros those positions.
-+    int8_t ctrl[16], mask[16];
-+    for (unsigned i = 0; i < 16; i++) {
-+      uint8_t idx = rawLanes[i];
-+      if (idx < 16) {
-+        ctrl[i] = 15 - idx;
-+        mask[i] = -1;
-+      } else {
-+        ctrl[i] = 0;
-+        mask[i] = 0;
-+      }
-+    }
-+    ScratchSimd128Scope scratch(masm);
-+    masm.loadConstantSimd128(SimdConstant::CreateX16(ctrl), scratch);
-+    masm.as_vperm(dest.encoding() & 31,
-+                  src.encoding() & 31,
-+                  src.encoding() & 31,
-+                  scratch.encoding() & 31);
-+    masm.loadConstantSimd128(SimdConstant::CreateX16(mask), scratch);
-+    masm.as_xxland(dest, dest, scratch);
-+    return;
-+  }
-+}
-+void CodeGenerator::visitWasmReplaceLaneSimd128(LWasmReplaceLaneSimd128* ins) {
-+  FloatRegister lhsDest = ToFloatRegister(ins->output());
-+  MOZ_ASSERT(ToFloatRegister(ins->lhs()) == lhsDest);
-+  uint32_t lane = ins->mir()->laneIndex();
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I8x16ReplaceLane:
-+      masm.replaceLaneInt8x16(lane, ToRegister(ins->rhs()), lhsDest);
-+      break;
-+    case wasm::SimdOp::I16x8ReplaceLane:
-+      masm.replaceLaneInt16x8(lane, ToRegister(ins->rhs()), lhsDest);
-+      break;
-+    case wasm::SimdOp::I32x4ReplaceLane:
-+      masm.replaceLaneInt32x4(lane, ToRegister(ins->rhs()), lhsDest);
-+      break;
-+    case wasm::SimdOp::F32x4ReplaceLane:
-+      masm.replaceLaneFloat32x4(lane, ToFloatRegister(ins->rhs()), lhsDest);
-+      break;
-+    case wasm::SimdOp::F64x2ReplaceLane:
-+      masm.replaceLaneFloat64x2(lane, ToFloatRegister(ins->rhs()), lhsDest);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD replace lane op");
-+  }
-+}
-+void CodeGenerator::visitWasmReplaceInt64LaneSimd128(
-+    LWasmReplaceInt64LaneSimd128* ins) {
-+  MOZ_ASSERT(ins->mir()->simdOp() == wasm::SimdOp::I64x2ReplaceLane);
-+  FloatRegister lhsDest = ToFloatRegister(ins->output());
-+  MOZ_ASSERT(ToFloatRegister(ins->lhs()) == lhsDest);
-+  masm.replaceLaneInt64x2(ins->mir()->laneIndex(),
-+                          ToRegister64(ins->rhs()), lhsDest);
-+}
-+void CodeGenerator::visitWasmScalarToSimd128(LWasmScalarToSimd128* ins) {
-+  FloatRegister dest = ToFloatRegister(ins->output());
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I8x16Splat:
-+      masm.splatX16(ToRegister(ins->src()), dest);
-+      break;
-+    case wasm::SimdOp::I16x8Splat:
-+      masm.splatX8(ToRegister(ins->src()), dest);
-+      break;
-+    case wasm::SimdOp::I32x4Splat:
-+      masm.splatX4(ToRegister(ins->src()), dest);
-+      break;
-+    case wasm::SimdOp::F32x4Splat:
-+      masm.splatX4(ToFloatRegister(ins->src()), dest);
-+      break;
-+    case wasm::SimdOp::F64x2Splat:
-+      masm.splatX2(ToFloatRegister(ins->src()), dest);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD scalar-to-simd op");
-+  }
-+}
-+void CodeGenerator::visitWasmInt64ToSimd128(LWasmInt64ToSimd128* ins) {
-+  FloatRegister dest = ToFloatRegister(ins->output());
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I64x2Splat:
-+      masm.splatX2(ToRegister64(ins->src()), dest);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD int64-to-simd op");
-+  }
-+}
-+void CodeGenerator::visitWasmUnarySimd128(LWasmUnarySimd128* ins) {
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  FloatRegister dest = ToFloatRegister(ins->output());
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I8x16Neg:
-+      masm.negInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I16x8Neg:
-+      masm.negInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Neg:
-+      masm.negInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I64x2Neg:
-+      masm.negInt64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::I8x16Abs:
-+      masm.absInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I16x8Abs:
-+      masm.absInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4Abs:
-+      masm.absInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I64x2Abs:
-+      masm.absInt64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::V128Not:
-+      masm.bitwiseNotSimd128(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Neg:
-+      masm.negFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Neg:
-+      masm.negFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Abs:
-+      masm.absFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Abs:
-+      masm.absFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Sqrt:
-+      masm.sqrtFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Sqrt:
-+      masm.sqrtFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Ceil:
-+      masm.ceilFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Ceil:
-+      masm.ceilFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Floor:
-+      masm.floorFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Floor:
-+      masm.floorFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Trunc:
-+      masm.truncFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Trunc:
-+      masm.truncFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4Nearest:
-+      masm.nearestFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2Nearest:
-+      masm.nearestFloat64x2(src, dest);
-+      break;
-+    // Conversions
-+    case wasm::SimdOp::F32x4ConvertI32x4S:
-+      masm.convertInt32x4ToFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4ConvertI32x4U:
-+      masm.unsignedConvertInt32x4ToFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4TruncSatF32x4S:
-+      masm.truncSatFloat32x4ToInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4TruncSatF32x4U:
-+      masm.unsignedTruncSatFloat32x4ToInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2ConvertLowI32x4S:
-+      masm.convertInt32x4ToFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2ConvertLowI32x4U:
-+      masm.unsignedConvertInt32x4ToFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::F32x4DemoteF64x2Zero:
-+      masm.convertFloat64x2ToFloat32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::F64x2PromoteLowF32x4:
-+      masm.convertFloat32x4ToFloat64x2(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4TruncSatF64x2SZero:
-+      masm.truncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I32x4TruncSatF64x2UZero:
-+      masm.unsignedTruncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
-+      break;
-+    // Widen
-+    case wasm::SimdOp::I16x8ExtendLowI8x16S:
-+      masm.widenLowInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtendHighI8x16S:
-+      masm.widenHighInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtendLowI8x16U:
-+      masm.unsignedWidenLowInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtendHighI8x16U:
-+      masm.unsignedWidenHighInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtendLowI16x8S:
-+      masm.widenLowInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtendHighI16x8S:
-+      masm.widenHighInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtendLowI16x8U:
-+      masm.unsignedWidenLowInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtendHighI16x8U:
-+      masm.unsignedWidenHighInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtendLowI32x4S:
-+      masm.widenLowInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtendHighI32x4S:
-+      masm.widenHighInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtendLowI32x4U:
-+      masm.unsignedWidenLowInt32x4(src, dest);
-+      break;
-+    case wasm::SimdOp::I64x2ExtendHighI32x4U:
-+      masm.unsignedWidenHighInt32x4(src, dest);
-+      break;
-+    // Extended add pairwise
-+    case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
-+      masm.extAddPairwiseInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U:
-+      masm.unsignedExtAddPairwiseInt8x16(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S:
-+      masm.extAddPairwiseInt16x8(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U:
-+      masm.unsignedExtAddPairwiseInt16x8(src, dest);
-+      break;
-+    // Relaxed truncation
-+    case wasm::SimdOp::I32x4RelaxedTruncF32x4S:
-+      masm.truncFloat32x4ToInt32x4Relaxed(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4RelaxedTruncF32x4U:
-+      masm.unsignedTruncFloat32x4ToInt32x4Relaxed(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4RelaxedTruncF64x2SZero:
-+      masm.truncFloat64x2ToInt32x4Relaxed(src, dest);
-+      break;
-+    case wasm::SimdOp::I32x4RelaxedTruncF64x2UZero:
-+      masm.unsignedTruncFloat64x2ToInt32x4Relaxed(src, dest);
-+      break;
-+    // Popcnt
-+    case wasm::SimdOp::I8x16Popcnt:
-+      masm.popcntInt8x16(src, dest);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD unary op");
-+  }
-+}
-+void CodeGenerator::visitWasmReduceSimd128(LWasmReduceSimd128* ins) {
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  uint32_t imm = ins->mir()->imm();
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I8x16ExtractLaneS:
-+      masm.extractLaneInt8x16(imm, src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I8x16ExtractLaneU:
-+      masm.unsignedExtractLaneInt8x16(imm, src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I16x8ExtractLaneS:
-+      masm.extractLaneInt16x8(imm, src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I16x8ExtractLaneU:
-+      masm.unsignedExtractLaneInt16x8(imm, src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I32x4ExtractLane:
-+      masm.extractLaneInt32x4(imm, src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::F32x4ExtractLane:
-+      masm.extractLaneFloat32x4(imm, src, ToFloatRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::F64x2ExtractLane:
-+      masm.extractLaneFloat64x2(imm, src, ToFloatRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::V128AnyTrue:
-+      masm.anyTrueSimd128(src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I8x16AllTrue:
-+      masm.allTrueInt8x16(src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I16x8AllTrue:
-+      masm.allTrueInt16x8(src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I32x4AllTrue:
-+      masm.allTrueInt32x4(src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I64x2AllTrue:
-+      masm.allTrueInt64x2(src, ToRegister(ins->output()));
-+      break;
-+    case wasm::SimdOp::I8x16Bitmask:
-+      masm.bitmaskInt8x16(src, ToRegister(ins->output()), ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I16x8Bitmask:
-+      masm.bitmaskInt16x8(src, ToRegister(ins->output()), ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I32x4Bitmask:
-+      masm.bitmaskInt32x4(src, ToRegister(ins->output()), ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I64x2Bitmask:
-+      masm.bitmaskInt64x2(src, ToRegister(ins->output()), ScratchSimd128Reg);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD reduce op");
-+  }
-+}
-+void CodeGenerator::visitWasmReduceAndBranchSimd128(
-+    LWasmReduceAndBranchSimd128* ins) {
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  UseScratchRegisterScope temps(masm);
-+  Register tmp = temps.Acquire();
-+  switch (ins->simdOp()) {
-+    case wasm::SimdOp::V128AnyTrue:
-+      masm.anyTrueSimd128(src, tmp);
-+      break;
-+    case wasm::SimdOp::I8x16AllTrue:
-+      masm.allTrueInt8x16(src, tmp);
-+      break;
-+    case wasm::SimdOp::I16x8AllTrue:
-+      masm.allTrueInt16x8(src, tmp);
-+      break;
-+    case wasm::SimdOp::I32x4AllTrue:
-+      masm.allTrueInt32x4(src, tmp);
-+      break;
-+    case wasm::SimdOp::I64x2AllTrue:
-+      masm.allTrueInt64x2(src, tmp);
-+      break;
-+    case wasm::SimdOp::I8x16Bitmask:
-+      masm.bitmaskInt8x16(src, tmp, ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I16x8Bitmask:
-+      masm.bitmaskInt16x8(src, tmp, ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I32x4Bitmask:
-+      masm.bitmaskInt32x4(src, tmp, ScratchSimd128Reg);
-+      break;
-+    case wasm::SimdOp::I64x2Bitmask:
-+      masm.bitmaskInt64x2(src, tmp, ScratchSimd128Reg);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD reduce-and-branch op");
-+  }
-+  masm.as_cmpdi(tmp, 0);
-+  // Branch to ifTrue if nonzero, fall through to ifFalse.
-+  Label* ifTrue = skipTrivialBlocks(ins->ifTrue())->lir()->label();
-+  Label* ifFalse = skipTrivialBlocks(ins->ifFalse())->lir()->label();
-+  masm.ma_b(Assembler::NotEqual, ifTrue);
-+  masm.jump(ifFalse);
-+}
-+void CodeGenerator::visitWasmReduceSimd128ToInt64(
-+    LWasmReduceSimd128ToInt64* ins) {
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  Register64 dest = ToOutRegister64(ins);
-+  switch (ins->mir()->simdOp()) {
-+    case wasm::SimdOp::I64x2ExtractLane:
-+      masm.extractLaneInt64x2(ins->mir()->imm(), src, dest);
-+      break;
-+    default:
-+      MOZ_CRASH("PPC64: NYI SIMD reduce-to-int64 op");
-+  }
-+}
-+static inline wasm::MemoryAccessDesc DeriveMemoryAccessDesc(
-+    const wasm::MemoryAccessDesc& access, Scalar::Type type) {
-+  return wasm::MemoryAccessDesc(access.memoryIndex(), type, access.align(),
-+                                access.offset32(), access.trapDesc(),
-+                                access.isHugeMemory());
-+}
-+
-+void CodeGenerator::visitWasmLoadLaneSimd128(LWasmLoadLaneSimd128* ins) {
-+  const MWasmLoadLaneSimd128* mir = ins->mir();
-+  Register memoryBase = ToRegister(ins->memoryBase());
-+  Register ptr = ToRegister(ins->ptr());
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  FloatRegister dest = ToFloatRegister(ins->output());
-+  UseScratchRegisterScope temps(masm);
-+  Register tmp = temps.Acquire();
-+  masm.moveSimd128(src, dest);
-+  switch (mir->laneSize()) {
-+    case 1:
-+      masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int8),
-+                    memoryBase, ptr, ptr, AnyRegister(tmp));
-+      masm.replaceLaneInt8x16(mir->laneIndex(), tmp, dest);
-+      break;
-+    case 2:
-+      masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int16),
-+                    memoryBase, ptr, ptr, AnyRegister(tmp));
-+      masm.replaceLaneInt16x8(mir->laneIndex(), tmp, dest);
-+      break;
-+    case 4:
-+      masm.wasmLoad(DeriveMemoryAccessDesc(mir->access(), Scalar::Int32),
-+                    memoryBase, ptr, ptr, AnyRegister(tmp));
-+      masm.replaceLaneInt32x4(mir->laneIndex(), tmp, dest);
-+      break;
-+    case 8: {
-+      masm.wasmLoadI64(DeriveMemoryAccessDesc(mir->access(), Scalar::Int64),
-+                       memoryBase, ptr, ptr,
-+                       Register64(tmp));
-+      masm.replaceLaneInt64x2(mir->laneIndex(), Register64(tmp), dest);
-+      break;
-+    }
-+    default:
-+      MOZ_CRASH("Unexpected lane size");
-+  }
-+}
-+void CodeGenerator::visitWasmStoreLaneSimd128(LWasmStoreLaneSimd128* ins) {
-+  const MWasmStoreLaneSimd128* mir = ins->mir();
-+  Register memoryBase = ToRegister(ins->memoryBase());
-+  Register ptr = ToRegister(ins->ptr());
-+  FloatRegister src = ToFloatRegister(ins->src());
-+  UseScratchRegisterScope temps(masm);
-+  Register tmp = temps.Acquire();
-+  switch (mir->laneSize()) {
-+    case 1:
-+      masm.unsignedExtractLaneInt8x16(mir->laneIndex(), src, tmp);
-+      masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int8),
-+                     AnyRegister(tmp), memoryBase, ptr, ptr);
-+      break;
-+    case 2:
-+      masm.unsignedExtractLaneInt16x8(mir->laneIndex(), src, tmp);
-+      masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int16),
-+                     AnyRegister(tmp), memoryBase, ptr, ptr);
-+      break;
-+    case 4:
-+      masm.extractLaneInt32x4(mir->laneIndex(), src, tmp);
-+      masm.wasmStore(DeriveMemoryAccessDesc(mir->access(), Scalar::Int32),
-+                     AnyRegister(tmp), memoryBase, ptr, ptr);
-+      break;
-+    case 8:
-+      masm.extractLaneInt64x2(mir->laneIndex(), src, Register64(tmp));
-+      masm.wasmStoreI64(DeriveMemoryAccessDesc(mir->access(), Scalar::Int64),
-+                        Register64(tmp), memoryBase, ptr, ptr);
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected lane size");
-+  }
-+}
-+
-+}  // namespace jit
-+}  // namespace js
-diff --git a/js/src/jit/ppc64/CodeGenerator-ppc64.h b/js/src/jit/ppc64/CodeGenerator-ppc64.h
-new file mode 100644
-index 000000000000..3414eceb5ac4
---- /dev/null
-+++ b/js/src/jit/ppc64/CodeGenerator-ppc64.h
-@@ -0,0 +1,101 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_CodeGenerator_ppc64_h
-+#define jit_ppc64_CodeGenerator_ppc64_h
-+
-+#include "jit/ppc64/Assembler-ppc64.h"
-+#include "jit/shared/CodeGenerator-shared.h"
-+
-+namespace js {
-+namespace jit {
-+
-+class CodeGeneratorPPC64;
-+class OutOfLineTableSwitch;
-+
-+using OutOfLineWasmTruncateCheck =
-+    OutOfLineWasmTruncateCheckBase<CodeGeneratorPPC64>;
-+
-+class CodeGeneratorPPC64 : public CodeGeneratorShared {
-+  friend class MoveResolverPPC64;
-+
-+ protected:
-+  CodeGeneratorPPC64(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm,
-+                     const wasm::CodeMetadata* codeMeta);
-+
-+  NonAssertingLabel deoptLabel_;
-+
-+  Operand ToOperand(const LAllocation& a);
-+  Operand ToOperand(const LAllocation* a);
-+  MoveOperand toMoveOperand(LAllocation a) const;
-+
-+  template <typename T1, typename T2>
-+  void bailoutCmp32(Assembler::Condition c, T1 lhs, T2 rhs,
-+                    LSnapshot* snapshot) {
-+    Label bail;
-+    masm.branch32(c, lhs, rhs, &bail);
-+    bailoutFrom(&bail, snapshot);
-+  }
-+  template <typename T1, typename T2>
-+  void bailoutCmpPtr(Assembler::Condition c, T1 lhs, T2 rhs,
-+                     LSnapshot* snapshot) {
-+    Label bail;
-+    masm.branchPtr(c, lhs, rhs, &bail);
-+    bailoutFrom(&bail, snapshot);
-+  }
-+  template <typename T1, typename T2>
-+  void bailoutTest32(Assembler::Condition c, T1 lhs, T2 rhs,
-+                     LSnapshot* snapshot) {
-+    Label bail;
-+    masm.branchTest32(c, lhs, rhs, &bail);
-+    bailoutFrom(&bail, snapshot);
-+  }
-+  void bailoutIfFalseBool(Register lhs, LSnapshot* snapshot);
-+  void bailoutFrom(Label* label, LSnapshot* snapshot);
-+  void bailout(LSnapshot* snapshot);
-+
-+ protected:
-+  bool generateOutOfLineCode();
-+  void branchToBlock(MBasicBlock* block);
-+
-+  template <typename T>
-+  void branchToBlock(Assembler::Condition cond, Register lhs, T rhs,
-+                     MBasicBlock* mir) {
-+    Label* label = skipTrivialBlocks(mir)->lir()->label();
-+    masm.branch32(cond, lhs, rhs, label);
-+  }
-+  void branchToBlock(Assembler::DoubleCondition cond, FloatRegister lhs,
-+                     FloatRegister rhs, MBasicBlock* mir);
-+  void branchToBlock(Assembler::FloatFormat fmt,
-+                     Assembler::DoubleCondition cond, FloatRegister lhs,
-+                     FloatRegister rhs, MBasicBlock* mir);
-+
-+  void emitTableSwitchDispatch(MTableSwitch* mir, Register index,
-+                               Register base);
-+
-+  void emitBigIntPtrDiv(LBigIntPtrDiv* ins, Register dividend, Register divisor,
-+                        Register output);
-+  void emitBigIntPtrMod(LBigIntPtrMod* ins, Register dividend, Register divisor,
-+                        Register output);
-+
-+  void generateInvalidateEpilogue();
-+
-+  template <typename T>
-+  void emitWasmLoad(T* lir);
-+  template <typename T>
-+  void emitWasmStore(T* lir);
-+
-+ public:
-+  void visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool);
-+  void visitOutOfLineWasmTruncateCheck(OutOfLineWasmTruncateCheck* ool);
-+};
-+
-+typedef CodeGeneratorPPC64 CodeGeneratorSpecific;
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_CodeGenerator_ppc64_h */
-diff --git a/js/src/jit/ppc64/LIR-ppc64.h b/js/src/jit/ppc64/LIR-ppc64.h
-new file mode 100644
-index 000000000000..686875056127
---- /dev/null
-+++ b/js/src/jit/ppc64/LIR-ppc64.h
-@@ -0,0 +1,135 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_LIR_ppc64_h
-+#define jit_ppc64_LIR_ppc64_h
-+
-+namespace js {
-+namespace jit {
-+
-+class LUnbox : public LInstructionHelper<1, BOX_PIECES, 0> {
-+ public:
-+  LIR_HEADER(Unbox);
-+
-+  explicit LUnbox(const LAllocation& input) : LInstructionHelper(classOpcode) {
-+    setOperand(0, input);
-+  }
-+
-+  static const size_t Input = 0;
-+
-+  LBoxAllocation input() const { return getBoxOperand(Input); }
-+
-+  MUnbox* mir() const { return mir_->toUnbox(); }
-+  const char* extraName() const { return StringFromMIRType(mir()->type()); }
-+};
-+
-+class LUDivOrMod : public LBinaryMath<0> {
-+ public:
-+  LIR_HEADER(UDivOrMod);
-+
-+  LUDivOrMod() : LBinaryMath(classOpcode) {}
-+
-+  MBinaryArithInstruction* mir() const {
-+    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
-+    return static_cast<MBinaryArithInstruction*>(mir_);
-+  }
-+
-+  bool canBeDivideByZero() const {
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->canBeDivideByZero();
-+    }
-+    return mir_->toDiv()->canBeDivideByZero();
-+  }
-+
-+  bool trapOnError() const {
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->trapOnError();
-+    }
-+    return mir_->toDiv()->trapOnError();
-+  }
-+
-+  wasm::TrapSiteDesc trapSiteDesc() const {
-+    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->trapSiteDesc();
-+    }
-+    return mir_->toDiv()->trapSiteDesc();
-+  }
-+};
-+
-+class LDivOrModI64 : public LBinaryMath<0> {
-+ public:
-+  LIR_HEADER(DivOrModI64);
-+
-+  LDivOrModI64(const LAllocation& lhs, const LAllocation& rhs)
-+      : LBinaryMath(classOpcode) {
-+    setOperand(0, lhs);
-+    setOperand(1, rhs);
-+  }
-+
-+  MBinaryArithInstruction* mir() const {
-+    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
-+    return static_cast<MBinaryArithInstruction*>(mir_);
-+  }
-+
-+  bool canBeDivideByZero() const {
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->canBeDivideByZero();
-+    }
-+    return mir_->toDiv()->canBeDivideByZero();
-+  }
-+  bool canBeNegativeOverflow() const {
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->canBeNegativeDividend();
-+    }
-+    return mir_->toDiv()->canBeNegativeOverflow();
-+  }
-+  wasm::TrapSiteDesc trapSiteDesc() const {
-+    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->trapSiteDesc();
-+    }
-+    return mir_->toDiv()->trapSiteDesc();
-+  }
-+};
-+
-+class LUDivOrModI64 : public LBinaryMath<0> {
-+ public:
-+  LIR_HEADER(UDivOrModI64);
-+
-+  LUDivOrModI64(const LAllocation& lhs, const LAllocation& rhs)
-+      : LBinaryMath(classOpcode) {
-+    setOperand(0, lhs);
-+    setOperand(1, rhs);
-+  }
-+
-+  const char* extraName() const {
-+    return mir()->isTruncated() ? "Truncated" : nullptr;
-+  }
-+
-+  MBinaryArithInstruction* mir() const {
-+    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
-+    return static_cast<MBinaryArithInstruction*>(mir_);
-+  }
-+  bool canBeDivideByZero() const {
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->canBeDivideByZero();
-+    }
-+    return mir_->toDiv()->canBeDivideByZero();
-+  }
-+  wasm::TrapSiteDesc trapSiteDesc() const {
-+    MOZ_ASSERT(mir_->isDiv() || mir_->isMod());
-+    if (mir_->isMod()) {
-+      return mir_->toMod()->trapSiteDesc();
-+    }
-+    return mir_->toDiv()->trapSiteDesc();
-+  }
-+};
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_LIR_ppc64_h */
-diff --git a/js/src/jit/ppc64/Lowering-ppc64.cpp b/js/src/jit/ppc64/Lowering-ppc64.cpp
-new file mode 100644
-index 000000000000..be0ead19d273
---- /dev/null
-+++ b/js/src/jit/ppc64/Lowering-ppc64.cpp
-@@ -0,0 +1,1324 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/Lowering-ppc64.h"
-+
-+#include "mozilla/MathAlgorithms.h"
-+
-+#include "jit/Lowering.h"
-+#include "jit/MIR-wasm.h"
-+#include "jit/MIR.h"
-+#include "jit/ppc64/Assembler-ppc64.h"
-+#include "wasm/WasmFeatures.h"  // for wasm::ReportSimdAnalysis
-+
-+#include "jit/shared/Lowering-shared-inl.h"
-+
-+using namespace js;
-+using namespace js::jit;
-+
-+using mozilla::FloorLog2;
-+
-+namespace js {
-+namespace jit {
-+
-+LTableSwitch* LIRGeneratorPPC64::newLTableSwitch(const LAllocation& in,
-+                                                 const LDefinition& inputCopy) {
-+  return new (alloc()) LTableSwitch(in, inputCopy, temp());
-+}
-+
-+LTableSwitchV* LIRGeneratorPPC64::newLTableSwitchV(const LBoxAllocation& in) {
-+  return new (alloc()) LTableSwitchV(in, temp(), tempDouble(), temp());
-+}
-+
-+void LIRGeneratorPPC64::lowerForShift(LInstructionHelper<1, 2, 0>* ins,
-+                                      MDefinition* mir, MDefinition* lhs,
-+                                      MDefinition* rhs) {
-+  lowerForALU(ins, mir, lhs, rhs);
-+}
-+
-+template <class LInstr>
-+void LIRGeneratorPPC64::lowerForShiftInt64(LInstr* ins, MDefinition* mir,
-+                                           MDefinition* lhs, MDefinition* rhs) {
-+  if constexpr (std::is_same_v<LInstr, LShiftI64>) {
-+    ins->setLhs(useInt64RegisterAtStart(lhs));
-+    ins->setRhs(useRegisterOrConstantAtStart(rhs));
-+  } else {
-+    ins->setInput(useInt64RegisterAtStart(lhs));
-+    ins->setCount(useRegisterOrConstantAtStart(rhs));
-+  }
-+  defineInt64(ins, mir);
-+}
-+
-+template void LIRGeneratorPPC64::lowerForShiftInt64(LShiftI64* ins,
-+                                                    MDefinition* mir,
-+                                                    MDefinition* lhs,
-+                                                    MDefinition* rhs);
-+template void LIRGeneratorPPC64::lowerForShiftInt64(LRotateI64* ins,
-+                                                    MDefinition* mir,
-+                                                    MDefinition* lhs,
-+                                                    MDefinition* rhs);
-+
-+void LIRGeneratorPPC64::lowerForALU(LInstructionHelper<1, 1, 0>* ins,
-+                                    MDefinition* mir, MDefinition* input) {
-+  ins->setOperand(0, useRegisterAtStart(input));
-+  define(ins, mir);
-+}
-+
-+void LIRGeneratorPPC64::lowerForALU(LInstructionHelper<1, 2, 0>* ins,
-+                                    MDefinition* mir, MDefinition* lhs,
-+                                    MDefinition* rhs) {
-+  ins->setOperand(0, useRegisterAtStart(lhs));
-+  ins->setOperand(1, useRegisterOrConstantAtStart(rhs));
-+  define(ins, mir);
-+}
-+
-+void LIRGeneratorPPC64::lowerForALUInt64(
-+    LInstructionHelper<INT64_PIECES, INT64_PIECES, 0>* ins, MDefinition* mir,
-+    MDefinition* input) {
-+  ins->setInt64Operand(0, useInt64RegisterAtStart(input));
-+  defineInt64(ins, mir);
-+}
-+
-+void LIRGeneratorPPC64::lowerForALUInt64(
-+    LInstructionHelper<INT64_PIECES, 2 * INT64_PIECES, 0>* ins,
-+    MDefinition* mir, MDefinition* lhs, MDefinition* rhs) {
-+  ins->setInt64Operand(0, useInt64RegisterAtStart(lhs));
-+  ins->setInt64Operand(INT64_PIECES, useInt64RegisterOrConstantAtStart(rhs));
-+  defineInt64(ins, mir);
-+}
-+
-+void LIRGeneratorPPC64::lowerForMulInt64(LMulI64* ins, MMul* mir,
-+                                         MDefinition* lhs, MDefinition* rhs) {
-+  lowerForALUInt64(ins, mir, lhs, rhs);
-+}
-+
-+void LIRGeneratorPPC64::lowerForFPU(LInstructionHelper<1, 1, 0>* ins,
-+                                    MDefinition* mir, MDefinition* input) {
-+  ins->setOperand(0, useRegisterAtStart(input));
-+  define(ins, mir);
-+}
-+
-+void LIRGeneratorPPC64::lowerForFPU(LInstructionHelper<1, 2, 0>* ins,
-+                                    MDefinition* mir, MDefinition* lhs,
-+                                    MDefinition* rhs) {
-+  ins->setOperand(0, useRegisterAtStart(lhs));
-+  ins->setOperand(1, useRegisterAtStart(rhs));
-+  define(ins, mir);
-+}
-+
-+LBoxAllocation LIRGeneratorPPC64::useBoxFixed(MDefinition* mir, Register reg1,
-+                                              Register reg2, bool useAtStart) {
-+  MOZ_ASSERT(mir->type() == MIRType::Value);
-+
-+  ensureDefined(mir);
-+  return LBoxAllocation(LUse(reg1, mir->virtualRegister(), useAtStart));
-+}
-+
-+// Byte-sized operations take an ordinary register use here; this simply
-+// forwards to useRegister().
-+LAllocation LIRGeneratorPPC64::useByteOpRegister(MDefinition* mir) {
-+  return useRegister(mir);
-+}
-+
-+// At-start variant of useByteOpRegister(); forwards to useRegisterAtStart().
-+LAllocation LIRGeneratorPPC64::useByteOpRegisterAtStart(MDefinition* mir) {
-+  return useRegisterAtStart(mir);
-+}
-+
-+// Byte-op operand that may also be a non-double constant; forwards to the
-+// generic useRegisterOrNonDoubleConstant().
-+LAllocation LIRGeneratorPPC64::useByteOpRegisterOrNonDoubleConstant(
-+    MDefinition* mir) {
-+  return useRegisterOrNonDoubleConstant(mir);
-+}
-+
-+// Temp for byte-sized operations: an ordinary temp() definition.
-+LDefinition LIRGeneratorPPC64::tempByteOpRegister() { return temp(); }
-+
-+// Temp made available to unboxing sequences: an ordinary temp() definition.
-+LDefinition LIRGeneratorPPC64::tempToUnbox() { return temp(); }
-+
-+// Untyped (Value) phi inputs are lowered through the typed-phi path here.
-+void LIRGeneratorPPC64::lowerUntypedPhiInput(MPhi* phi, uint32_t inputPosition,
-+                                             LBlock* block, size_t lirIndex) {
-+  lowerTypedPhiInput(phi, inputPosition, block, lirIndex);
-+}
-+
-+// Int64 phi inputs are lowered through the typed-phi path here.
-+void LIRGeneratorPPC64::lowerInt64PhiInput(MPhi* phi, uint32_t inputPosition,
-+                                           LBlock* block, size_t lirIndex) {
-+  lowerTypedPhiInput(phi, inputPosition, block, lirIndex);
-+}
-+
-+// Int64 phis are defined through the typed-phi path here.
-+void LIRGeneratorPPC64::defineInt64Phi(MPhi* phi, size_t lirIndex) {
-+  defineTypedPhi(phi, lirIndex);
-+}
-+
-+// Lowers a 32-bit integer multiply. A bailout snapshot is attached when the
-+// multiply is fallible.
-+void LIRGeneratorPPC64::lowerMulI(MMul* mul, MDefinition* lhs,
-+                                  MDefinition* rhs) {
-+  LMulI* lir = new (alloc()) LMulI;
-+  if (mul->fallible()) {
-+    assignSnapshot(lir, mul->bailoutKind());
-+  }
-+  // When a negative-zero result is possible and rhs is not a constant, both
-+  // operands are taken with useRegister (not at-start).
-+  // NOTE(review): presumably so the inputs stay intact for the codegen's
-+  // -0 check after the product is written -- confirm against the visitor.
-+  if (mul->canBeNegativeZero() && !rhs->isConstant()) {
-+    lir->setOperand(0, useRegister(lhs));
-+    lir->setOperand(1, useRegister(rhs));
-+    define(lir, mul);
-+    return;
-+  }
-+  lowerForALU(lir, mul, lhs, rhs);
-+}
-+
-+// Lowers a 32-bit signed division. A constant positive power-of-two divisor
-+// becomes LDivPowTwoI carrying the shift amount; everything else becomes a
-+// general LDivI with one temp. Fallible divisions get a bailout snapshot.
-+void LIRGeneratorPPC64::lowerDivI(MDiv* div) {
-+  if (div->rhs()->isConstant()) {
-+    int32_t rhs = div->rhs()->toConstant()->toInt32();
-+    int32_t shift = FloorLog2(uint32_t(rhs));
-+    // rhs > 0 is tested first, so 1 << shift is only evaluated for positive
-+    // divisors.
-+    if (rhs > 0 && 1 << shift == rhs) {
-+      LDivPowTwoI* lir =
-+          new (alloc()) LDivPowTwoI(useRegister(div->lhs()), shift);
-+      if (div->fallible()) {
-+        assignSnapshot(lir, div->bailoutKind());
-+      }
-+      define(lir, div);
-+      return;
-+    }
-+  }
-+  LDivI* lir = new (alloc())
-+      LDivI(useRegister(div->lhs()), useRegister(div->rhs()), temp());
-+  if (div->fallible()) {
-+    assignSnapshot(lir, div->bailoutKind());
-+  }
-+  define(lir, div);
-+}
-+
-+// Lowers a signed 64-bit division to LDivOrModI64 with both operands in
-+// registers. No snapshot is assigned here.
-+void LIRGeneratorPPC64::lowerDivI64(MDiv* div) {
-+  auto* lir = new (alloc())
-+      LDivOrModI64(useRegister(div->lhs()), useRegister(div->rhs()));
-+  defineInt64(lir, div);
-+}
-+
-+// Lowers a 32-bit signed modulus. Constant divisors get two special cases:
-+//   * positive power of two      -> LModPowTwoI(lhs, shift)
-+//   * (1 << (shift + 1)) - 1     -> LModMaskI(lhs, temp, temp, shift + 1)
-+// Anything else becomes a general LModI. Fallible operations get a bailout
-+// snapshot in every path.
-+void LIRGeneratorPPC64::lowerModI(MMod* mod) {
-+  if (mod->rhs()->isConstant()) {
-+    int32_t rhs = mod->rhs()->toConstant()->toInt32();
-+    int32_t shift = FloorLog2(uint32_t(rhs));
-+    if (rhs > 0 && 1 << shift == rhs) {
-+      LModPowTwoI* lir =
-+          new (alloc()) LModPowTwoI(useRegister(mod->lhs()), shift);
-+      if (mod->fallible()) {
-+        assignSnapshot(lir, mod->bailoutKind());
-+      }
-+      define(lir, mod);
-+      return;
-+    // shift < 31 keeps 1 << (shift + 1) within the int range.
-+    } else if (shift < 31 && (1 << (shift + 1)) - 1 == rhs) {
-+      LModMaskI* lir = new (alloc())
-+          LModMaskI(useRegister(mod->lhs()), temp(), temp(), shift + 1);
-+      if (mod->fallible()) {
-+        assignSnapshot(lir, mod->bailoutKind());
-+      }
-+      define(lir, mod);
-+      return;
-+    }
-+  }
-+  auto* lir =
-+      new (alloc()) LModI(useRegister(mod->lhs()), useRegister(mod->rhs()));
-+  if (mod->fallible()) {
-+    assignSnapshot(lir, mod->bailoutKind());
-+  }
-+  define(lir, mod);
-+}
-+
-+// Lowers a signed 64-bit modulus. It shares the LDivOrModI64 node with
-+// lowerDivI64; both operands are in registers.
-+void LIRGeneratorPPC64::lowerModI64(MMod* mod) {
-+  auto* lir = new (alloc())
-+      LDivOrModI64(useRegister(mod->lhs()), useRegister(mod->rhs()));
-+  defineInt64(lir, mod);
-+}
-+
-+// Lowers an unsigned 32-bit division to LUDivOrMod (the same node lowerUMod
-+// uses). Fallible divisions get a bailout snapshot.
-+void LIRGeneratorPPC64::lowerUDiv(MDiv* div) {
-+  MDefinition* lhs = div->getOperand(0);
-+  MDefinition* rhs = div->getOperand(1);
-+  LUDivOrMod* lir = new (alloc()) LUDivOrMod;
-+  // useRegisterAtStart: CodeGenerator-ppc64's visitUDivOrMod zero-extends
-+  // lhs/rhs into their own slots in place before the 32-bit divwu, so the
-+  // inputs must not be required live after the LIR op begins.
-+  lir->setOperand(0, useRegisterAtStart(lhs));
-+  lir->setOperand(1, useRegisterAtStart(rhs));
-+  if (div->fallible()) {
-+    assignSnapshot(lir, div->bailoutKind());
-+  }
-+  define(lir, div);
-+}
-+
-+// Lowers an unsigned 64-bit division to LUDivOrModI64 with both operands in
-+// registers. No snapshot is assigned here.
-+void LIRGeneratorPPC64::lowerUDivI64(MDiv* div) {
-+  auto* lir = new (alloc())
-+      LUDivOrModI64(useRegister(div->lhs()), useRegister(div->rhs()));
-+  defineInt64(lir, div);
-+}
-+
-+// Lowers an unsigned 32-bit modulus to LUDivOrMod (the same node lowerUDiv
-+// uses). Fallible operations get a bailout snapshot.
-+void LIRGeneratorPPC64::lowerUMod(MMod* mod) {
-+  MDefinition* lhs = mod->getOperand(0);
-+  MDefinition* rhs = mod->getOperand(1);
-+  LUDivOrMod* lir = new (alloc()) LUDivOrMod;
-+  // See lowerUDiv above for why useRegisterAtStart is required here.
-+  lir->setOperand(0, useRegisterAtStart(lhs));
-+  lir->setOperand(1, useRegisterAtStart(rhs));
-+  if (mod->fallible()) {
-+    assignSnapshot(lir, mod->bailoutKind());
-+  }
-+  define(lir, mod);
-+}
-+
-+// Lowers an unsigned 64-bit modulus. It shares the LUDivOrModI64 node with
-+// lowerUDivI64; both operands are in registers.
-+void LIRGeneratorPPC64::lowerUModI64(MMod* mod) {
-+  auto* lir = new (alloc())
-+      LUDivOrModI64(useRegister(mod->lhs()), useRegister(mod->rhs()));
-+  defineInt64(lir, mod);
-+}
-+
-+// Lowers an unsigned right shift to LUrshD (the double-result form, going
-+// by the LIR node name). Both operands must be Int32; lhs is a register,
-+// rhs a register or constant, both used at-start, plus one general temp.
-+void LIRGeneratorPPC64::lowerUrshD(MUrsh* mir) {
-+  MDefinition* lhs = mir->lhs();
-+  MDefinition* rhs = mir->rhs();
-+  MOZ_ASSERT(lhs->type() == MIRType::Int32);
-+  MOZ_ASSERT(rhs->type() == MIRType::Int32);
-+  auto* lir = new (alloc()) LUrshD(useRegisterAtStart(lhs),
-+                                   useRegisterOrConstantAtStart(rhs), temp());
-+  define(lir, mir);
-+}
-+
-+// Lowers a pow() whose base is a constant: the base is read out of the
-+// constant input and baked into LPowOfTwoI, while the exponent is taken in a
-+// register. A bailout snapshot is always assigned.
-+void LIRGeneratorPPC64::lowerPowOfTwoI(MPow* mir) {
-+  int32_t base = mir->input()->toConstant()->toInt32();
-+  MDefinition* power = mir->power();
-+  auto* lir = new (alloc()) LPowOfTwoI(useRegister(power), base);
-+  assignSnapshot(lir, mir->bailoutKind());
-+  define(lir, mir);
-+}
-+
-+// Lowers a pointer-sized BigInt division. Both operands are in registers and
-+// both temp slots are left as BogusTemp. A bailout snapshot is always
-+// assigned.
-+void LIRGeneratorPPC64::lowerBigIntPtrDiv(MBigIntPtrDiv* ins) {
-+  auto* lir = new (alloc())
-+      LBigIntPtrDiv(useRegister(ins->lhs()), useRegister(ins->rhs()),
-+                    LDefinition::BogusTemp(), LDefinition::BogusTemp());
-+  assignSnapshot(lir, ins->bailoutKind());
-+  define(lir, ins);
-+}
-+
-+// Lowers a pointer-sized BigInt modulus. Both operands are in registers; the
-+// first temp slot is a real temp and the second is BogusTemp. Unlike
-+// lowerBigIntPtrDiv, the snapshot is only assigned when division by zero is
-+// possible.
-+void LIRGeneratorPPC64::lowerBigIntPtrMod(MBigIntPtrMod* ins) {
-+  auto* lir = new (alloc())
-+      LBigIntPtrMod(useRegister(ins->lhs()), useRegister(ins->rhs()), temp(),
-+                    LDefinition::BogusTemp());
-+  if (ins->canBeDivideByZero()) {
-+    assignSnapshot(lir, ins->bailoutKind());
-+  }
-+  define(lir, ins);
-+}
-+
-+// Lowers a pointer-sized BigInt left shift: both operands in registers, two
-+// general temps, and a bailout snapshot that is always assigned.
-+void LIRGeneratorPPC64::lowerBigIntPtrLsh(MBigIntPtrLsh* ins) {
-+  auto* lir = new (alloc()) LBigIntPtrLsh(
-+      useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), temp());
-+  assignSnapshot(lir, ins->bailoutKind());
-+  define(lir, ins);
-+}
-+
-+// Lowers a pointer-sized BigInt right shift: both operands in registers, two
-+// general temps, and a bailout snapshot that is always assigned.
-+void LIRGeneratorPPC64::lowerBigIntPtrRsh(MBigIntPtrRsh* ins) {
-+  auto* lir = new (alloc()) LBigIntPtrRsh(
-+      useRegister(ins->lhs()), useRegister(ins->rhs()), temp(), temp());
-+  assignSnapshot(lir, ins->bailoutKind());
-+  define(lir, ins);
-+}
-+
-+// Lowers truncation of a Double input to Int32; the LIR node gets the input
-+// in a register plus one double temp.
-+void LIRGeneratorPPC64::lowerTruncateDToInt32(MTruncateToInt32* ins) {
-+  MDefinition* opd = ins->input();
-+  MOZ_ASSERT(opd->type() == MIRType::Double);
-+  define(new (alloc()) LTruncateDToInt32(useRegister(opd), tempDouble()), ins);
-+}
-+
-+// Lowers truncation of a Float32 input to Int32; the LIR node gets the input
-+// in a register plus one float32 temp.
-+void LIRGeneratorPPC64::lowerTruncateFToInt32(MTruncateToInt32* ins) {
-+  MDefinition* opd = ins->input();
-+  MOZ_ASSERT(opd->type() == MIRType::Float32);
-+  define(new (alloc()) LTruncateFToInt32(useRegister(opd), tempFloat32()), ins);
-+}
-+
-+// Not expected to be reached on this architecture: crashes unconditionally.
-+void LIRGeneratorPPC64::lowerBuiltinInt64ToFloatingPoint(
-+    MBuiltinInt64ToFloatingPoint* ins) {
-+  MOZ_CRASH("We don't use it for this architecture");
-+}
-+
-+// Lowers a non-int64 wasm select. The true expression is used at-start in a
-+// register and the output reuses its allocation; the false expression may
-+// live anywhere (useAny); the condition is in a register.
-+void LIRGeneratorPPC64::lowerWasmSelectI(MWasmSelect* select) {
-+  auto* lir = new (alloc())
-+      LWasmSelect(useRegisterAtStart(select->trueExpr()),
-+                  useAny(select->falseExpr()), useRegister(select->condExpr()));
-+  defineReuseInput(lir, select, LWasmSelect::TrueExprIndex);
-+}
-+
-+// Int64 counterpart of lowerWasmSelectI: the output reuses the allocation of
-+// the true expression, which is used at-start in a register.
-+void LIRGeneratorPPC64::lowerWasmSelectI64(MWasmSelect* select) {
-+  auto* lir = new (alloc()) LWasmSelectI64(
-+      useInt64RegisterAtStart(select->trueExpr()),
-+      useInt64(select->falseExpr()), useRegister(select->condExpr()));
-+  defineInt64ReuseInput(lir, select, LWasmSelectI64::TrueExprIndex);
-+}
-+
-+// Lowers the wasm builtin truncate-to-int32 for a Double or Float32 input.
-+// Both variants take the input in a register, pin the instance operand to
-+// InstanceReg, and leave the temp slot as BogusTemp.
-+void LIRGeneratorPPC64::lowerWasmBuiltinTruncateToInt32(
-+    MWasmBuiltinTruncateToInt32* ins) {
-+  MDefinition* opd = ins->input();
-+  MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
-+
-+  if (opd->type() == MIRType::Double) {
-+    define(new (alloc()) LWasmBuiltinTruncateDToInt32(
-+               useRegister(opd), useFixed(ins->instance(), InstanceReg),
-+               LDefinition::BogusTemp()),
-+           ins);
-+    return;
-+  }
-+
-+  // Float32 input (the only other type allowed by the assertion above).
-+  define(new (alloc()) LWasmBuiltinTruncateFToInt32(
-+             useRegister(opd), useFixed(ins->instance(), InstanceReg),
-+             LDefinition::BogusTemp()),
-+         ins);
-+}
-+
-+// Not expected to be reached on this architecture: crashes unconditionally.
-+void LIRGeneratorPPC64::lowerWasmBuiltinTruncateToInt64(
-+    MWasmBuiltinTruncateToInt64* ins) {
-+  MOZ_CRASH("We don't use it for this architecture");
-+}
-+
-+// The runtime-call int64 division is not used on this architecture; crashes
-+// unconditionally.
-+void LIRGeneratorPPC64::lowerWasmBuiltinDivI64(MWasmBuiltinDivI64* div) {
-+  MOZ_CRASH("We don't use runtime div for this architecture");
-+}
-+
-+// The runtime-call int64 modulus is not used on this architecture; crashes
-+// unconditionally.
-+void LIRGeneratorPPC64::lowerWasmBuiltinModI64(MWasmBuiltinModI64* mod) {
-+  MOZ_CRASH("We don't use runtime mod for this architecture");
-+}
-+
-+// Lowers a 64-bit atomic load from unboxed scalar storage. The elements
-+// pointer is in a register; the index is a register or an index constant
-+// chosen with respect to the storage type. The result is defined as int64.
-+void LIRGeneratorPPC64::lowerAtomicLoad64(MLoadUnboxedScalar* ins) {
-+  const LUse elements = useRegister(ins->elements());
-+  const LAllocation index =
-+      useRegisterOrIndexConstant(ins->index(), ins->storageType());
-+  auto* lir = new (alloc()) LAtomicLoad64(elements, index);
-+  defineInt64(lir, ins);
-+}
-+
-+// Lowers a 64-bit atomic store to unboxed scalar storage. The elements
-+// pointer and value are in registers; the index is a register or an index
-+// constant chosen with respect to the write type. The instruction produces
-+// no result, so it is added rather than defined.
-+void LIRGeneratorPPC64::lowerAtomicStore64(MStoreUnboxedScalar* ins) {
-+  LUse elements = useRegister(ins->elements());
-+  LAllocation index =
-+      useRegisterOrIndexConstant(ins->index(), ins->writeType());
-+  LInt64Allocation value = useInt64Register(ins->value());
-+  add(new (alloc()) LAtomicStore64(elements, index, value), ins);
-+}
-+
-+// ===============================================================
-+// LIRGenerator::visit* implementations
-+
-+// Lowers MBox. Three cases:
-+//   * constant operand that may be emitted at its uses -> deferred via
-+//     emitAtUses(), nothing is generated here;
-+//   * any other constant operand -> an LValue holding the boxed constant;
-+//   * non-constant operand -> an LBox with the operand used at-start.
-+// Both generated forms define a single BOX-typed output.
-+void LIRGenerator::visitBox(MBox* box) {
-+  MDefinition* opd = box->getOperand(0);
-+
-+  if (opd->isConstant() && box->canEmitAtUses()) {
-+    emitAtUses(box);
-+    return;
-+  }
-+
-+  if (opd->isConstant()) {
-+    define(new (alloc()) LValue(opd->toConstant()->toJSValue()), box,
-+           LDefinition(LDefinition::BOX));
-+  } else {
-+    LBox* ins = new (alloc()) LBox(useRegisterAtStart(opd), opd->type());
-+    define(ins, box, LDefinition(LDefinition::BOX));
-+  }
-+}
-+
-+// Lowers MUnbox of a Value-typed operand.
-+//   * floating-point result (must be Double) -> LUnboxFloatingPoint;
-+//   * fallible unbox  -> LUnbox with the box required in a register;
-+//   * infallible unbox -> LUnbox with the box allowed in any location.
-+// A bailout snapshot is attached whenever the unbox is fallible.
-+void LIRGenerator::visitUnbox(MUnbox* unbox) {
-+  MDefinition* box = unbox->getOperand(0);
-+  MOZ_ASSERT(box->type() == MIRType::Value);
-+
-+  LInstructionHelper<1, BOX_PIECES, 0>* lir;
-+  if (IsFloatingPointType(unbox->type())) {
-+    MOZ_ASSERT(unbox->type() == MIRType::Double);
-+    lir = new (alloc()) LUnboxFloatingPoint(useBoxAtStart(box));
-+  } else if (unbox->fallible()) {
-+    lir = new (alloc()) LUnbox(useRegisterAtStart(box));
-+  } else {
-+    lir = new (alloc()) LUnbox(useAtStart(box));
-+  }
-+
-+  if (unbox->fallible()) {
-+    assignSnapshot(lir, unbox->bailoutKind());
-+  }
-+
-+  define(lir, unbox);
-+}
-+
-+// Lowers MCopySign. Both operands and the result must share one
-+// floating-point type; Double selects LCopySignD, anything else LCopySignF.
-+// Operand allocation is delegated to the two-input lowerForFPU.
-+void LIRGenerator::visitCopySign(MCopySign* ins) {
-+  MDefinition* lhs = ins->lhs();
-+  MDefinition* rhs = ins->rhs();
-+
-+  MOZ_ASSERT(IsFloatingPointType(lhs->type()));
-+  MOZ_ASSERT(lhs->type() == rhs->type());
-+  MOZ_ASSERT(lhs->type() == ins->type());
-+
-+  LInstructionHelper<1, 2, 0>* lir;
-+  if (lhs->type() == MIRType::Double) {
-+    lir = new (alloc()) LCopySignD();
-+  } else {
-+    lir = new (alloc()) LCopySignF();
-+  }
-+
-+  lowerForFPU(lir, ins, lhs, rhs);
-+}
-+
-+// Lowers an int32 -> int64 extension; the input is used at-start in a
-+// register and the result is defined as int64.
-+void LIRGenerator::visitExtendInt32ToInt64(MExtendInt32ToInt64* ins) {
-+  defineInt64(
-+      new (alloc()) LExtendInt32ToInt64(useRegisterAtStart(ins->input())), ins);
-+}
-+
-+// Lowers an in-place int64 sign extension; the int64 input is used at-start
-+// in a register and the result is defined as int64.
-+void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
-+  defineInt64(new (alloc())
-+                  LSignExtendInt64(useInt64RegisterAtStart(ins->input())),
-+              ins);
-+}
-+
-+// Lowers an int64 -> floating-point conversion. The input must be Int64 and
-+// the result a floating-point type; the input is taken in a register.
-+void LIRGenerator::visitInt64ToFloatingPoint(MInt64ToFloatingPoint* ins) {
-+  MDefinition* opd = ins->input();
-+  MOZ_ASSERT(opd->type() == MIRType::Int64);
-+  MOZ_ASSERT(IsFloatingPointType(ins->type()));
-+  define(new (alloc()) LInt64ToFloatingPoint(useInt64Register(opd)), ins);
-+}
-+
-+// Lowers MSubstr: string, begin and length are in registers and three
-+// general temps are reserved. A safepoint is assigned after the definition.
-+void LIRGenerator::visitSubstr(MSubstr* ins) {
-+  LSubstr* lir = new (alloc())
-+      LSubstr(useRegister(ins->string()), useRegister(ins->begin()),
-+              useRegister(ins->length()), temp(), temp(), temp());
-+  define(lir, ins);
-+  assignSafepoint(lir, ins);
-+}
-+
-+// Lowers a return of the Value-typed |opd|: the operand is pinned to
-+// JSReturnReg and the LReturn (tagged with |isGenerator|) is added with no
-+// output definition.
-+void LIRGenerator::visitReturnImpl(MDefinition* opd, bool isGenerator) {
-+  MOZ_ASSERT(opd->type() == MIRType::Value);
-+  LReturn* ins = new (alloc()) LReturn(isGenerator);
-+  ins->setOperand(0, useFixed(opd, JSReturnReg));
-+  add(ins);
-+}
-+// Lowers a typed-array compare-exchange.
-+//   * BigInt element types use the 64-bit LIR node with int64 old/new values
-+//     and an int64 result.
-+//   * All other (non-floating) element types use the 32-bit node, whose four
-+//     temp slots default to BogusTemp and are filled in only as needed below.
-+void LIRGenerator::visitCompareExchangeTypedArrayElement(
-+    MCompareExchangeTypedArrayElement* ins) {
-+  MOZ_ASSERT(!Scalar::isFloatingType(ins->arrayType()));
-+  MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
-+  MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
-+
-+  const LUse elements = useRegister(ins->elements());
-+  const LAllocation index =
-+      useRegisterOrIndexConstant(ins->index(), ins->arrayType());
-+
-+  if (Scalar::isBigIntType(ins->arrayType())) {
-+    LInt64Allocation oldval = useInt64Register(ins->oldval());
-+    LInt64Allocation newval = useInt64Register(ins->newval());
-+
-+    auto* lir = new (alloc())
-+        LCompareExchangeTypedArrayElement64(elements, index, oldval, newval);
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  const LAllocation oldval = useRegister(ins->oldval());
-+  const LAllocation newval = useRegister(ins->newval());
-+
-+  LDefinition outTemp = LDefinition::BogusTemp();
-+  LDefinition valueTemp = LDefinition::BogusTemp();
-+  LDefinition offsetTemp = LDefinition::BogusTemp();
-+  LDefinition maskTemp = LDefinition::BogusTemp();
-+
-+  // A Uint32 element with a floating-point result type gets a general
-+  // outTemp. NOTE(review): presumably to hold the raw integer before it is
-+  // converted to the FP output -- confirm against the code generator.
-+  if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
-+    outTemp = temp();
-+  }
-+
-+  if (Scalar::byteSize(ins->arrayType()) < 4) {
-+    // PPC64 sub-word CAS uses lbarx/lharx + stbcx./sthcx. (POWER7+); only
-+    // valueTemp is needed, to hold the extsb/extsh-canonicalised oldval
-+    // for the 32-bit cmpw. offsetTemp/maskTemp are unused (no round-down
-+    // + bit-isolate dance), and remain BogusTemp.
-+    valueTemp = temp();
-+  }
-+
-+  LCompareExchangeTypedArrayElement* lir = new (alloc())
-+      LCompareExchangeTypedArrayElement(elements, index, oldval, newval,
-+                                        outTemp, valueTemp, offsetTemp,
-+                                        maskTemp);
-+
-+  define(lir, ins);
-+}
-+
-+// Lowers a typed-array atomic exchange.
-+//   * BigInt element types use the 64-bit LIR node with an int64 value and
-+//     an int64 result.
-+//   * Element types up to Scalar::Uint32 use the 32-bit node; only outTemp
-+//     is ever a real temp, the other three stay BogusTemp.
-+void LIRGenerator::visitAtomicExchangeTypedArrayElement(
-+    MAtomicExchangeTypedArrayElement* ins) {
-+  MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
-+  MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
-+
-+  const LUse elements = useRegister(ins->elements());
-+  const LAllocation index =
-+      useRegisterOrIndexConstant(ins->index(), ins->arrayType());
-+
-+  if (Scalar::isBigIntType(ins->arrayType())) {
-+    LInt64Allocation value = useInt64Register(ins->value());
-+
-+    auto* lir = new (alloc())
-+        LAtomicExchangeTypedArrayElement64(elements, index, value);
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  MOZ_ASSERT(ins->arrayType() <= Scalar::Uint32);
-+
-+  const LAllocation value = useRegister(ins->value());
-+
-+  LDefinition outTemp = LDefinition::BogusTemp();
-+  LDefinition valueTemp = LDefinition::BogusTemp();
-+  LDefinition offsetTemp = LDefinition::BogusTemp();
-+  LDefinition maskTemp = LDefinition::BogusTemp();
-+
-+  // Uint32 elements are asserted to produce a Double result and get a
-+  // general outTemp.
-+  if (ins->arrayType() == Scalar::Uint32) {
-+    MOZ_ASSERT(ins->type() == MIRType::Double);
-+    outTemp = temp();
-+  }
-+
-+  // PPC64 sub-word atomic exchange uses lbarx/lharx + stbcx./sthcx. directly
-+  // (POWER7+); valueTemp/offsetTemp/maskTemp are never read by the
-+  // implementation (see MacroAssembler-ppc64.cpp's AtomicExchange template).
-+  // Leave them as BogusTemp.
-+
-+  LAtomicExchangeTypedArrayElement* lir =
-+      new (alloc()) LAtomicExchangeTypedArrayElement(
-+          elements, index, value, outTemp, valueTemp, offsetTemp, maskTemp);
-+
-+  define(lir, ins);
-+}
-+
-+// Lowers a typed-array atomic read-modify-write. There are four outcomes,
-+// keyed on element width and on whether the result is used:
-+//   * BigInt, for effect  -> LAtomicTypedArrayElementBinopForEffect64 (add)
-+//   * BigInt, with result -> LAtomicTypedArrayElementBinop64 (defineInt64)
-+//   * other, for effect   -> LAtomicTypedArrayElementBinopForEffect (add)
-+//   * other, with result  -> LAtomicTypedArrayElementBinop (define)
-+// Uint8Clamped and floating element types are rejected by assertion.
-+void LIRGenerator::visitAtomicTypedArrayElementBinop(
-+    MAtomicTypedArrayElementBinop* ins) {
-+  MOZ_ASSERT(ins->arrayType() != Scalar::Uint8Clamped);
-+  MOZ_ASSERT(!Scalar::isFloatingType(ins->arrayType()));
-+  MOZ_ASSERT(ins->elements()->type() == MIRType::Elements);
-+  MOZ_ASSERT(ins->index()->type() == MIRType::IntPtr);
-+
-+  const LUse elements = useRegister(ins->elements());
-+  const LAllocation index =
-+      useRegisterOrIndexConstant(ins->index(), ins->arrayType());
-+
-+  if (Scalar::isBigIntType(ins->arrayType())) {
-+    LInt64Allocation value = useInt64Register(ins->value());
-+    // Note: this local shadows the temp() helper for the rest of this scope.
-+    LInt64Definition temp = tempInt64();
-+
-+    if (ins->isForEffect()) {
-+      auto* lir = new (alloc()) LAtomicTypedArrayElementBinopForEffect64(
-+          elements, index, value, temp);
-+      add(lir, ins);
-+      return;
-+    }
-+
-+    auto* lir = new (alloc())
-+        LAtomicTypedArrayElementBinop64(elements, index, value, temp);
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  LAllocation value = useRegister(ins->value());
-+  LDefinition valueTemp = LDefinition::BogusTemp();
-+  LDefinition offsetTemp = LDefinition::BogusTemp();
-+  LDefinition maskTemp = LDefinition::BogusTemp();
-+
-+  // PPC64 sub-word atomic-binop uses lbarx/lharx + stbcx./sthcx. (POWER7+).
-+  // The fetch-op variant needs valueTemp to hold the post-op value being
-+  // condition-stored (MacroAssembler-ppc64.cpp's AtomicFetchOp); the
-+  // for-effect variant uses an internal scratch and needs no temps at
-+  // all. offsetTemp/maskTemp are unused in either path.
-+  if (Scalar::byteSize(ins->arrayType()) < 4 && !ins->isForEffect()) {
-+    valueTemp = temp();
-+  }
-+
-+  if (ins->isForEffect()) {
-+    LAtomicTypedArrayElementBinopForEffect* lir =
-+        new (alloc()) LAtomicTypedArrayElementBinopForEffect(
-+            elements, index, value, valueTemp, offsetTemp, maskTemp);
-+    add(lir, ins);
-+    return;
-+  }
-+
-+  LDefinition outTemp = LDefinition::BogusTemp();
-+
-+  // A Uint32 element with a floating-point result type gets a general
-+  // outTemp; every other combination leaves it as BogusTemp.
-+  if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
-+    outTemp = temp();
-+  }
-+
-+  LAtomicTypedArrayElementBinop* lir =
-+      new (alloc()) LAtomicTypedArrayElementBinop(
-+          elements, index, value, outTemp, valueTemp, offsetTemp, maskTemp);
-+  define(lir, ins);
-+}
-+// Lowers an asm.js heap load. The Int32 base is used at-start in a register;
-+// the bounds-check limit is only allocated (also at-start) when a bounds
-+// check is needed, otherwise an empty LAllocation is passed. A memory-base
-+// operand is asserted absent, so that slot is always an empty LAllocation.
-+void LIRGenerator::visitAsmJSLoadHeap(MAsmJSLoadHeap* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32);
-+
-+  MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
-+  MOZ_ASSERT_IF(ins->needsBoundsCheck(),
-+                boundsCheckLimit->type() == MIRType::Int32);
-+
-+  LAllocation baseAlloc = useRegisterAtStart(base);
-+
-+  LAllocation limitAlloc = ins->needsBoundsCheck()
-+                               ? useRegisterAtStart(boundsCheckLimit)
-+                               : LAllocation();
-+
-+  MOZ_ASSERT(!ins->hasMemoryBase());
-+  auto* lir =
-+      new (alloc()) LAsmJSLoadHeap(baseAlloc, limitAlloc, LAllocation());
-+  define(lir, ins);
-+}
-+// Lowers an asm.js heap store. Mirrors visitAsmJSLoadHeap: Int32 base and
-+// stored value are used at-start in registers, the bounds-check limit is
-+// allocated only when needed, and the memory-base slot is always empty. The
-+// store has no result, so it is added rather than defined.
-+void LIRGenerator::visitAsmJSStoreHeap(MAsmJSStoreHeap* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32);
-+
-+  MDefinition* boundsCheckLimit = ins->boundsCheckLimit();
-+  MOZ_ASSERT_IF(ins->needsBoundsCheck(),
-+                boundsCheckLimit->type() == MIRType::Int32);
-+
-+  LAllocation baseAlloc = useRegisterAtStart(base);
-+
-+  LAllocation limitAlloc = ins->needsBoundsCheck()
-+                               ? useRegisterAtStart(boundsCheckLimit)
-+                               : LAllocation();
-+
-+  MOZ_ASSERT(!ins->hasMemoryBase());
-+  add(new (alloc()) LAsmJSStoreHeap(baseAlloc, useRegisterAtStart(ins->value()),
-+                                    limitAlloc, LAllocation()),
-+      ins);
-+}
-+// Lowers a wasm memory load. The base pointer (Int32 or Int64) is used
-+// at-start in a register. Int64 results use LWasmLoadI64, everything else
-+// LWasmLoad.
-+void LIRGenerator::visitWasmLoad(MWasmLoad* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
-+
-+  // Use the explicit memory-base operand when the MIR node carries one;
-+  // otherwise fall back to the fixed HeapReg.
-+  LAllocation memoryBase =
-+      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
-+                           : LGeneralReg(HeapReg);
-+
-+  LAllocation ptr = useRegisterAtStart(base);
-+
-+  // A copy of the base is reserved only when the access has a non-zero
-+  // 32-bit offset. NOTE(review): presumably so the offset can be added
-+  // without clobbering the original pointer -- confirm in the visitor.
-+  LDefinition ptrCopy = LDefinition::BogusTemp();
-+  if (ins->access().offset32()) {
-+    ptrCopy = tempCopy(base, 0);
-+  }
-+
-+  if (ins->type() == MIRType::Int64) {
-+    auto* lir = new (alloc()) LWasmLoadI64(ptr, memoryBase, ptrCopy);
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  auto* lir = new (alloc()) LWasmLoad(ptr, memoryBase, ptrCopy);
-+  define(lir, ins);
-+}
-+// Lowers a wasm memory store. The base pointer (Int32 or Int64) and the
-+// value are used at-start in registers. Accesses of type Scalar::Int64 use
-+// LWasmStoreI64, everything else LWasmStore. Stores have no result, so they
-+// are added rather than defined.
-+void LIRGenerator::visitWasmStore(MWasmStore* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
-+
-+  MDefinition* value = ins->value();
-+  // Use the explicit memory-base operand when the MIR node carries one;
-+  // otherwise fall back to the fixed HeapReg.
-+  LAllocation memoryBase =
-+      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
-+                           : LGeneralReg(HeapReg);
-+
-+  LAllocation baseAlloc = useRegisterAtStart(base);
-+
-+  // A copy of the base is reserved only when the access has a non-zero
-+  // 32-bit offset (same policy as visitWasmLoad).
-+  LDefinition ptrCopy = LDefinition::BogusTemp();
-+  if (ins->access().offset32()) {
-+    ptrCopy = tempCopy(base, 0);
-+  }
-+
-+  if (ins->access().type() == Scalar::Int64) {
-+    LInt64Allocation valueAlloc = useInt64RegisterAtStart(value);
-+    auto* lir =
-+        new (alloc()) LWasmStoreI64(baseAlloc, valueAlloc, memoryBase, ptrCopy);
-+    add(lir, ins);
-+    return;
-+  }
-+
-+  LAllocation valueAlloc = useRegisterAtStart(value);
-+  auto* lir =
-+      new (alloc()) LWasmStore(baseAlloc, valueAlloc, memoryBase, ptrCopy);
-+  add(lir, ins);
-+}
-+// Lowers a wasm floating-point -> int64 truncation. The input must be Double
-+// or Float32 and is taken in a register; no temps are reserved.
-+void LIRGenerator::visitWasmTruncateToInt64(MWasmTruncateToInt64* ins) {
-+  MDefinition* opd = ins->input();
-+  MOZ_ASSERT(opd->type() == MIRType::Double || opd->type() == MIRType::Float32);
-+
-+  defineInt64(new (alloc()) LWasmTruncateToInt64(useRegister(opd)), ins);
-+}
-+// Lowers a wasm uint32 -> double conversion; the Int32-typed input is used
-+// at-start in a register.
-+void LIRGenerator::visitWasmUnsignedToDouble(MWasmUnsignedToDouble* ins) {
-+  MOZ_ASSERT(ins->input()->type() == MIRType::Int32);
-+  LWasmUint32ToDouble* lir =
-+      new (alloc()) LWasmUint32ToDouble(useRegisterAtStart(ins->input()));
-+  define(lir, ins);
-+}
-+// Lowers a wasm uint32 -> float32 conversion; the Int32-typed input is used
-+// at-start in a register.
-+void LIRGenerator::visitWasmUnsignedToFloat32(MWasmUnsignedToFloat32* ins) {
-+  MOZ_ASSERT(ins->input()->type() == MIRType::Int32);
-+  LWasmUint32ToFloat32* lir =
-+      new (alloc()) LWasmUint32ToFloat32(useRegisterAtStart(ins->input()));
-+  define(lir, ins);
-+}
-+// Lowers a wasm heap compare-exchange. Accesses of type Scalar::Int64 use
-+// LWasmCompareExchangeI64 with int64 operands; all other widths use
-+// LWasmCompareExchangeHeap. All operands use useRegister (not at-start).
-+void LIRGenerator::visitWasmCompareExchangeHeap(MWasmCompareExchangeHeap* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
-+  // Explicit memory-base operand when present, otherwise the fixed HeapReg.
-+  LAllocation memoryBase = ins->hasMemoryBase()
-+                               ? LAllocation(useRegister(ins->memoryBase()))
-+                               : LGeneralReg(HeapReg);
-+
-+  if (ins->access().type() == Scalar::Int64) {
-+    auto* lir = new (alloc()) LWasmCompareExchangeI64(
-+        useRegister(base), useInt64Register(ins->oldValue()),
-+        useInt64Register(ins->newValue()), memoryBase);
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  LDefinition valueTemp = LDefinition::BogusTemp();
-+  LDefinition offsetTemp = LDefinition::BogusTemp();
-+  LDefinition maskTemp = LDefinition::BogusTemp();
-+
-+  // PPC64 sub-word wasm CAS uses lbarx/lharx + stbcx./sthcx. (POWER7+);
-+  // valueTemp holds the extsb/extsh-canonicalised oldval for cmpw, while
-+  // offsetTemp/maskTemp are unused (no round-down + bit-isolate dance).
-+  if (ins->access().byteSize() < 4) {
-+    valueTemp = temp();
-+  }
-+
-+  auto* lir = new (alloc())
-+      LWasmCompareExchangeHeap(useRegister(base), useRegister(ins->oldValue()),
-+                               useRegister(ins->newValue()), memoryBase,
-+                               valueTemp, offsetTemp, maskTemp);
-+
-+  define(lir, ins);
-+}
-+// Lowers a wasm heap atomic exchange. Accesses of type Scalar::Int64 use
-+// LWasmAtomicExchangeI64; all other widths use LWasmAtomicExchangeHeap with
-+// every temp slot left as BogusTemp.
-+void LIRGenerator::visitWasmAtomicExchangeHeap(MWasmAtomicExchangeHeap* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
-+  // Explicit memory-base operand when present, otherwise the fixed HeapReg.
-+  LAllocation memoryBase = ins->hasMemoryBase()
-+                               ? LAllocation(useRegister(ins->memoryBase()))
-+                               : LGeneralReg(HeapReg);
-+
-+  if (ins->access().type() == Scalar::Int64) {
-+    auto* lir = new (alloc()) LWasmAtomicExchangeI64(
-+        useRegister(base), useInt64Register(ins->value()), memoryBase);
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  // PPC64 sub-word wasm atomic exchange uses lbarx/lharx + stbcx./sthcx.
-+  // (POWER7+); valueTemp/offsetTemp/maskTemp are never read by the
-+  // implementation (see MacroAssembler-ppc64.cpp's AtomicExchange template).
-+  // Pass BogusTemp for all three.
-+  LDefinition valueTemp = LDefinition::BogusTemp();
-+  LDefinition offsetTemp = LDefinition::BogusTemp();
-+  LDefinition maskTemp = LDefinition::BogusTemp();
-+
-+  auto* lir = new (alloc())
-+      LWasmAtomicExchangeHeap(useRegister(base), useRegister(ins->value()),
-+                              memoryBase, valueTemp, offsetTemp, maskTemp);
-+  define(lir, ins);
-+}
-+// Lowers a wasm heap atomic read-modify-write.
-+//   * Scalar::Int64 accesses -> LWasmAtomicBinopI64 with an int64 temp; this
-+//     path always defines a result, whether or not the node has uses.
-+//   * other widths, no uses  -> LWasmAtomicBinopHeapForEffect (add)
-+//   * other widths, has uses -> LWasmAtomicBinopHeap (define)
-+void LIRGenerator::visitWasmAtomicBinopHeap(MWasmAtomicBinopHeap* ins) {
-+  MDefinition* base = ins->base();
-+  MOZ_ASSERT(base->type() == MIRType::Int32 || base->type() == MIRType::Int64);
-+  // Explicit memory-base operand when present, otherwise the fixed HeapReg.
-+  LAllocation memoryBase = ins->hasMemoryBase()
-+                               ? LAllocation(useRegister(ins->memoryBase()))
-+                               : LGeneralReg(HeapReg);
-+
-+  if (ins->access().type() == Scalar::Int64) {
-+    auto* lir = new (alloc())
-+        LWasmAtomicBinopI64(useRegister(base), useInt64Register(ins->value()),
-+                            memoryBase, tempInt64());
-+    defineInt64(lir, ins);
-+    return;
-+  }
-+
-+  LDefinition valueTemp = LDefinition::BogusTemp();
-+  LDefinition offsetTemp = LDefinition::BogusTemp();
-+  LDefinition maskTemp = LDefinition::BogusTemp();
-+
-+  // PPC64 sub-word wasm atomic-binop uses lbarx/lharx + stbcx./sthcx.
-+  // (POWER7+). The fetch-op variant needs valueTemp for the post-op value
-+  // being condition-stored; the for-effect variant uses an internal
-+  // scratch and needs no temps at all. offsetTemp/maskTemp are unused
-+  // in either path.
-+  if (ins->access().byteSize() < 4 && ins->hasUses()) {
-+    valueTemp = temp();
-+  }
-+
-+  if (!ins->hasUses()) {
-+    LWasmAtomicBinopHeapForEffect* lir = new (alloc())
-+        LWasmAtomicBinopHeapForEffect(useRegister(base),
-+                                      useRegister(ins->value()), memoryBase,
-+                                      valueTemp, offsetTemp, maskTemp);
-+    add(lir, ins);
-+    return;
-+  }
-+
-+  auto* lir = new (alloc())
-+      LWasmAtomicBinopHeap(useRegister(base), useRegister(ins->value()),
-+                           memoryBase, valueTemp, offsetTemp, maskTemp);
-+
-+  define(lir, ins);
-+}
-+
-+// SIMD lowering
-+// Lowers a three-operand wasm SIMD instruction. The output reuses v2's
-+// allocation; one SIMD temp is reserved only for
-+// I32x4RelaxedDotI8x16I7x16AddS. Crashes when built without
-+// ENABLE_WASM_SIMD.
-+void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  // useRegister for v0/v1 and useRegisterAtStart only for v2 — matches
-+  // ARM64's V128Bitselect policy. defineReuseInput requires the reused
-+  // input to be useRegisterAtStart and the others to remain alive
-+  // (useRegister); reusing all three policies as useRegisterAtStart
-+  // trips the allocator's "*def->output() != alloc" assertion because
-+  // v0/v1 may then share the slot with the output.
-+  LDefinition temp0 = LDefinition::BogusTemp();
-+  if (ins->simdOp() == wasm::SimdOp::I32x4RelaxedDotI8x16I7x16AddS) {
-+    temp0 = tempSimd128();
-+  }
-+  auto* lir = new (alloc()) LWasmTernarySimd128(
-+      useRegister(ins->v0()), useRegister(ins->v1()),
-+      useRegisterAtStart(ins->v2()), temp0,
-+      ins->simdOp());
-+  // The PPC64 visitor (CodeGenerator-ppc64.cpp:visitWasmTernarySimd128)
-+  // emits the FMA / DOT_THEN_ADD chain with v2 as the implicit
-+  // accumulator. defineReuseInput tells the allocator to put `dest`
-+  // in v2's slot, eliminating the previous conditional moveSimd128.
-+  defineReuseInput(lir, ins, LWasmTernarySimd128::V2Index);
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+// Lowers a two-operand wasm SIMD instruction. Both inputs are used at-start
-+// in registers; two SIMD temps are reserved only for the four FP min/max
-+// operations listed below. Crashes when built without ENABLE_WASM_SIMD.
-+void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  LDefinition temp0 = LDefinition::BogusTemp();
-+  LDefinition temp1 = LDefinition::BogusTemp();
-+  // mulInt64x2 (i64x2.mul) routes through GPRs (mfvsrd/mulld/mtvsrd) and
-+  // uses an internal ScratchSimd128 + GPR scratches; its FloatRegister
-+  // temp1/temp2 parameters are inherited from the shared ARM64+PPC64
-+  // signature but unused on PPC64. Only FP min/max need SIMD temps for
-+  // the wasm NaN-canonicalisation dance.
-+  if (ins->simdOp() == wasm::SimdOp::F32x4Min ||
-+      ins->simdOp() == wasm::SimdOp::F32x4Max ||
-+      ins->simdOp() == wasm::SimdOp::F64x2Min ||
-+      ins->simdOp() == wasm::SimdOp::F64x2Max) {
-+    temp0 = tempSimd128();
-+    temp1 = tempSimd128();
-+  }
-+  auto* lir = new (alloc()) LWasmBinarySimd128(
-+      useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs()),
-+      temp0, temp1, ins->simdOp());
-+  define(lir, ins);
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+// Lowers a wasm SIMD binary instruction whose right-hand side is a constant
-+// carried by the MIR node: lhs is used at-start in a register, the temp slot
-+// is BogusTemp, and rhs is passed through to the LIR node. Crashes when
-+// built without ENABLE_WASM_SIMD.
-+void LIRGenerator::visitWasmBinarySimd128WithConstant(
-+    MWasmBinarySimd128WithConstant* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  auto* lir = new (alloc()) LWasmBinarySimd128WithConstant(
-+      useRegisterAtStart(ins->lhs()), LDefinition::BogusTemp(), ins->rhs());
-+  define(lir, ins);
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+// Lowers a wasm SIMD shift. A constant shift count is masked to the lane
-+// width (7/15/31/63 for 8/16/32/64-bit lanes) and baked into
-+// LWasmConstantShiftSimd128; a non-constant count becomes
-+// LWasmVariableShiftSimd128 with both operands used at-start in registers.
-+// Crashes when built without ENABLE_WASM_SIMD.
-+void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  MOZ_ASSERT(ins->rhs()->type() == MIRType::Int32);
-+
-+  if (ins->rhs()->isConstant()) {
-+    // Every non-crashing case assigns shiftCountMask before it is read.
-+    int32_t shiftCountMask;
-+    switch (ins->simdOp()) {
-+      case wasm::SimdOp::I8x16Shl:
-+      case wasm::SimdOp::I8x16ShrU:
-+      case wasm::SimdOp::I8x16ShrS:
-+        shiftCountMask = 7;
-+        break;
-+      case wasm::SimdOp::I16x8Shl:
-+      case wasm::SimdOp::I16x8ShrU:
-+      case wasm::SimdOp::I16x8ShrS:
-+        shiftCountMask = 15;
-+        break;
-+      case wasm::SimdOp::I32x4Shl:
-+      case wasm::SimdOp::I32x4ShrU:
-+      case wasm::SimdOp::I32x4ShrS:
-+        shiftCountMask = 31;
-+        break;
-+      case wasm::SimdOp::I64x2Shl:
-+      case wasm::SimdOp::I64x2ShrU:
-+      case wasm::SimdOp::I64x2ShrS:
-+        shiftCountMask = 63;
-+        break;
-+      default:
-+        MOZ_CRASH("Unexpected shift operation");
-+    }
-+    int32_t shiftCount = ins->rhs()->toConstant()->toInt32() & shiftCountMask;
-+#ifdef DEBUG
-+    js::wasm::ReportSimdAnalysis("shift -> constant shift");
-+#endif
-+    auto* lir = new (alloc())
-+        LWasmConstantShiftSimd128(useRegisterAtStart(ins->lhs()), shiftCount);
-+    define(lir, ins);
-+  } else {
-+#ifdef DEBUG
-+    js::wasm::ReportSimdAnalysis("shift -> variable shift");
-+#endif
-+    auto* lir = new (alloc()) LWasmVariableShiftSimd128(
-+        useRegisterAtStart(ins->lhs()), useRegisterAtStart(ins->rhs()));
-+    define(lir, ins);
-+  }
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+#ifdef ENABLE_WASM_SIMD
-+// Helper: reconstruct raw Wasm byte lane indices from analyzed SimdShuffle.
-+static SimdConstant ReconstructShuffleBytes(const SimdShuffle& s) {
-+  int8_t bytes[16];
-+  if (s.permuteOp) {
-+    switch (*s.permuteOp) {
-+      case SimdPermuteOp::MOVE:
-+        for (int i = 0; i < 16; i++) bytes[i] = i;
-+        return SimdConstant::CreateX16(bytes);
-+      case SimdPermuteOp::PERMUTE_32x4: {
-+        const int32_t* w = reinterpret_cast<const int32_t*>(s.control.bytes());
-+        for (int i = 0; i < 4; i++)
-+          for (int j = 0; j < 4; j++) bytes[i*4+j] = w[i]*4+j;
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::PERMUTE_16x8: {
-+        const int16_t* h = reinterpret_cast<const int16_t*>(s.control.bytes());
-+        for (int i = 0; i < 8; i++) {
-+          int idx = h[i] & 0x7;
-+          bytes[i*2] = idx*2;
-+          bytes[i*2+1] = idx*2+1;
-+        }
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::BROADCAST_8x16: {
-+        int8_t lane = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
-+        for (int i = 0; i < 16; i++) bytes[i] = lane;
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::BROADCAST_16x8: {
-+        int8_t lane = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
-+        for (int i = 0; i < 8; i++) {
-+          bytes[i*2] = lane*2; bytes[i*2+1] = lane*2+1;
-+        }
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::ROTATE_RIGHT_8x16: {
-+        uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
-+        for (int i = 0; i < 16; i++) bytes[i] = (i + shift) % 16;
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::SHIFT_RIGHT_8x16: {
-+        uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
-+        for (int i = 0; i < 16; i++) bytes[i] = (i+shift < 16) ? (i+shift) : 0;
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::SHIFT_LEFT_8x16: {
-+        uint8_t shift = reinterpret_cast<const int8_t*>(s.control.bytes())[0];
-+        for (int i = 0; i < 16; i++) bytes[i] = (i >= shift) ? (i-shift) : 0;
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdPermuteOp::REVERSE_16x8:
-+        // Reverse bytes within each 16-bit lane: [1,0,3,2,5,4,...]
-+        for (int i = 0; i < 8; i++) {
-+          bytes[i*2] = i*2+1; bytes[i*2+1] = i*2;
-+        }
-+        return SimdConstant::CreateX16(bytes);
-+      case SimdPermuteOp::REVERSE_32x4:
-+        // Reverse bytes within each 32-bit lane: [3,2,1,0,7,6,5,4,...]
-+        for (int i = 0; i < 4; i++)
-+          for (int j = 0; j < 4; j++) bytes[i*4+j] = i*4+(3-j);
-+        return SimdConstant::CreateX16(bytes);
-+      case SimdPermuteOp::REVERSE_64x2:
-+        // Reverse bytes within each 64-bit lane: [7,6,5,4,3,2,1,0,15,...]
-+        for (int i = 0; i < 2; i++)
-+          for (int j = 0; j < 8; j++) bytes[i*8+j] = i*8+(7-j);
-+        return SimdConstant::CreateX16(bytes);
-+      default:
-+        break;
-+    }
-+  }
-+  // Handle SimdShuffleOp (two-operand patterns).
-+  if (s.shuffleOp) {
-+    switch (*s.shuffleOp) {
-+      case SimdShuffleOp::CONCAT_RIGHT_SHIFT_8x16: {
-+        // control[0] = suffix length. ARM64 uses 16-count as the EXT shift.
-+        // Reconstruct raw byte indices: EXT(rhs, lhs, 16-count) =
-+        // take (16-count) bytes from rhs end, then count bytes from lhs start.
-+        uint8_t count = 16 - reinterpret_cast<const int8_t*>(s.control.bytes())[0];
-+        for (int i = 0; i < 16; i++) {
-+          int idx = i + count;
-+          bytes[i] = (idx < 16) ? (idx + 16) : (idx - 16);
-+        }
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdShuffleOp::BLEND_8x16: {
-+        // control has 0 (lhs) or -1 (rhs) per byte.
-+        const int8_t* mask = reinterpret_cast<const int8_t*>(s.control.bytes());
-+        for (int i = 0; i < 16; i++)
-+          bytes[i] = mask[i] ? (i + 16) : i;
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+      case SimdShuffleOp::BLEND_16x8: {
-+        const int16_t* mask = reinterpret_cast<const int16_t*>(s.control.bytes());
-+        for (int i = 0; i < 8; i++) {
-+          int base = mask[i] ? (i * 2 + 16) : (i * 2);
-+          bytes[i * 2] = base;
-+          bytes[i * 2 + 1] = base + 1;
-+        }
-+        return SimdConstant::CreateX16(bytes);
-+      }
-+#define INTERLEAVE(name, width, low_start, count) \
-+      case SimdShuffleOp::name: { \
-+        for (int i = 0; i < count; i++) { \
-+          int lhsIdx = low_start + i * width; \
-+          int rhsIdx = lhsIdx + 16; \
-+          for (int j = 0; j < width; j++) { \
-+            bytes[(i * 2) * width + j] = lhsIdx + j; \
-+            bytes[(i * 2 + 1) * width + j] = rhsIdx + j; \
-+          } \
-+        } \
-+        return SimdConstant::CreateX16(bytes); \
-+      }
-+      INTERLEAVE(INTERLEAVE_LOW_8x16, 1, 0, 8)
-+      INTERLEAVE(INTERLEAVE_HIGH_8x16, 1, 8, 8)
-+      INTERLEAVE(INTERLEAVE_LOW_16x8, 2, 0, 4)
-+      INTERLEAVE(INTERLEAVE_HIGH_16x8, 2, 8, 4)
-+      INTERLEAVE(INTERLEAVE_LOW_32x4, 4, 0, 2)
-+      INTERLEAVE(INTERLEAVE_HIGH_32x4, 4, 8, 2)
-+      INTERLEAVE(INTERLEAVE_LOW_64x2, 8, 0, 1)
-+      INTERLEAVE(INTERLEAVE_HIGH_64x2, 8, 8, 1)
-+#undef INTERLEAVE
-+      default:
-+        break;
-+    }
-+  }
-+  // PERMUTE_8x16, SHUFFLE_BLEND_8x16, etc: control should have raw byte indices.
-+  // Force to Int8x16 type to avoid assertions from mismatched types.
-+  if (s.control.type() == SimdConstant::Int8x16) {
-+    return s.control;
-+  }
-+  // Fallback: re-create as Int8x16 from raw bytes.
-+  memcpy(bytes, s.control.bytes(), 16);
-+  return SimdConstant::CreateX16(bytes);
-+}
-+
-+#endif  // ENABLE_WASM_SIMD
-+
-+void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  SimdShuffle s = ins->shuffle();
-+  switch (s.opd) {
-+    case SimdShuffle::Operand::LEFT:
-+    case SimdShuffle::Operand::RIGHT: {
-+      // Single-operand permute: the analysis has identified that only one
-+      // input matters (the other is zero or unused).
-+      LAllocation src;
-+      if (s.opd == SimdShuffle::Operand::LEFT) {
-+        src = useRegisterAtStart(ins->lhs());
-+      } else {
-+        src = useRegisterAtStart(ins->rhs());
-+      }
-+      auto* lir =
-+          new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control);
-+      define(lir, ins);
-+      break;
-+    }
-+    case SimdShuffle::Operand::BOTH:
-+    case SimdShuffle::Operand::BOTH_SWAPPED: {
-+      SimdConstant ctrl = ReconstructShuffleBytes(s);
-+      LAllocation lhs, rhs;
-+      if (s.opd == SimdShuffle::Operand::BOTH_SWAPPED) {
-+        lhs = useRegisterAtStart(ins->rhs());
-+        rhs = useRegisterAtStart(ins->lhs());
-+      } else {
-+        lhs = useRegisterAtStart(ins->lhs());
-+        rhs = useRegisterAtStart(ins->rhs());
-+      }
-+      auto* lir = new (alloc()) LWasmShuffleSimd128(
-+          lhs, rhs, *s.shuffleOp, ctrl);
-+      define(lir, ins);
-+      break;
-+    }
-+  }
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  if (ins->rhs()->type() == MIRType::Int64) {
-+    auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128(
-+        useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs()));
-+    defineReuseInput(lir, ins, LWasmReplaceInt64LaneSimd128::LhsIndex);
-+  } else {
-+    auto* lir = new (alloc()) LWasmReplaceLaneSimd128(
-+        useRegisterAtStart(ins->lhs()), useRegister(ins->rhs()));
-+    defineReuseInput(lir, ins, LWasmReplaceLaneSimd128::LhsIndex);
-+  }
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  if (ins->input()->type() == MIRType::Int64) {
-+    auto* lir =
-+        new (alloc()) LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input()));
-+    define(lir, ins);
-+  } else {
-+    auto* lir =
-+        new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
-+    define(lir, ins);
-+  }
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  MOZ_ASSERT(ins->type() == MIRType::Simd128);
-+  auto* lir = new (alloc())
-+      LWasmUnarySimd128(useRegisterAtStart(ins->input()),
-+                        LDefinition::BogusTemp());
-+  define(lir, ins);
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+#ifdef ENABLE_WASM_SIMD
-+bool LIRGeneratorPPC64::canFoldReduceSimd128AndBranch(wasm::SimdOp op) {
-+  switch (op) {
-+    case wasm::SimdOp::V128AnyTrue:
-+    case wasm::SimdOp::I8x16AllTrue:
-+    case wasm::SimdOp::I16x8AllTrue:
-+    case wasm::SimdOp::I32x4AllTrue:
-+    case wasm::SimdOp::I64x2AllTrue:
-+      return true;
-+    default:
-+      return false;
-+  }
-+}
-+
-+bool LIRGeneratorPPC64::canEmitWasmReduceSimd128AtUses(
-+    MWasmReduceSimd128* ins) {
-+  if (!ins->canEmitAtUses()) {
-+    return false;
-+  }
-+  if (ins->type() != MIRType::Int32) {
-+    return false;
-+  }
-+  if (!canFoldReduceSimd128AndBranch(ins->simdOp())) {
-+    return false;
-+  }
-+  MUseIterator iter(ins->usesBegin());
-+  if (iter == ins->usesEnd()) {
-+    return true;
-+  }
-+  MNode* node = iter->consumer();
-+  if (!node->isDefinition() || !node->toDefinition()->isTest()) {
-+    return false;
-+  }
-+  iter++;
-+  return iter == ins->usesEnd();
-+}
-+#endif  // ENABLE_WASM_SIMD
-+
-+void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  if (canEmitWasmReduceSimd128AtUses(ins)) {
-+    emitAtUses(ins);
-+    return;
-+  }
-+  if (ins->type() == MIRType::Int64) {
-+    auto* lir = new (alloc())
-+        LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input()));
-+    defineInt64(lir, ins);
-+  } else {
-+    auto* lir =
-+        new (alloc()) LWasmReduceSimd128(useRegisterAtStart(ins->input()));
-+    define(lir, ins);
-+  }
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+void LIRGenerator::visitWasmLoadLaneSimd128(MWasmLoadLaneSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  LUse base = useRegisterAtStart(ins->base());
-+  LUse inputUse = useRegisterAtStart(ins->value());
-+  LAllocation memoryBase =
-+      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
-+                           : LGeneralReg(HeapReg);
-+  auto* lir = new (alloc()) LWasmLoadLaneSimd128(base, inputUse, memoryBase);
-+  define(lir, ins);
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+void LIRGenerator::visitWasmStoreLaneSimd128(MWasmStoreLaneSimd128* ins) {
-+#ifdef ENABLE_WASM_SIMD
-+  LUse base = useRegisterAtStart(ins->base());
-+  LUse input = useRegisterAtStart(ins->value());
-+  LAllocation memoryBase =
-+      ins->hasMemoryBase() ? LAllocation(useRegisterAtStart(ins->memoryBase()))
-+                           : LGeneralReg(HeapReg);
-+  auto* lir = new (alloc()) LWasmStoreLaneSimd128(base, input, memoryBase);
-+  add(lir, ins);
-+#else
-+  MOZ_CRASH("No SIMD");
-+#endif
-+}
-+
-+// PPC64 specializes compare+select for {U,}Int32 / {U,}Int64 compare with
-+// Int32 / Int64 result. The CodeGen visitor
-+// (CodeGenerator-ppc64.cpp:visitWasmCompareAndSelect) emits
-+// cmpw/cmplw/cmpd/cmpld + isel = 2 insns, replacing the ~5-7 insns the
-+// generic path would emit (boolean materialization + test + isel). FP
-+// specialization is not worthwhile — the generic FP select path already
-+// runs faster than the specialized integer one and PPC64 lacks a true
-+// fcsel equivalent (fsel only compares against zero).
-+bool LIRGeneratorShared::canSpecializeWasmCompareAndSelect(
-+    MCompare::CompareType compTy, MIRType insTy) {
-+  const bool insOk = insTy == MIRType::Int32 || insTy == MIRType::Int64;
-+  const bool cmpOk = compTy == MCompare::Compare_Int32 ||
-+                     compTy == MCompare::Compare_UInt32 ||
-+                     compTy == MCompare::Compare_Int64 ||
-+                     compTy == MCompare::Compare_UInt64;
-+  return insOk && cmpOk;
-+}
-+
-+void LIRGeneratorShared::lowerWasmCompareAndSelect(MWasmSelect* ins,
-+                                                   MDefinition* lhs,
-+                                                   MDefinition* rhs,
-+                                                   MCompare::CompareType compTy,
-+                                                   JSOp jsop) {
-+  MOZ_ASSERT(canSpecializeWasmCompareAndSelect(compTy, ins->type()));
-+  auto* lir = new (alloc()) LWasmCompareAndSelect(
-+      useRegister(lhs), useRegister(rhs), useRegisterAtStart(ins->trueExpr()),
-+      useRegister(ins->falseExpr()), compTy, jsop);
-+  defineReuseInput(lir, ins, LWasmCompareAndSelect::IfTrueExprIndex);
-+}
-+
-+// MIR helpers needed by the linker
-+#ifdef ENABLE_WASM_SIMD
-+bool MWasmTernarySimd128::specializeBitselectConstantMaskAsShuffle(
-+    int8_t shuffle[16]) {
-+  return false;
-+}
-+#endif
-+
-+bool MWasmBinarySimd128::specializeForConstantRhs() { return false; }
-+
-+#ifdef ENABLE_WASM_SIMD
-+bool MWasmTernarySimd128::canRelaxBitselect() { return false; }
-+#endif
-+
-+#ifdef ENABLE_WASM_SIMD
-+bool MWasmBinarySimd128::canPmaddubsw() { return false; }
-+#endif
-+
-+}  // namespace jit
-+}  // namespace js
-diff --git a/js/src/jit/ppc64/Lowering-ppc64.h b/js/src/jit/ppc64/Lowering-ppc64.h
-new file mode 100644
-index 000000000000..9c3519a7bb23
---- /dev/null
-+++ b/js/src/jit/ppc64/Lowering-ppc64.h
-@@ -0,0 +1,105 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_Lowering_ppc64_h
-+#define jit_ppc64_Lowering_ppc64_h
-+
-+#include "jit/shared/Lowering-shared.h"
-+
-+namespace js {
-+namespace jit {
-+
-+class LIRGeneratorPPC64 : public LIRGeneratorShared {
-+ protected:
-+  LIRGeneratorPPC64(MIRGenerator* gen, MIRGraph& graph, LIRGraph& lirGraph)
-+      : LIRGeneratorShared(gen, graph, lirGraph) {}
-+
-+  LTableSwitch* newLTableSwitch(const LAllocation& in,
-+                                const LDefinition& inputCopy);
-+  LTableSwitchV* newLTableSwitchV(const LBoxAllocation& in);
-+
-+  void lowerForShift(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
-+                     MDefinition* lhs, MDefinition* rhs);
-+  template <class LInstr>
-+  void lowerForShiftInt64(LInstr* ins, MDefinition* mir, MDefinition* lhs,
-+                          MDefinition* rhs);
-+  void lowerForALU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
-+                   MDefinition* input);
-+  void lowerForALU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
-+                   MDefinition* lhs, MDefinition* rhs);
-+  void lowerForALUInt64(LInstructionHelper<INT64_PIECES, INT64_PIECES, 0>* ins,
-+                        MDefinition* mir, MDefinition* input);
-+  void lowerForALUInt64(
-+      LInstructionHelper<INT64_PIECES, 2 * INT64_PIECES, 0>* ins,
-+      MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
-+  void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
-+                        MDefinition* rhs);
-+  void lowerForFPU(LInstructionHelper<1, 1, 0>* ins, MDefinition* mir,
-+                   MDefinition* input);
-+  void lowerForFPU(LInstructionHelper<1, 2, 0>* ins, MDefinition* mir,
-+                   MDefinition* lhs, MDefinition* rhs);
-+
-+  template <size_t Temps>
-+  void lowerForCompareI64(LInstructionHelper<1, 2 * INT64_PIECES, Temps>* lir,
-+                          MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
-+
-+  LBoxAllocation useBoxFixed(MDefinition* mir, Register reg1, Register reg2,
-+                             bool useAtStart = false);
-+
-+  LAllocation useByteOpRegister(MDefinition* mir);
-+  LAllocation useByteOpRegisterAtStart(MDefinition* mir);
-+  LAllocation useByteOpRegisterOrNonDoubleConstant(MDefinition* mir);
-+  LDefinition tempByteOpRegister();
-+
-+  LDefinition tempToUnbox();
-+
-+  bool needTempForPostBarrier() { return true; }
-+
-+  void lowerUntypedPhiInput(MPhi* phi, uint32_t inputPosition, LBlock* block,
-+                            size_t lirIndex);
-+  void lowerInt64PhiInput(MPhi* phi, uint32_t inputPosition, LBlock* block,
-+                          size_t lirIndex);
-+  void defineInt64Phi(MPhi* phi, size_t lirIndex);
-+
-+  void lowerMulI(MMul* mul, MDefinition* lhs, MDefinition* rhs);
-+  void lowerDivI(MDiv* div);
-+  void lowerDivI64(MDiv* div);
-+  void lowerModI(MMod* mod);
-+  void lowerModI64(MMod* mod);
-+  void lowerUDiv(MDiv* div);
-+  void lowerUDivI64(MDiv* div);
-+  void lowerUMod(MMod* mod);
-+  void lowerUModI64(MMod* mod);
-+  void lowerUrshD(MUrsh* mir);
-+  void lowerPowOfTwoI(MPow* mir);
-+  void lowerBigIntPtrDiv(MBigIntPtrDiv* ins);
-+  void lowerBigIntPtrMod(MBigIntPtrMod* ins);
-+  void lowerBigIntPtrLsh(MBigIntPtrLsh* ins);
-+  void lowerBigIntPtrRsh(MBigIntPtrRsh* ins);
-+  void lowerTruncateDToInt32(MTruncateToInt32* ins);
-+  void lowerTruncateFToInt32(MTruncateToInt32* ins);
-+  void lowerBuiltinInt64ToFloatingPoint(MBuiltinInt64ToFloatingPoint* ins);
-+  void lowerWasmSelectI(MWasmSelect* select);
-+  void lowerWasmSelectI64(MWasmSelect* select);
-+  void lowerWasmBuiltinTruncateToInt64(MWasmBuiltinTruncateToInt64* ins);
-+  void lowerWasmBuiltinTruncateToInt32(MWasmBuiltinTruncateToInt32* ins);
-+  void lowerWasmBuiltinDivI64(MWasmBuiltinDivI64* div);
-+  void lowerWasmBuiltinModI64(MWasmBuiltinModI64* mod);
-+  void lowerAtomicLoad64(MLoadUnboxedScalar* ins);
-+  void lowerAtomicStore64(MStoreUnboxedScalar* ins);
-+
-+#ifdef ENABLE_WASM_SIMD
-+  bool canFoldReduceSimd128AndBranch(wasm::SimdOp op);
-+  bool canEmitWasmReduceSimd128AtUses(MWasmReduceSimd128* ins);
-+#endif
-+};
-+
-+typedef LIRGeneratorPPC64 LIRGeneratorSpecific;
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_Lowering_ppc64_h */
-diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h b/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
-new file mode 100644
-index 000000000000..f82ca36b4e40
---- /dev/null
-+++ b/js/src/jit/ppc64/MacroAssembler-ppc64-inl.h
-@@ -0,0 +1,6142 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_MacroAssembler_ppc64_inl_h
-+#define jit_ppc64_MacroAssembler_ppc64_inl_h
-+
-+#include "jit/ppc64/MacroAssembler-ppc64.h"
-+
-+namespace js {
-+namespace jit {
-+
-+//{{{ check_macroassembler_style
-+
-+// ===============================================================
-+// Move instructions
-+
-+void MacroAssembler::move64(Register64 src, Register64 dest) {
-+  movePtr(src.reg, dest.reg);
-+}
-+
-+void MacroAssembler::move64(Imm64 imm, Register64 dest) {
-+  movePtr(ImmWord(imm.value), dest.reg);
-+}
-+
-+void MacroAssembler::moveDoubleToGPR64(FloatRegister src, Register64 dest) {
-+  as_mfvsrd(dest.reg, src);
-+}
-+
-+void MacroAssembler::moveGPR64ToDouble(Register64 src, FloatRegister dest) {
-+  as_mtvsrd(dest, src.reg);
-+}
-+
-+void MacroAssembler::moveLowDoubleToGPR(FloatRegister src, Register dest) {
-+  MOZ_CRASH("Not supported for this target");
-+}
-+
-+void MacroAssembler::move64To32(Register64 src, Register dest) {
-+  as_extsw(dest, src.reg);
-+}
-+
-+void MacroAssembler::move32To64ZeroExtend(Register src, Register64 dest) {
-+  // clrldi dest, src, 32 — clear upper 32 bits.
-+  as_rldicl(dest.reg, src, 0, 32);
-+}
-+
-+void MacroAssembler::move8To64SignExtend(Register src, Register64 dest) {
-+  as_extsb(dest.reg, src);
-+}
-+
-+void MacroAssembler::move16To64SignExtend(Register src, Register64 dest) {
-+  as_extsh(dest.reg, src);
-+}
-+
-+void MacroAssembler::move32To64SignExtend(Register src, Register64 dest) {
-+  as_extsw(dest.reg, src);
-+}
-+
-+void MacroAssembler::moveFloat32ToGPR(FloatRegister src, Register dest) {
-+  // FPR holds double-format value (PPC convention). Convert to
-+  // single-precision bits in bits 0:31 of the VSR, then extract.
-+  as_xscvdpspn(ScratchDoubleReg, src);
-+  as_mfvsrd(dest, ScratchDoubleReg);
-+  x_srdi(dest, dest, 32);
-+}
-+
-+void MacroAssembler::moveGPRToFloat32(Register src, FloatRegister dest) {
-+  // Place raw single-precision bits in VSR bits 0:31, then convert
-+  // to double-precision format (matching PPC's FPR convention, like lfs).
-+  if (HasPOWER9()) {
-+    // mtvsrws splats the 32-bit word to both halves of the VSR.
-+    as_mtvsrws(dest, src);
-+  } else {
-+    // POWER8: shift GPR left 32 bits to place float bits in upper word,
-+    // then move to VSR. xscvspdpn reads from bits 0:31.
-+    UseScratchRegisterScope temps(*this);
-+    Register tmp = temps.Acquire();
-+    x_sldi(tmp, src, 32);
-+    as_mtvsrd(dest, tmp);
-+  }
-+  as_xscvspdpn(dest, dest);
-+}
-+
-+void MacroAssembler::moveFloat16ToGPR(FloatRegister src, Register dest) {
-+  MOZ_ASSERT(HasPOWER9());
-+  // src has FP16 in dw0 bits 48:63; rest of dw0 is 0 (per xscvdphp /
-+  // lxsihzx / mtvsrwz contract). mfvsrd reads dw0 → dest = 0x...0000_HHHH.
-+  // Mask defensively in case a future caller hands us a non-canonical FP16.
-+  as_mfvsrd(dest, src);
-+  as_rldicl(dest, dest, 0, 48);  // clrldi 48: keep low 16 bits
-+}
-+
-+void MacroAssembler::moveGPRToFloat16(Register src, FloatRegister dest) {
-+  MOZ_ASSERT(HasPOWER9());
-+  // mtvsrwz zeros dw0 word 0 and copies src's low 32 to dw0 word 1; mask
-+  // src to its low 16 first so dw0 bits 32:47 stay zero (canonical FP16).
-+  UseScratchRegisterScope temps(*this);
-+  Register scratch = temps.Acquire();
-+  as_rldicl(scratch, src, 0, 48);  // clrldi 48: keep only low 16
-+  as_mtvsrwz(dest, scratch);
-+}
-+
-+void MacroAssembler::move8ZeroExtend(Register src, Register dest) {
-+  // rlwinm dest, src, 0, 24, 31 — mask to low 8 bits.
-+  as_rlwinm(dest, src, 0, 24, 31);
-+}
-+
-+void MacroAssembler::move8SignExtend(Register src, Register dest) {
-+  as_extsb(dest, src);
-+}
-+
-+void MacroAssembler::move16SignExtend(Register src, Register dest) {
-+  as_extsh(dest, src);
-+}
-+
-+void MacroAssembler::move8SignExtendToPtr(Register src, Register dest) {
-+  as_extsb(dest, src);
-+}
-+
-+void MacroAssembler::move16SignExtendToPtr(Register src, Register dest) {
-+  as_extsh(dest, src);
-+}
-+
-+void MacroAssembler::move32SignExtendToPtr(Register src, Register dest) {
-+  as_extsw(dest, src);
-+}
-+
-+void MacroAssembler::move32ZeroExtendToPtr(Register src, Register dest) {
-+  as_rldicl(dest, src, 0, 32);
-+}
-+
-+// ===============================================================
-+// Load instructions
-+
-+void MacroAssembler::load32SignExtendToPtr(const Address& src, Register dest) {
-+  load32(src, dest);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::loadAbiReturnAddress(Register dest) { xs_mflr(dest); }
-+
-+// ===============================================================
-+// Logical instructions
-+
-+void MacroAssembler::not32(Register reg) {
-+  x_not(reg, reg);
-+  as_extsw(reg, reg);
-+}
-+
-+void MacroAssembler::notPtr(Register reg) { x_not(reg, reg); }
-+
-+void MacroAssembler::andPtr(Register src, Register dest) {
-+  as_and_(dest, dest, src);
-+}
-+
-+// If `mask` is a non-zero, non-all-ones contiguous run of 1-bits in a
-+// 32-bit value (LSB-numbering), set MB/ME to the BE bit positions
-+// (PPC convention: bit 0 = MSB) needed by `rlwinm SH=0` and return true.
-+// Otherwise return false. Run-time cost is at JIT emit time only.
-+static inline bool IsContigMask32(uint32_t mask, unsigned& mb, unsigned& me) {
-+  if (mask == 0 || mask == 0xFFFFFFFFu) return false;
-+  unsigned tz = (unsigned)__builtin_ctz(mask);
-+  uint32_t shifted = mask >> tz;
-+  if ((shifted & (shifted + 1)) != 0) return false;  // Has a 0 between 1s.
-+  unsigned width = 32 - (unsigned)__builtin_clz(shifted);
-+  // LSB bits set: [tz, tz+width-1]. BE bits: [32-tz-width, 31-tz].
-+  mb = 32 - tz - width;
-+  me = 31 - tz;
-+  return true;
-+}
-+
-+// 64-bit contiguous-mask classification for AND-with-imm via PPC's
-+// rotate-and-mask family (SH=0). On success, sets `lsb` (LSB-numbering
-+// of lowest set bit) and `width` (number of contiguous 1-bits).
-+// Caller picks the encoding:
-+//   - lsb == 0:                low `width` bits set        → rldicl
-+//   (mb6=64-width)
-+//   - lsb + width == 64:       high `width` bits set       → rldicr
-+//   (me6=width-1)
-+//   - lsb + width <= 32:       contig mask within low 32   → rlwinm (zeros high
-+//   32)
-+//   - otherwise (mid-run mask straddling bit 32 with lsb>0): no SH=0 single
-+//     insn fits, return false to fall back to scratch+and.
-+static inline bool IsContigMask64(uint64_t mask, unsigned& lsb,
-+                                  unsigned& width) {
-+  if (mask == 0 || mask == ~uint64_t(0)) return false;
-+  unsigned tz = (unsigned)__builtin_ctzll(mask);
-+  uint64_t shifted = mask >> tz;
-+  if ((shifted & (shifted + 1)) != 0) return false;  // Has a 0 between 1s.
-+  width = 64 - (unsigned)__builtin_clzll(shifted);
-+  lsb = tz;
-+  return true;
-+}
-+
-+void MacroAssembler::andPtr(Imm32 imm, Register dest) {
-+  // andi. handles 16-bit unsigned immediates in 1 insn (sets CR0).
-+  // For wider positive immediates, IsContigMask32 → rlwinm (1 insn,
-+  // also sets CR0). NOTE: andPtr sign-extends Imm32 to 64-bit before
-+  // ANDing, so contig-mask is only safe when the immediate is
-+  // non-negative (high bit clear) — rlwinm always zeros the high 32.
-+  uint32_t uimm = uint32_t(imm.value);
-+  if (is_uintN(uimm, 16)) {
-+    as_andi_rc(dest, dest, uimm);
-+    return;
-+  }
-+  unsigned mb, me;
-+  if (imm.value >= 0 && IsContigMask32(uimm, mb, me)) {
-+    as_rlwinm_rc(dest, dest, 0, mb, me);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
-+  as_and_(dest, dest, scratch);
-+}
-+
-+void MacroAssembler::andPtr(Imm32 imm, Register src, Register dest) {
-+  if (src != dest) {
-+    xs_mr(dest, src);
-+  }
-+  andPtr(imm, dest);
-+}
-+
-+void MacroAssembler::and64(Imm64 imm, Register64 dest) {
-+  uint64_t u = imm.value;
-+  // 16-bit unsigned → andi. (1 insn).
-+  if (u <= 0xFFFFu) {
-+    as_andi_rc(dest.reg, dest.reg, uint16_t(u));
-+    return;
-+  }
-+  unsigned lsb, width;
-+  if (IsContigMask64(u, lsb, width)) {
-+    if (lsb == 0) {
-+      // low `width` bits set: rldicl SH=0 MB=64-width.
-+      as_rldicl_rc(dest.reg, dest.reg, 0, 64 - width);
-+      return;
-+    }
-+    if (lsb + width == 64) {
-+      // high `width` bits set: rldicr SH=0 ME=width-1.
-+      as_rldicr_rc(dest.reg, dest.reg, 0, width - 1);
-+      return;
-+    }
-+    if (lsb + width <= 32) {
-+      // contig mask within low 32: rlwinm SH=0 zeros bits 0..31 too.
-+      // BE positions: mb = 32 - lsb - width, me = 31 - lsb.
-+      as_rlwinm_rc(dest.reg, dest.reg, 0, 32 - lsb - width, 31 - lsb);
-+      return;
-+    }
-+    // mid-run mask straddling bit 32 (lsb>0, lsb+width>32, lsb+width<64):
-+    // not encodable as SH=0 mask. Fall through to scratch+and.
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(u), scratch);
-+  as_and_(dest.reg, dest.reg, scratch);
-+}
-+
-+void MacroAssembler::and64(Register64 src, Register64 dest) {
-+  as_and_(dest.reg, dest.reg, src.reg);
-+}
-+
-+void MacroAssembler::and32(Register src, Register dest) {
-+  as_and_(dest, dest, src);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::and32(Imm32 imm, Register dest) {
-+  uint32_t uimm = uint32_t(imm.value);
-+  if (is_uintN(uimm, 16)) {
-+    as_andi_rc(dest, dest, uimm);
-+  } else {
-+    unsigned mb, me;
-+    if (IsContigMask32(uimm, mb, me)) {
-+      // rlwinm.SH=0 ANDs with the contiguous mask; record form sets CR0
-+      // to match the side-effect of the andi. fast path above.
-+      as_rlwinm_rc(dest, dest, 0, mb, me);
-+    } else {
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      move32(imm, scratch);
-+      as_and_(dest, dest, scratch);
-+    }
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::and32(Imm32 imm, Register src, Register dest) {
-+  if (src != dest) {
-+    xs_mr(dest, src);
-+  }
-+  and32(imm, dest);
-+}
-+
-+void MacroAssembler::and32(Imm32 imm, const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(dest, scratch);
-+  and32(imm, scratch);
-+  store32(scratch, dest);
-+}
-+
-+void MacroAssembler::and32(const Address& src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(src, scratch);
-+  as_and_(dest, dest, scratch);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::or64(Imm64 imm, Register64 dest) {
-+  uint64_t u = imm.value;
-+  // ori/oris zero-extend their immediates and don't touch other bits, so
-+  // when imm fits in unsigned 32 (high 32 == 0) the pair handles it in
-+  // 1-2 insns with no scratch.
-+  if (u <= 0xFFFFFFFFu) {
-+    uint16_t lo = uint16_t(u);
-+    uint16_t hi = uint16_t(u >> 16);
-+    if (hi == 0) {
-+      as_ori(dest.reg, dest.reg, lo);
-+    } else if (lo == 0) {
-+      as_oris(dest.reg, dest.reg, hi);
-+    } else {
-+      as_ori(dest.reg, dest.reg, lo);
-+      as_oris(dest.reg, dest.reg, hi);
-+    }
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(u), scratch);
-+  as_or_(dest.reg, dest.reg, scratch);
-+}
-+
-+void MacroAssembler::or32(Register src, Register dest) {
-+  as_or_(dest, dest, src);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::or32(Imm32 imm, Register dest) {
-+  uint32_t uimm = uint32_t(imm.value);
-+  uint16_t lo = uimm & 0xFFFF;
-+  uint16_t hi = (uimm >> 16) & 0xFFFF;
-+  if (hi == 0) {
-+    as_ori(dest, dest, lo);
-+  } else if (lo == 0) {
-+    as_oris(dest, dest, hi);
-+  } else {
-+    // ori + oris pair handles arbitrary 32-bit unsigned imm in 2 insns
-+    // without a scratch GPR. ori/oris are non-record forms (don't touch
-+    // CR0), matching the behavior of the previous scratch+or_ path
-+    // (or_ is the record form, but the value-only result is what callers
-+    // observe through dest).
-+    as_ori(dest, dest, lo);
-+    as_oris(dest, dest, hi);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::or32(Imm32 imm, Register src, Register dest) {
-+  if (src != dest) {
-+    xs_mr(dest, src);
-+  }
-+  or32(imm, dest);
-+}
-+
-+void MacroAssembler::or32(Imm32 imm, const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(dest, scratch);
-+  or32(imm, scratch);
-+  store32(scratch, dest);
-+}
-+
-+void MacroAssembler::xor64(Imm64 imm, Register64 dest) {
-+  uint64_t u = imm.value;
-+  // xori/xoris zero-extend their immediates; for unsigned-32-fit values
-+  // they replace the scratch+xor sequence with 1-2 insns.
-+  if (u <= 0xFFFFFFFFu) {
-+    uint16_t lo = uint16_t(u);
-+    uint16_t hi = uint16_t(u >> 16);
-+    if (hi == 0) {
-+      as_xori(dest.reg, dest.reg, lo);
-+    } else if (lo == 0) {
-+      as_xoris(dest.reg, dest.reg, hi);
-+    } else {
-+      as_xori(dest.reg, dest.reg, lo);
-+      as_xoris(dest.reg, dest.reg, hi);
-+    }
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(u), scratch);
-+  as_xor_(dest.reg, dest.reg, scratch);
-+}
-+
-+void MacroAssembler::orPtr(Register src, Register dest) {
-+  as_or_(dest, dest, src);
-+}
-+
-+void MacroAssembler::orPtr(Imm32 imm, Register dest) {
-+  uint32_t uimm = uint32_t(imm.value);
-+  uint16_t lo = uimm & 0xFFFF;
-+  uint16_t hi = (uimm >> 16) & 0xFFFF;
-+  // ori/oris zero-extend their immediates, so for non-negative Imm32 (high
-+  // 32 of sign-extended value = 0) we can use ori+oris to OR the full
-+  // 32-bit pattern in 1-2 insns. Negative Imm32 sign-extends to set high
-+  // bits 32..63 in the OR — those bits would be lost with ori+oris alone.
-+  if (imm.value >= 0) {
-+    if (hi == 0) {
-+      as_ori(dest, dest, lo);
-+    } else if (lo == 0) {
-+      as_oris(dest, dest, hi);
-+    } else {
-+      as_ori(dest, dest, lo);
-+      as_oris(dest, dest, hi);
-+    }
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
-+  as_or_(dest, dest, scratch);
-+}
-+
-+void MacroAssembler::orPtr(Imm32 imm, Register src, Register dest) {
-+  if (src != dest) {
-+    xs_mr(dest, src);
-+  }
-+  orPtr(imm, dest);
-+}
-+
-+void MacroAssembler::or64(Register64 src, Register64 dest) {
-+  as_or_(dest.reg, dest.reg, src.reg);
-+}
-+
-+void MacroAssembler::xor64(Register64 src, Register64 dest) {
-+  as_xor_(dest.reg, dest.reg, src.reg);
-+}
-+
-+void MacroAssembler::xorPtr(Register src, Register dest) {
-+  as_xor_(dest, dest, src);
-+}
-+
-+void MacroAssembler::xorPtr(Imm32 imm, Register dest) {
-+  uint32_t uimm = uint32_t(imm.value);
-+  uint16_t lo = uimm & 0xFFFF;
-+  uint16_t hi = (uimm >> 16) & 0xFFFF;
-+  if (imm.value >= 0) {
-+    if (hi == 0) {
-+      as_xori(dest, dest, lo);
-+    } else if (lo == 0) {
-+      as_xoris(dest, dest, hi);
-+    } else {
-+      as_xori(dest, dest, lo);
-+      as_xoris(dest, dest, hi);
-+    }
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(uintptr_t(intptr_t(imm.value))), scratch);
-+  as_xor_(dest, dest, scratch);
-+}
-+
-+void MacroAssembler::xorPtr(Imm32 imm, Register src, Register dest) {
-+  if (src != dest) {
-+    xs_mr(dest, src);
-+  }
-+  xorPtr(imm, dest);
-+}
-+
-+void MacroAssembler::xor32(Register src, Register dest) {
-+  as_xor_(dest, dest, src);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::xor32(Imm32 imm, Register dest) {
-+  uint32_t uimm = uint32_t(imm.value);
-+  uint16_t lo = uimm & 0xFFFF;
-+  uint16_t hi = (uimm >> 16) & 0xFFFF;
-+  if (hi == 0) {
-+    as_xori(dest, dest, lo);
-+  } else if (lo == 0) {
-+    as_xoris(dest, dest, hi);
-+  } else {
-+    // xori + xoris pair — 2 insns, no scratch GPR.
-+    as_xori(dest, dest, lo);
-+    as_xoris(dest, dest, hi);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::xor32(Imm32 imm, Register src, Register dest) {
-+  if (src != dest) {
-+    xs_mr(dest, src);
-+  }
-+  xor32(imm, dest);
-+}
-+
-+void MacroAssembler::xor32(Imm32 imm, const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(dest, scratch);
-+  xor32(imm, scratch);
-+  store32(scratch, dest);
-+}
-+
-+void MacroAssembler::xor32(const Address& src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(src, scratch);
-+  xor32(scratch, dest);
-+}
-+
-+// ===============================================================
-+// Swap instructions
-+
-+void MacroAssembler::byteSwap16SignExtend(Register reg) {
-+  if (HasPOWER10()) {
-+    // brh byte-reverses every halfword in reg; extsh keeps just the
-+    // low halfword's byte-reversed value, sign-extended to 64 bits.
-+    as_brh(reg, reg);
-+    as_extsh(reg, reg);
-+    return;
-+  }
-+  // POWER8/9: rotate-and-mask synthesis. Swap bytes in low halfword via
-+  // (reg<<8)&0xFF00 | (reg>>8)&0xFF, then sign-extend.
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  as_rlwinm(scratch, reg, 8, 16, 23);  // scratch = (reg<<8) & 0xFF00
-+  as_rlwinm(reg, reg, 24, 24, 31);     // reg = (reg>>8) & 0xFF
-+  as_or_(reg, reg, scratch);
-+  as_extsh(reg, reg);
-+}
-+
-+void MacroAssembler::byteSwap16ZeroExtend(Register reg) {
-+  if (HasPOWER10()) {
-+    // brh byte-reverses every halfword; rldicl with sh=0,mb=48 zeroes
-+    // the upper 48 bits — no CR0 side effect (vs andi.).
-+    as_brh(reg, reg);
-+    as_rldicl(reg, reg, 0, 48);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  // Both rlwinm forms zero-extend the 64-bit destination per ISA v3.0B
-+  // (mask M = MASK(MB+32, ME+32) is 0 above bit 31), so after the OR the
-+  // upper 48 bits are already zero — no follow-up clearing needed.
-+  as_rlwinm(scratch, reg, 8, 16, 23);
-+  as_rlwinm(reg, reg, 24, 24, 31);
-+  as_or_(reg, reg, scratch);
-+}
-+
-+void MacroAssembler::byteSwap32(Register reg) {
-+  if (HasPOWER10()) {
-+    // brw byte-reverses both 32-bit halves; extsw drops the upper half
-+    // and sign-extends the byte-reversed low word to 64 bits.
-+    as_brw(reg, reg);
-+    as_extsw(reg, reg);
-+    return;
-+  }
-+  // POWER8/9: rotate-with-insert synthesis (4 insns).
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  // scratch = rotate reg left 8, mask bytes 0,3
-+  as_rlwinm(scratch, reg, 8, 0, 31);    // rotl32 by 8
-+  as_rlwimi(scratch, reg, 24, 0, 7);    // insert byte 0
-+  as_rlwimi(scratch, reg, 24, 16, 23);  // insert byte 2
-+  // Sign-extend to 64 bits (as 32-bit value).
-+  as_extsw(reg, scratch);
-+}
-+
-+void MacroAssembler::byteSwap64(Register64 reg64) {
-+  if (HasPOWER10()) {
-+    // 1 insn, no FPR round-trip.
-+    as_brd(reg64.reg, reg64.reg);
-+  } else if (HasPOWER9()) {
-+    as_mtvsrd(ScratchDoubleReg, reg64.reg);
-+    as_xxbrd(ScratchDoubleReg, ScratchDoubleReg);
-+    as_mfvsrd(reg64.reg, ScratchDoubleReg);
-+  } else {
-+    // POWER8: byte-swap via stack using stwbrx (word byte-reverse store).
-+    // stwbrx RS,RA,RB stores RS byte-reversed at RA+RB.
-+    // For 64-bit swap: store high word reversed at addr+0, low word at addr+4.
-+    Register r = reg64.reg;
-+    UseScratchRegisterScope temps(*this);
-+    Register tmp = temps.Acquire();
-+    as_stdu(StackPointer, StackPointer, -16);
-+    // Store low 32 bits byte-reversed at SP+12.
-+    as_addi(tmp, StackPointer, 12);
-+    as_stwbrx(r, r0, tmp);  // r0 as RA = 0, so addr = tmp
-+    // Store high 32 bits byte-reversed at SP+8.
-+    x_srdi(r, r, 32);
-+    as_addi(tmp, StackPointer, 8);
-+    as_stwbrx(r, r0, tmp);  // addr = tmp
-+    // Load reversed 64-bit value from SP+8.
-+    as_ld(r, StackPointer, 8);
-+    as_addi(StackPointer, StackPointer, 16);
-+  }
-+}
-+
-+// ===============================================================
-+// Arithmetic functions
-+
-+void MacroAssembler::addPtr(Register src, Register dest) {
-+  as_add(dest, dest, src);
-+}
-+
-+void MacroAssembler::addPtr(Imm32 imm, Register dest) {
-+  int32_t val = imm.value;
-+  if (is_intN(val, 16)) {
-+    as_addi(dest, dest, val);
-+    return;
-+  }
-+  if (HasPOWER10()) {
-+    // Imm32 always fits 34-bit signed; paddi does dest = dest + imm in one
-+    // prefixed instruction with no scratch.
-+    as_paddi(dest, dest, int64_t(val), /*R=*/false);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(int64_t(val)), scratch);
-+  as_add(dest, dest, scratch);
-+}
-+
-+void MacroAssembler::addPtr(ImmWord imm, Register dest) {
-+  if (is_intN(int64_t(imm.value), 16)) {
-+    as_addi(dest, dest, int16_t(imm.value));
-+    return;
-+  }
-+  if (HasPOWER10() && is_intN((intptr_t)imm.value, 34)) {
-+    as_paddi(dest, dest, (int64_t)imm.value, /*R=*/false);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(imm, scratch);
-+  as_add(dest, dest, scratch);
-+}
-+
-+void MacroAssembler::add64(Register64 src, Register64 dest) {
-+  as_add(dest.reg, dest.reg, src.reg);
-+}
-+
-+void MacroAssembler::add64(Imm32 imm, Register64 dest) {
-+  addPtr(Imm32(imm.value), dest.reg);
-+}
-+
-+void MacroAssembler::add64(Imm64 imm, Register64 dest) {
-+  if (is_intN(int64_t(imm.value), 16)) {
-+    as_addi(dest.reg, dest.reg, int16_t(imm.value));
-+    return;
-+  }
-+  if (HasPOWER10() && is_intN((int64_t)imm.value, 34)) {
-+    as_paddi(dest.reg, dest.reg, (int64_t)imm.value, /*R=*/false);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  MOZ_ASSERT(dest.reg != scratch);
-+  movePtr(ImmWord(imm.value), scratch);
-+  as_add(dest.reg, dest.reg, scratch);
-+}
-+
-+void MacroAssembler::add32(Register src, Register dest) {
-+  as_add(dest, dest, src);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::add32(Imm32 imm, Register dest) {
-+  if (is_intN(imm.value, 16)) {
-+    as_addi(dest, dest, imm.value);
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    as_add(dest, dest, scratch);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::add32(Imm32 imm, Register src, Register dest) {
-+  move32(src, dest);
-+  add32(imm, dest);
-+}
-+
-+void MacroAssembler::add32(Imm32 imm, const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(dest, scratch);
-+  add32(imm, scratch);
-+  store32(scratch, dest);
-+}
-+
-+void MacroAssembler::add32(const Address& src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(src, scratch);
-+  as_add(dest, dest, scratch);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::addPtr(Imm32 imm, const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(dest, scratch);
-+  addPtr(imm, scratch);
-+  storePtr(scratch, dest);
-+}
-+
-+void MacroAssembler::addPtr(const Address& src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(src, scratch);
-+  addPtr(scratch, dest);
-+}
-+
-+void MacroAssembler::addDouble(FloatRegister src, FloatRegister dest) {
-+  as_fadd(dest, dest, src);
-+}
-+
-+void MacroAssembler::addFloat32(FloatRegister src, FloatRegister dest) {
-+  as_fadds(dest, dest, src);
-+}
-+
-+CodeOffset MacroAssembler::sub32FromStackPtrWithPatch(Register dest) {
-+  CodeOffset offset = CodeOffset(currentOffset());
-+  emitLoad64Stanza(dest, 0);
-+  as_subf(dest, dest, StackPointer);
-+  return offset;
-+}
-+
-+void MacroAssembler::patchSub32FromStackPtr(CodeOffset offset, Imm32 imm) {
-+  Instruction* inst = (Instruction*)editSrc(BufferOffset(offset.offset()));
-+  UpdateLoad64Value(inst, uint64_t(int64_t(imm.value)));
-+}
-+
-+void MacroAssembler::subPtr(Register src, Register dest) {
-+  as_subf(dest, src, dest);
-+}
-+
-+void MacroAssembler::subPtr(Imm32 imm, Register dest) {
-+  if (is_intN(-int64_t(imm.value), 16)) {
-+    as_addi(dest, dest, -imm.value);
-+    return;
-+  }
-+  if (HasPOWER10()) {
-+    // -Imm32 fits 34-bit signed (worst case -INT32_MIN = +2^31, well within
-+    // ±2^33). paddi with the negated immediate does the subtract in 1 prefixed
-+    // insn with no scratch.
-+    as_paddi(dest, dest, -int64_t(imm.value), /*R=*/false);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(int64_t(imm.value)), scratch);
-+  as_subf(dest, scratch, dest);
-+}
-+
-+void MacroAssembler::sub64(Register64 src, Register64 dest) {
-+  as_subf(dest.reg, src.reg, dest.reg);
-+}
-+
-+void MacroAssembler::sub64(Imm64 imm, Register64 dest) {
-+  if (is_intN(-int64_t(imm.value), 16)) {
-+    as_addi(dest.reg, dest.reg, int16_t(-int64_t(imm.value)));
-+    return;
-+  }
-+  if (HasPOWER10() && is_intN(-(int64_t)imm.value, 34)) {
-+    as_paddi(dest.reg, dest.reg, -(int64_t)imm.value, /*R=*/false);
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  MOZ_ASSERT(dest.reg != scratch);
-+  movePtr(ImmWord(imm.value), scratch);
-+  as_subf(dest.reg, scratch, dest.reg);
-+}
-+
-+void MacroAssembler::sub32(Register src, Register dest) {
-+  as_subf(dest, src, dest);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::sub32(Imm32 imm, Register dest) {
-+  if (is_intN(-int64_t(imm.value), 16)) {
-+    as_addi(dest, dest, -imm.value);
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    as_subf(dest, scratch, dest);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::sub32(const Address& src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(src, scratch);
-+  as_subf(dest, scratch, dest);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::subPtr(Register src, const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(dest, scratch);
-+  as_subf(scratch, src, scratch);
-+  storePtr(scratch, dest);
-+}
-+
-+void MacroAssembler::subPtr(const Address& addr, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(addr, scratch);
-+  as_subf(dest, scratch, dest);
-+}
-+
-+void MacroAssembler::subDouble(FloatRegister src, FloatRegister dest) {
-+  as_fsub(dest, dest, src);
-+}
-+
-+void MacroAssembler::subFloat32(FloatRegister src, FloatRegister dest) {
-+  as_fsubs(dest, dest, src);
-+}
-+
-+void MacroAssembler::mul64(const Register64& rhs, const Register64& srcDest) {
-+  as_mulld(srcDest.reg, srcDest.reg, rhs.reg);
-+}
-+
-+void MacroAssembler::mul64(Imm64 imm, const Register64& dest) {
-+  if (is_intN(int64_t(imm.value), 16)) {
-+    as_mulli(dest.reg, dest.reg, int16_t(imm.value));
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(dest.reg != scratch);
-+    movePtr(ImmWord(imm.value), scratch);
-+    as_mulld(dest.reg, dest.reg, scratch);
-+  }
-+}
-+
-+void MacroAssembler::mul64(Imm64 imm, const Register64& dest,
-+                           const Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  mul64(imm, dest);
-+}
-+
-+void MacroAssembler::mul64(const Register64& src, const Register64& dest,
-+                           const Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  as_mulld(dest.reg, dest.reg, src.reg);
-+}
-+
-+void MacroAssembler::mulPtr(Register rhs, Register srcDest) {
-+  as_mulld(srcDest, srcDest, rhs);
-+}
-+
-+void MacroAssembler::mulPtr(ImmWord rhs, Register srcDest) {
-+  if (is_intN(int64_t(rhs.value), 16)) {
-+    as_mulli(srcDest, srcDest, int16_t(rhs.value));
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  MOZ_ASSERT(srcDest != scratch);
-+  movePtr(rhs, scratch);
-+  mulPtr(scratch, srcDest);
-+}
-+
-+void MacroAssembler::mulBy3(Register src, Register dest) {
-+  // mulli is the 16-bit-immediate form of mulld. 1 insn, no scratch,
-+  // src==dest aliasing safe (RA read before RT write).
-+  as_mulli(dest, src, 3);
-+}
-+
-+void MacroAssembler::mul32(Register rhs, Register srcDest) {
-+  as_mullw(srcDest, srcDest, rhs);
-+  as_extsw(srcDest, srcDest);
-+}
-+
-+void MacroAssembler::mul32(Imm32 imm, Register srcDest) {
-+  if (is_intN(imm.value, 16)) {
-+    as_mulli(srcDest, srcDest, imm.value);
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    as_mullw(srcDest, srcDest, scratch);
-+  }
-+  as_extsw(srcDest, srcDest);
-+}
-+
-+void MacroAssembler::mulHighUnsigned32(Imm32 imm, Register src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  MOZ_ASSERT(src != scratch);
-+  move32(imm, scratch);
-+  as_mulhwu(dest, src, scratch);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::mulFloat32(FloatRegister src, FloatRegister dest) {
-+  as_fmuls(dest, dest, src);
-+}
-+
-+void MacroAssembler::mulDouble(FloatRegister src, FloatRegister dest) {
-+  as_fmul(dest, dest, src);
-+}
-+
-+void MacroAssembler::mulDoublePtr(ImmPtr imm, Register temp,
-+                                  FloatRegister dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(imm, scratch);
-+  as_lfd(ScratchDoubleReg, scratch, 0);
-+  as_fmul(dest, dest, ScratchDoubleReg);
-+}
-+
-+void MacroAssembler::inc64(AbsoluteAddress dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register addrReg = temps.Acquire();
-+  movePtr(ImmWord(uintptr_t(dest.addr)), addrReg);
-+  Register scratch = SecondScratchReg;
-+  as_ld(scratch, addrReg, 0);
-+  as_addi(scratch, scratch, 1);
-+  as_std(scratch, addrReg, 0);
-+}
-+
-+void MacroAssembler::divFloat32(FloatRegister src, FloatRegister dest) {
-+  as_fdivs(dest, dest, src);
-+}
-+
-+void MacroAssembler::divDouble(FloatRegister src, FloatRegister dest) {
-+  as_fdiv(dest, dest, src);
-+}
-+
-+void MacroAssembler::quotient32(Register lhs, Register rhs, Register dest,
-+                                bool isUnsigned) {
-+  if (isUnsigned) {
-+    as_divwu(dest, lhs, rhs);
-+  } else {
-+    as_divw(dest, lhs, rhs);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::quotient64(Register lhs, Register rhs, Register dest,
-+                                bool isUnsigned) {
-+  if (isUnsigned) {
-+    as_divdu(dest, lhs, rhs);
-+  } else {
-+    as_divd(dest, lhs, rhs);
-+  }
-+}
-+
-+void MacroAssembler::remainder32(Register lhs, Register rhs, Register dest,
-+                                 bool isUnsigned) {
-+  if (HasPOWER9()) {
-+    if (isUnsigned) {
-+      as_moduw(dest, lhs, rhs);
-+    } else {
-+      as_modsw(dest, lhs, rhs);
-+    }
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    if (isUnsigned) {
-+      as_divwu(scratch, lhs, rhs);
-+      as_mullw(scratch, scratch, rhs);
-+    } else {
-+      as_divw(scratch, lhs, rhs);
-+      as_mullw(scratch, scratch, rhs);
-+    }
-+    as_subf(dest, scratch, lhs);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::remainder64(Register lhs, Register rhs, Register dest,
-+                                 bool isUnsigned) {
-+  if (HasPOWER9()) {
-+    if (isUnsigned) {
-+      as_modud(dest, lhs, rhs);
-+    } else {
-+      as_modsd(dest, lhs, rhs);
-+    }
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    if (isUnsigned) {
-+      as_divdu(scratch, lhs, rhs);
-+      as_mulld(scratch, scratch, rhs);
-+    } else {
-+      as_divd(scratch, lhs, rhs);
-+      as_mulld(scratch, scratch, rhs);
-+    }
-+    as_subf(dest, scratch, lhs);
-+  }
-+}
-+
-+void MacroAssembler::neg64(Register64 reg) { as_neg(reg.reg, reg.reg); }
-+
-+void MacroAssembler::negPtr(Register reg) { as_neg(reg, reg); }
-+
-+void MacroAssembler::neg32(Register reg) {
-+  as_neg(reg, reg);
-+  as_extsw(reg, reg);
-+}
-+
-+void MacroAssembler::negateDouble(FloatRegister reg) { as_fneg(reg, reg); }
-+
-+void MacroAssembler::negateFloat(FloatRegister reg) { as_fneg(reg, reg); }
-+
-+void MacroAssembler::abs32(Register src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  as_srawi(scratch, src, 31);
-+  as_xor_(dest, src, scratch);
-+  as_subf(dest, scratch, dest);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::absFloat32(FloatRegister src, FloatRegister dest) {
-+  as_fabs(dest, src);
-+}
-+
-+void MacroAssembler::absDouble(FloatRegister src, FloatRegister dest) {
-+  as_fabs(dest, src);
-+}
-+
-+void MacroAssembler::sqrtFloat32(FloatRegister src, FloatRegister dest) {
-+  as_fsqrts(dest, src);
-+}
-+
-+void MacroAssembler::sqrtDouble(FloatRegister src, FloatRegister dest) {
-+  as_fsqrt(dest, src);
-+}
-+
-+void MacroAssembler::min32(Register lhs, Register rhs, Register dest) {
-+  as_cmpw(lhs, rhs);
-+  // isel rt, ra, rb, cond: rt = (CR[cond] set) ? ra : rb
-+  // LessThan set if lhs < rhs (signed), so pick lhs; else rhs = min.
-+  as_isel(dest, lhs, rhs, LessThan, cr0);
-+}
-+
-+void MacroAssembler::min32(Register lhs, Imm32 rhs, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  move32(rhs, scratch);
-+  min32(lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::max32(Register lhs, Register rhs, Register dest) {
-+  as_cmpw(lhs, rhs);
-+  // GT set if lhs > rhs (signed), so pick lhs; else rhs = max.
-+  as_isel(dest, lhs, rhs, GreaterThan, cr0);
-+}
-+
-+void MacroAssembler::max32(Register lhs, Imm32 rhs, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  move32(rhs, scratch);
-+  max32(lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::minPtr(Register lhs, Register rhs, Register dest) {
-+  as_cmpd(lhs, rhs);
-+  as_isel(dest, lhs, rhs, LessThan, cr0);
-+}
-+
-+void MacroAssembler::minPtr(Register lhs, ImmWord rhs, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(rhs, scratch);
-+  minPtr(lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::maxPtr(Register lhs, Register rhs, Register dest) {
-+  as_cmpd(lhs, rhs);
-+  as_isel(dest, lhs, rhs, GreaterThan, cr0);
-+}
-+
-+void MacroAssembler::maxPtr(Register lhs, ImmWord rhs, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(rhs, scratch);
-+  maxPtr(lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::minFloat32(FloatRegister other, FloatRegister srcDest,
-+                                bool handleNaN) {
-+  if (HasPOWER9()) {
-+    // xsminjdp matches ECMA-262 Math.min semantics for ±0 and NaN.
-+    // Float32 values are stored as doubles in PPC FPRs; the J-form
-+    // result is bit-exact for values representable in float32 (which
-+    // includes every NaN/±0/±Inf corner case JS observes). 1 insn.
-+    as_xsminjdp(srcDest, srcDest, other);
-+    return;
-+  }
-+  Label done, nan, equal;
-+  as_fcmpu(srcDest, other);
-+  if (handleNaN) {
-+    ma_b(Assembler::DoubleUnordered, &nan);
-+  }
-+  // Handle +0 vs -0.
-+  ma_b(Assembler::DoubleEqual, &equal);
-+  ma_b(Assembler::DoubleLessThan, &done);
-+  as_fmr(srcDest, other);
-+  jump(&done);
-+
-+  bind(&equal);
-+  // Both operands are equal. Check if they're zero.
-+  loadConstantFloat32(0.0f, ScratchFloat32Reg);
-+  as_fcmpu(srcDest, ScratchFloat32Reg);
-+  // If not zero, they're identical; keep srcDest.
-+  ma_b(Assembler::DoubleNotEqual, &done);
-+  // Both are some combination of +0/-0. For min, result should be -0
-+  // if either is -0: -((-srcDest) - other) gives -0 when either is -0.
-+  as_fneg(ScratchFloat32Reg, srcDest);
-+  as_fsubs(ScratchFloat32Reg, ScratchFloat32Reg, other);
-+  as_fneg(srcDest, ScratchFloat32Reg);
-+  jump(&done);
-+
-+  if (handleNaN) {
-+    bind(&nan);
-+    as_fadds(srcDest, srcDest, other);
-+  }
-+  bind(&done);
-+}
-+
-+void MacroAssembler::minDouble(FloatRegister other, FloatRegister srcDest,
-+                               bool handleNaN) {
-+  if (HasPOWER9()) {
-+    // xsminjdp matches ECMA-262 Math.min semantics exactly (covers
-+    // 19 corner cases including ±0 and NaN). 1 insn vs ~12 for the
-+    // fcmpu/branch fallback. POWER8 fallback follows.
-+    as_xsminjdp(srcDest, srcDest, other);
-+    return;
-+  }
-+  Label done, nan, equal;
-+  as_fcmpu(srcDest, other);
-+  if (handleNaN) {
-+    ma_b(Assembler::DoubleUnordered, &nan);
-+  }
-+  // Handle +0 vs -0.
-+  ma_b(Assembler::DoubleEqual, &equal);
-+  ma_b(Assembler::DoubleLessThan, &done);
-+  as_fmr(srcDest, other);
-+  jump(&done);
-+
-+  bind(&equal);
-+  loadConstantDouble(0.0, ScratchDoubleReg);
-+  as_fcmpu(srcDest, ScratchDoubleReg);
-+  ma_b(Assembler::DoubleNotEqual, &done);
-+  // -((-srcDest) - other) gives -0 when either is -0.
-+  as_fneg(ScratchDoubleReg, srcDest);
-+  as_fsub(ScratchDoubleReg, ScratchDoubleReg, other);
-+  as_fneg(srcDest, ScratchDoubleReg);
-+  jump(&done);
-+
-+  if (handleNaN) {
-+    bind(&nan);
-+    as_fadd(srcDest, srcDest, other);
-+  }
-+  bind(&done);
-+}
-+
-+void MacroAssembler::maxFloat32(FloatRegister other, FloatRegister srcDest,
-+                                bool handleNaN) {
-+  if (HasPOWER9()) {
-+    // See minFloat32 above for the float32 ↔ J-form bit-exactness note.
-+    as_xsmaxjdp(srcDest, srcDest, other);
-+    return;
-+  }
-+  Label done, nan, equal;
-+  as_fcmpu(srcDest, other);
-+  if (handleNaN) {
-+    ma_b(Assembler::DoubleUnordered, &nan);
-+  }
-+  // Handle +0 vs -0.
-+  ma_b(Assembler::DoubleEqual, &equal);
-+  ma_b(Assembler::DoubleGreaterThan, &done);
-+  as_fmr(srcDest, other);
-+  jump(&done);
-+
-+  bind(&equal);
-+  loadConstantFloat32(0.0f, ScratchFloat32Reg);
-+  as_fcmpu(srcDest, ScratchFloat32Reg);
-+  ma_b(Assembler::DoubleNotEqual, &done);
-+  // -0 + -0 = -0 and -0 + 0 = +0.
-+  as_fadds(srcDest, srcDest, other);
-+  jump(&done);
-+
-+  if (handleNaN) {
-+    bind(&nan);
-+    as_fadds(srcDest, srcDest, other);
-+  }
-+  bind(&done);
-+}
-+
-+void MacroAssembler::maxDouble(FloatRegister other, FloatRegister srcDest,
-+                               bool handleNaN) {
-+  if (HasPOWER9()) {
-+    // See minDouble above for the J-form semantics note.
-+    as_xsmaxjdp(srcDest, srcDest, other);
-+    return;
-+  }
-+  Label done, nan, equal;
-+  as_fcmpu(srcDest, other);
-+  if (handleNaN) {
-+    ma_b(Assembler::DoubleUnordered, &nan);
-+  }
-+  // Handle +0 vs -0.
-+  ma_b(Assembler::DoubleEqual, &equal);
-+  ma_b(Assembler::DoubleGreaterThan, &done);
-+  as_fmr(srcDest, other);
-+  jump(&done);
-+
-+  bind(&equal);
-+  loadConstantDouble(0.0, ScratchDoubleReg);
-+  as_fcmpu(srcDest, ScratchDoubleReg);
-+  ma_b(Assembler::DoubleNotEqual, &done);
-+  // -0 + -0 = -0 and -0 + 0 = +0.
-+  as_fadd(srcDest, srcDest, other);
-+  jump(&done);
-+
-+  if (handleNaN) {
-+    bind(&nan);
-+    as_fadd(srcDest, srcDest, other);
-+  }
-+  bind(&done);
-+}
-+
-+// ===============================================================
-+// Shift functions
-+
-+void MacroAssembler::lshift32(Register src, Register dest) {
-+  UseScratchRegisterScope temps(*this);
-+  Register masked = temps.Acquire();
-+  as_rlwinm(masked, src, 0, 27, 31);
-+  as_slw(dest, dest, masked);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::lshift32(Imm32 imm, Register dest) {
-+  lshift32(imm, dest, dest);
-+}
-+
-+void MacroAssembler::lshift32(Imm32 imm, Register src, Register dest) {
-+  x_slwi(dest, src, imm.value & 0x1f);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::flexibleLshift32(Register src, Register dest) {
-+  lshift32(src, dest);
-+}
-+
-+void MacroAssembler::lshift64(Register shift, Register64 dest) {
-+  // PPC64 sld uses a 7-bit shift field; shifts >= 64 produce 0.
-+  // Wasm i64.shl requires shift count modulo 64, so mask to 6 bits.
-+  UseScratchRegisterScope temps(asMasm());
-+  Register masked = temps.Acquire();
-+  as_rldicl(masked, shift, 0, 58);  // clrldi: keep low 6 bits
-+  as_sld(dest.reg, dest.reg, masked);
-+}
-+
-+void MacroAssembler::lshift64(Imm32 imm, Register64 dest) {
-+  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
-+  x_sldi(dest.reg, dest.reg, imm.value);
-+}
-+
-+void MacroAssembler::lshiftPtr(Register shift, Register dest) {
-+  as_sld(dest, dest, shift);
-+}
-+
-+void MacroAssembler::lshiftPtr(Imm32 imm, Register dest) {
-+  lshiftPtr(imm, dest, dest);
-+}
-+
-+void MacroAssembler::lshiftPtr(Imm32 imm, Register src, Register dest) {
-+  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
-+  x_sldi(dest, src, imm.value);
-+}
-+
-+void MacroAssembler::flexibleLshiftPtr(Register shift, Register srcDest) {
-+  lshiftPtr(shift, srcDest);
-+}
-+
-+void MacroAssembler::rshift32(Register src, Register dest) {
-+  UseScratchRegisterScope temps(*this);
-+  Register masked = temps.Acquire();
-+  as_rlwinm(masked, src, 0, 27, 31);
-+  as_srw(dest, dest, masked);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::rshift32(Imm32 imm, Register dest) {
-+  rshift32(imm, dest, dest);
-+}
-+
-+void MacroAssembler::rshift32(Imm32 imm, Register src, Register dest) {
-+  x_srwi(dest, src, imm.value & 0x1f);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::flexibleRshift32(Register src, Register dest) {
-+  rshift32(src, dest);
-+}
-+
-+void MacroAssembler::rshift32Arithmetic(Register src, Register dest) {
-+  UseScratchRegisterScope temps(*this);
-+  Register masked = temps.Acquire();
-+  as_rlwinm(masked, src, 0, 27, 31);
-+  as_sraw(dest, dest, masked);
-+}
-+
-+void MacroAssembler::rshift32Arithmetic(Imm32 imm, Register dest) {
-+  rshift32Arithmetic(imm, dest, dest);
-+}
-+
-+void MacroAssembler::rshift32Arithmetic(Imm32 imm, Register src,
-+                                        Register dest) {
-+  as_srawi(dest, src, imm.value & 0x1f);
-+}
-+
-+void MacroAssembler::flexibleRshift32Arithmetic(Register src, Register dest) {
-+  rshift32Arithmetic(src, dest);
-+}
-+
-+void MacroAssembler::rshift64(Register shift, Register64 dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register masked = temps.Acquire();
-+  as_rldicl(masked, shift, 0, 58);
-+  as_srd(dest.reg, dest.reg, masked);
-+}
-+
-+void MacroAssembler::rshift64(Imm32 imm, Register64 dest) {
-+  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
-+  x_srdi(dest.reg, dest.reg, imm.value);
-+}
-+
-+void MacroAssembler::rshift64Arithmetic(Imm32 imm, Register64 dest) {
-+  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
-+  as_sradi(dest.reg, dest.reg, imm.value);
-+}
-+
-+void MacroAssembler::rshift64Arithmetic(Register shift, Register64 dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register masked = temps.Acquire();
-+  as_rldicl(masked, shift, 0, 58);
-+  as_srad(dest.reg, dest.reg, masked);
-+}
-+
-+void MacroAssembler::rshiftPtr(Register shift, Register dest) {
-+  as_srd(dest, dest, shift);
-+}
-+
-+void MacroAssembler::rshiftPtr(Imm32 imm, Register dest) {
-+  rshiftPtr(imm, dest, dest);
-+}
-+
-+void MacroAssembler::rshiftPtr(Imm32 imm, Register src, Register dest) {
-+  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
-+  x_srdi(dest, src, imm.value);
-+}
-+
-+void MacroAssembler::flexibleRshiftPtr(Register shift, Register srcDest) {
-+  rshiftPtr(shift, srcDest);
-+}
-+
-+void MacroAssembler::rshiftPtrArithmetic(Imm32 imm, Register dest) {
-+  rshiftPtrArithmetic(imm, dest, dest);
-+}
-+
-+void MacroAssembler::rshiftPtrArithmetic(Imm32 imm, Register src,
-+                                         Register dest) {
-+  MOZ_ASSERT(0 <= imm.value && imm.value < 64);
-+  as_sradi(dest, src, imm.value);
-+}
-+
-+void MacroAssembler::rshiftPtrArithmetic(Register shift, Register dest) {
-+  as_srad(dest, dest, shift);
-+}
-+
-+void MacroAssembler::flexibleRshiftPtrArithmetic(Register shift,
-+                                                 Register srcDest) {
-+  rshiftPtrArithmetic(shift, srcDest);
-+}
-+
-+// ===============================================================
-+// Rotation functions
-+
-+void MacroAssembler::rotateLeft(Register count, Register input, Register dest) {
-+  // PPC rotlw is rlwnm with full mask: rlwnm dest, input, count, 0, 31
-+  as_rlwnm(dest, input, count, 0, 31);
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::rotateLeft(Imm32 count, Register input, Register dest) {
-+  as_rlwinm(dest, input, count.value & 31, 0, 31);
-+  as_extsw(dest, dest);
-+}
-+
-+// 64-bit rotate left by a register count. |temp| is not needed here and is
-+// asserted to be Register::Invalid().
-+void MacroAssembler::rotateLeft64(Register count, Register64 src,
-+                                  Register64 dest, Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  // rldcl dest, src, count, 0 — rotate left doubleword then clear left 0 bits.
-+  as_rldcl(dest.reg, src.reg, count, 0);
-+}
-+
-+// 64-bit rotate left by an immediate count, reduced mod 64 before encoding.
-+// |temp| is not needed here and is asserted to be Register::Invalid().
-+void MacroAssembler::rotateLeft64(Imm32 count, Register64 src, Register64 dest,
-+                                  Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  // rldicl dest, src, count, 0 — rotate left doubleword immediate then clear.
-+  as_rldicl(dest.reg, src.reg, count.value & 63, 0);
-+}
-+
-+// 32-bit rotate right by a register count, expressed as a rotate left by
-+// (32 - count). The result is sign-extended into the full register.
-+void MacroAssembler::rotateRight(Register count, Register input,
-+                                 Register dest) {
-+  if (dest == input) {
-+    // dest doubles as the value being rotated, so the negated count has to
-+    // live in a scratch register.
-+    UseScratchRegisterScope scope(asMasm());
-+    Register negCount = scope.Acquire();
-+    as_subfic(negCount, count, 32);
-+    as_rlwnm(dest, input, negCount, 0, 31);
-+  } else {
-+    // dest is free to hold the negated count. It may alias count: subfic
-+    // reads count before writing dest, and rlwnm then consumes the new dest
-+    // as its rotate amount.
-+    as_subfic(dest, count, 32);
-+    as_rlwnm(dest, input, dest, 0, 31);
-+  }
-+  as_extsw(dest, dest);
-+}
-+
-+// 32-bit rotate right by an immediate: forwarded as a rotate left by
-+// (32 - count) mod 32.
-+void MacroAssembler::rotateRight(Imm32 count, Register input, Register dest) {
-+  const int32_t leftCount = (32 - count.value) & 31;
-+  rotateLeft(Imm32(leftCount), input, dest);
-+}
-+
-+// 64-bit rotate right by a register count, expressed as a rotate left by
-+// (64 - count). |temp| is unused and asserted to be Register::Invalid().
-+void MacroAssembler::rotateRight64(Register count, Register64 src,
-+                                   Register64 dest, Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  if (dest.reg == src.reg) {
-+    // dest also holds the value being rotated; keep the negated count in a
-+    // scratch register.
-+    UseScratchRegisterScope scope(asMasm());
-+    Register negCount = scope.Acquire();
-+    as_subfic(negCount, count, 64);
-+    as_rldcl(dest.reg, src.reg, negCount, 0);
-+    return;
-+  }
-+  // Same shape as the 32-bit rotateRight: with dest != src the negated count
-+  // can be materialized directly in dest, so no scratch is needed.
-+  as_subfic(dest.reg, count, 64);
-+  as_rldcl(dest.reg, src.reg, dest.reg, 0);
-+}
-+
-+// 64-bit rotate right by an immediate: forwarded as a rotate left by
-+// (64 - count) mod 64. |temp| is asserted to be Register::Invalid().
-+void MacroAssembler::rotateRight64(Imm32 count, Register64 src, Register64 dest,
-+                                   Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  const int32_t leftCount = (64 - count.value) & 63;
-+  rotateLeft64(Imm32(leftCount), src, dest, temp);
-+}
-+
-+// ===============================================================
-+// Bit counting functions
-+
-+// Count leading zero bits of the 64-bit |src| into |dest| (one cntlzd).
-+void MacroAssembler::clz64(Register64 src, Register64 dest) {
-+  as_cntlzd(dest.reg, src.reg);
-+}
-+
-+// Count trailing zero bits of the 64-bit |src| into |dest|; yields 64 for a
-+// zero input. Uses cnttzd when HasPOWER9() reports it available, otherwise
-+// an isolate-lowest-bit + cntlzd sequence.
-+void MacroAssembler::ctz64(Register64 src, Register64 dest) {
-+  if (HasPOWER9()) {
-+    as_cnttzd(dest.reg, src.reg);
-+    return;
-+  }
-+
-+  UseScratchRegisterScope scope(*this);
-+  Register work = scope.Acquire();
-+  as_neg(work, src.reg);
-+  // Record-form AND isolates the lowest set bit (x & -x) and sets CR0[eq]
-+  // exactly when src == 0, which saves an explicit compare against zero.
-+  as_and__rc(work, src.reg, work);
-+  as_cntlzd(work, work);           // clz of the isolated bit
-+  as_subfic(dest.reg, work, 63);   // 63 - clz == ctz for a nonzero input
-+  xs_li(work, 64);
-+  // Pick 64 when CR0[eq] says the input was zero, else the computed count.
-+  as_isel(dest.reg, work, dest.reg, Equal);
-+}
-+
-+// Population count of the 64-bit |input| into |output| (one popcntd).
-+// |tmp| is not used by this implementation.
-+void MacroAssembler::popcnt64(Register64 input, Register64 output,
-+                              Register tmp) {
-+  as_popcntd(output.reg, input.reg);
-+}
-+
-+// Count leading zero bits of the 32-bit |src| into |dest| (one cntlzw).
-+// |knownNotZero| is ignored: the same instruction is emitted either way.
-+void MacroAssembler::clz32(Register src, Register dest, bool knownNotZero) {
-+  as_cntlzw(dest, src);
-+}
-+
-+// Count trailing zero bits of the 32-bit |src| into |dest|; yields 32 for a
-+// zero input. When |knownNotZero| is set the zero-input fix-up is omitted on
-+// the pre-POWER9 path.
-+void MacroAssembler::ctz32(Register src, Register dest, bool knownNotZero) {
-+  if (HasPOWER9()) {
-+    as_cnttzw(dest, src);
-+    return;
-+  }
-+
-+  UseScratchRegisterScope scope(*this);
-+  Register work = scope.Acquire();
-+  as_neg(work, src);
-+
-+  if (knownNotZero) {
-+    // No zero case to patch up, so the plain (non-record) AND suffices.
-+    as_and_(work, src, work);
-+    as_cntlzw(work, work);
-+    as_subfic(dest, work, 31);
-+    return;
-+  }
-+
-+  // Record-form AND: work == 0 iff src == 0, so CR0[eq] can drive the isel
-+  // below without a separate compare of src against zero.
-+  as_and__rc(work, src, work);
-+  as_cntlzw(work, work);
-+  as_subfic(dest, work, 31);
-+  xs_li(work, 32);
-+  as_isel(dest, work, dest, Equal);  // 32 when src was zero
-+}
-+
-+// Population count of the low 32 bits of |input| into |output|.
-+// |tmp| is not used by this implementation.
-+void MacroAssembler::popcnt32(Register input, Register output, Register tmp) {
-+  as_popcntw(output, input);
-+  // popcntw gives per-word results; on 64-bit the low word count is in
-+  // bits 32:63, so just mask to 32 bits.
-+  as_rlwinm(output, output, 0, 0, 31);
-+}
-+
-+// ===============================================================
-+// Condition functions
-+
-+// dest = (byte at |lhs|) <cond> rhs, as a boolean. The byte and the immediate
-+// are widened the same way: zero-extended for unsigned conditions,
-+// sign-extended for everything else.
-+void MacroAssembler::cmp8Set(Condition cond, Address lhs, Imm32 rhs,
-+                             Register dest) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  MOZ_ASSERT(loaded != lhs.base);
-+  Imm32 narrowed(0);
-+  if ((cond & Assembler::ConditionUnsigned) != 0) {
-+    load8ZeroExtend(lhs, loaded);
-+    narrowed = Imm32(uint8_t(rhs.value));
-+  } else {
-+    load8SignExtend(lhs, loaded);
-+    narrowed = Imm32(int8_t(rhs.value));
-+  }
-+  ma_cmp_set(dest, ma_cmp(loaded, narrowed, cond, true));
-+}
-+
-+// dest = (halfword at |lhs|) <cond> rhs, as a boolean. The halfword and the
-+// immediate are widened the same way: zero-extended for unsigned conditions,
-+// sign-extended for everything else.
-+void MacroAssembler::cmp16Set(Condition cond, Address lhs, Imm32 rhs,
-+                              Register dest) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  MOZ_ASSERT(loaded != lhs.base);
-+  Imm32 narrowed(0);
-+  if ((cond & Assembler::ConditionUnsigned) != 0) {
-+    load16ZeroExtend(lhs, loaded);
-+    narrowed = Imm32(uint16_t(rhs.value));
-+  } else {
-+    load16SignExtend(lhs, loaded);
-+    narrowed = Imm32(int16_t(rhs.value));
-+  }
-+  ma_cmp_set(dest, ma_cmp(loaded, narrowed, cond, true));
-+}
-+
-+// dest = lhs <cond> rhs, as a boolean, using a 32-bit compare (the trailing
-+// |true| passed to ma_cmp).
-+template <typename T1, typename T2>
-+void MacroAssembler::cmp32Set(Condition cond, T1 lhs, T2 rhs, Register dest) {
-+  ma_cmp_set(dest, ma_cmp(lhs, rhs, cond, true));
-+}
-+
-+// dest = lhs <cond> rhs as a boolean, register/register 64-bit compare.
-+void MacroAssembler::cmp64Set(Condition cond, Register64 lhs, Register64 rhs,
-+                              Register dest) {
-+  ma_cmp_set(dest, ma_cmp(lhs.reg, rhs.reg, cond));
-+}
-+
-+// dest = lhs <cond> rhs as a boolean, register/immediate 64-bit compare.
-+void MacroAssembler::cmp64Set(Condition cond, Register64 lhs, Imm64 rhs,
-+                              Register dest) {
-+  ma_cmp_set(dest, ma_cmp(lhs.reg, ImmWord(uint64_t(rhs.value)), cond));
-+}
-+
-+// dest = [lhs] <cond> rhs as a boolean; the memory operand is loaded into a
-+// scratch register first.
-+void MacroAssembler::cmp64Set(Condition cond, Address lhs, Register64 rhs,
-+                              Register dest) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_cmp_set(dest, ma_cmp(loaded, rhs.reg, cond));
-+}
-+
-+// dest = [lhs] <cond> rhs as a boolean, memory/immediate form.
-+void MacroAssembler::cmp64Set(Condition cond, Address lhs, Imm64 rhs,
-+                              Register dest) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_cmp_set(dest, ma_cmp(loaded, ImmWord(uint64_t(rhs.value)), cond));
-+}
-+
-+// dest = lhs <cond> rhs, as a boolean, using ma_cmp without the 32-bit flag
-+// (pointer-width compare).
-+template <typename T1, typename T2>
-+void MacroAssembler::cmpPtrSet(Condition cond, T1 lhs, T2 rhs, Register dest) {
-+  ma_cmp_set(dest, ma_cmp(lhs, rhs, cond));
-+}
-+
-+// ===============================================================
-+// Branch functions
-+
-+// Branch to |label| when (byte at |lhs|) <cond> rhs holds.
-+void MacroAssembler::branch8(Condition cond, const Address& lhs, Imm32 rhs,
-+                             Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  // As on ARM64/LoongArch64/RISC-V, the immediate is narrowed to the 8-bit
-+  // operand width so that both sides carry matching bit patterns no matter
-+  // how the immediate gets materialized: uint8 for equality and unsigned
-+  // conditions, int8 for signed relational ones.
-+  const bool zeroExtend = cond == Assembler::Equal ||
-+                          cond == Assembler::NotEqual ||
-+                          (cond & Assembler::ConditionUnsigned) != 0;
-+  Imm32 narrowed(0);
-+  if (zeroExtend) {
-+    load8ZeroExtend(lhs, loaded);
-+    narrowed = Imm32(uint8_t(rhs.value));
-+  } else {
-+    load8SignExtend(lhs, loaded);
-+    narrowed = Imm32(int8_t(rhs.value));
-+  }
-+  ma_b(ma_cmp(loaded, narrowed, cond, true), label);
-+}
-+
-+// Branch to |label| when (byte at |lhs|) <cond> rhs holds.
-+//
-+// The loaded byte is widened according to how |cond| interprets it, matching
-+// the Address/Imm32 overload: zero-extended for equality and unsigned
-+// conditions, sign-extended for signed relational conditions. Always
-+// zero-extending would make a signed compare see bytes >= 0x80 as positive.
-+// |rhs| is expected to already hold the byte extended the same way.
-+void MacroAssembler::branch8(Condition cond, const BaseIndex& lhs, Register rhs,
-+                             Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  bool isEqOrNe = (cond == Assembler::Equal) || (cond == Assembler::NotEqual);
-+  bool isUnsigned = (cond & Assembler::ConditionUnsigned) != 0;
-+  if (isEqOrNe || isUnsigned) {
-+    load8ZeroExtend(lhs, scratch);
-+  } else {
-+    load8SignExtend(lhs, scratch);
-+  }
-+  Condition c = ma_cmp(scratch, rhs, cond, true);
-+  ma_b(c, label);
-+}
-+
-+// Branch to |label| when (halfword at |lhs|) <cond> rhs holds.
-+void MacroAssembler::branch16(Condition cond, const Address& lhs, Imm32 rhs,
-+                              Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  // Same scheme as branch8: narrow the immediate to 16 bits so both sides
-+  // carry matching bit patterns. uint16 for equality and unsigned
-+  // conditions, int16 for signed relational ones.
-+  const bool zeroExtend = cond == Assembler::Equal ||
-+                          cond == Assembler::NotEqual ||
-+                          (cond & Assembler::ConditionUnsigned) != 0;
-+  Imm32 narrowed(0);
-+  if (zeroExtend) {
-+    load16ZeroExtend(lhs, loaded);
-+    narrowed = Imm32(uint16_t(rhs.value));
-+  } else {
-+    load16SignExtend(lhs, loaded);
-+    narrowed = Imm32(int16_t(rhs.value));
-+  }
-+  ma_b(ma_cmp(loaded, narrowed, cond, true), label);
-+}
-+
-+// branch32 family: compare two 32-bit operands (ma_cmp with the 32-bit flag
-+// set) and branch to |label| when |cond| holds. Memory operands are loaded
-+// into a scratch register first.
-+
-+// Register / register.
-+void MacroAssembler::branch32(Condition cond, Register lhs, Register rhs,
-+                              Label* label) {
-+  ma_b(ma_cmp(lhs, rhs, cond, true), label);
-+}
-+
-+// Register / immediate.
-+void MacroAssembler::branch32(Condition cond, Register lhs, Imm32 imm,
-+                              Label* label) {
-+  ma_b(ma_cmp(lhs, imm, cond, true), label);
-+}
-+
-+// Memory (base + offset) / register.
-+void MacroAssembler::branch32(Condition cond, const Address& lhs, Register rhs,
-+                              Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  load32(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond, true), label);
-+}
-+
-+// Memory (base + offset) / immediate.
-+void MacroAssembler::branch32(Condition cond, const Address& lhs, Imm32 rhs,
-+                              Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  load32(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond, true), label);
-+}
-+
-+// Absolute address / register. The scratch first holds the address, then
-+// the value loaded from it.
-+void MacroAssembler::branch32(Condition cond, const AbsoluteAddress& lhs,
-+                              Register rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  movePtr(ImmWord((uintptr_t)lhs.addr), loaded);
-+  load32(Address(loaded, 0), loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond, true), label);
-+}
-+
-+// Absolute address / immediate.
-+void MacroAssembler::branch32(Condition cond, const AbsoluteAddress& lhs,
-+                              Imm32 rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  movePtr(ImmWord((uintptr_t)lhs.addr), loaded);
-+  load32(Address(loaded, 0), loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond, true), label);
-+}
-+
-+// Memory (base + index) / immediate.
-+void MacroAssembler::branch32(Condition cond, const BaseIndex& lhs, Imm32 rhs,
-+                              Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  load32(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond, true), label);
-+}
-+
-+// wasm symbolic address / immediate. The scratch first holds the symbol's
-+// address, then the value loaded from it.
-+void MacroAssembler::branch32(Condition cond, wasm::SymbolicAddress addr,
-+                              Imm32 imm, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  movePtr(addr, loaded);
-+  load32(Address(loaded, 0), loaded);
-+  ma_b(ma_cmp(loaded, imm, cond, true), label);
-+}
-+
-+// branch64 family: 64-bit compare, then branch to |success| when |cond|
-+// holds. When |fail| is non-null an unconditional jump to it follows, so
-+// control never falls through.
-+
-+// Register / immediate.
-+void MacroAssembler::branch64(Condition cond, Register64 lhs, Imm64 val,
-+                              Label* success, Label* fail) {
-+  ma_b(ma_cmp(lhs.reg, ImmWord(uint64_t(val.value)), cond), success);
-+  if (fail) {
-+    jump(fail);
-+  }
-+}
-+
-+// Register / register.
-+void MacroAssembler::branch64(Condition cond, Register64 lhs, Register64 rhs,
-+                              Label* success, Label* fail) {
-+  ma_b(ma_cmp(lhs.reg, rhs.reg, cond), success);
-+  if (fail) {
-+    jump(fail);
-+  }
-+}
-+
-+// Memory / immediate; the memory operand goes through a scratch register.
-+void MacroAssembler::branch64(Condition cond, const Address& lhs, Imm64 val,
-+                              Label* success, Label* fail) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, ImmWord(uint64_t(val.value)), cond), success);
-+  if (fail) {
-+    jump(fail);
-+  }
-+}
-+
-+// Memory / register.
-+void MacroAssembler::branch64(Condition cond, const Address& lhs,
-+                              Register64 rhs, Label* success, Label* fail) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs.reg, cond), success);
-+  if (fail) {
-+    jump(fail);
-+  }
-+}
-+
-+// Memory / memory: |rhs| is loaded into the caller-provided |scratch|, then
-+// the memory/register overload does the rest (with no fail label).
-+void MacroAssembler::branch64(Condition cond, const Address& lhs,
-+                              const Address& rhs, Register scratch,
-+                              Label* label) {
-+  loadPtr(rhs, scratch);
-+  branch64(cond, lhs, Register64(scratch), label, nullptr);
-+}
-+
-+// branchPtr family: pointer-width compare (ma_cmp without the 32-bit flag)
-+// followed by a branch to |label| when |cond| holds. Memory operands are
-+// loaded into a scratch register first.
-+
-+// Register / register.
-+void MacroAssembler::branchPtr(Condition cond, Register lhs, Register rhs,
-+                               Label* label) {
-+  ma_b(ma_cmp(lhs, rhs, cond), label);
-+}
-+
-+// Register / 32-bit immediate, widened to 64 bits with sign extension.
-+void MacroAssembler::branchPtr(Condition cond, Register lhs, Imm32 rhs,
-+                               Label* label) {
-+  ma_b(ma_cmp(lhs, ImmWord(int64_t(rhs.value)), cond), label);
-+}
-+
-+// Register / pointer immediate.
-+void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmPtr rhs,
-+                               Label* label) {
-+  ma_b(ma_cmp(lhs, rhs, cond), label);
-+}
-+
-+// Register / GC-pointer immediate.
-+void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmGCPtr rhs,
-+                               Label* label) {
-+  ma_b(ma_cmp(lhs, rhs, cond), label);
-+}
-+
-+// Register / word immediate.
-+void MacroAssembler::branchPtr(Condition cond, Register lhs, ImmWord rhs,
-+                               Label* label) {
-+  ma_b(ma_cmp(lhs, rhs, cond), label);
-+}
-+
-+// Memory (base + offset) / register.
-+void MacroAssembler::branchPtr(Condition cond, const Address& lhs, Register rhs,
-+                               Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Memory (base + offset) / pointer immediate.
-+void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmPtr rhs,
-+                               Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Memory (base + offset) / GC-pointer immediate.
-+void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmGCPtr rhs,
-+                               Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Memory (base + offset) / word immediate.
-+void MacroAssembler::branchPtr(Condition cond, const Address& lhs, ImmWord rhs,
-+                               Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Absolute address / register. The scratch first holds the address, then
-+// the value loaded from it.
-+void MacroAssembler::branchPtr(Condition cond, const AbsoluteAddress& lhs,
-+                               Register rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  movePtr(ImmWord((uintptr_t)lhs.addr), loaded);
-+  loadPtr(Address(loaded, 0), loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Absolute address / word immediate.
-+void MacroAssembler::branchPtr(Condition cond, const AbsoluteAddress& lhs,
-+                               ImmWord rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  movePtr(ImmWord((uintptr_t)lhs.addr), loaded);
-+  loadPtr(Address(loaded, 0), loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// wasm symbolic address / register.
-+void MacroAssembler::branchPtr(Condition cond, wasm::SymbolicAddress lhs,
-+                               Register rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  movePtr(lhs, loaded);
-+  loadPtr(Address(loaded, 0), loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Memory (base + index) / register.
-+void MacroAssembler::branchPtr(Condition cond, const BaseIndex& lhs,
-+                               Register rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Memory (base + index) / word immediate.
-+void MacroAssembler::branchPtr(Condition cond, const BaseIndex& lhs,
-+                               ImmWord rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register loaded = scope.Acquire();
-+  loadPtr(lhs, loaded);
-+  ma_b(ma_cmp(loaded, rhs, cond), label);
-+}
-+
-+// Compare the pointer stored at |lhs| with |rhs| and branch on |cond|.
-+// Here this is a plain forward to branchPtr(Address, Register).
-+void MacroAssembler::branchPrivatePtr(Condition cond, const Address& lhs,
-+                                      Register rhs, Label* label) {
-+  branchPtr(cond, lhs, rhs, label);
-+}
-+
-+// Compare two float32 registers with fcmpu and branch to |label| when the
-+// floating-point condition |cond| holds.
-+void MacroAssembler::branchFloat(DoubleCondition cond, FloatRegister lhs,
-+                                 FloatRegister rhs, Label* label) {
-+  as_fcmpu(lhs, rhs);
-+  ma_b(cond, label);
-+}
-+
-+// Truncate the float32 in |src| toward zero to a 64-bit integer, keep its
-+// low 32 bits in |dest| (i.e. the value modulo 2^32), and jump to |fail|
-+// when the conversion saturated (NaN or out of int64 range).
-+void MacroAssembler::branchTruncateFloat32MaybeModUint32(FloatRegister src,
-+                                                         Register dest,
-+                                                         Label* fail) {
-+  // Convert float32 to int64 (truncating toward zero), fail on NaN/overflow.
-+  as_fctidz(ScratchDoubleReg, src);
-+  as_mfvsrd(dest, ScratchDoubleReg);
-+  // PPC64 fctidz saturates to INT64_MIN on negative overflow/NaN,
-+  // and to INT64_MAX on positive overflow. Check both.
-+  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MIN)), fail);
-+  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MAX)), fail);
-+  // Keep the low 32 bits, sign-extended into the full register. The 32-bit
-+  // helpers in this file (rotateLeft, branchAdd32, branchTest32, ...) produce
-+  // and expect int32 values in sign-extended form; a zero-extended result
-+  // with bit 31 set would, for example, trip branchAdd32's extsw-based
-+  // overflow check spuriously.
-+  as_extsw(dest, dest);
-+}
-+
-+// Convert the float32 in |src| to an int32 in |dest|, branching to |fail|
-+// when convertFloat32ToInt32 rejects the value. The final |false| argument
-+// is forwarded unchanged to that helper.
-+void MacroAssembler::branchTruncateFloat32ToInt32(FloatRegister src,
-+                                                  Register dest, Label* fail) {
-+  convertFloat32ToInt32(src, dest, fail, false);
-+}
-+
-+// Compare two double registers with fcmpu and branch to |label| when the
-+// floating-point condition |cond| holds.
-+void MacroAssembler::branchDouble(DoubleCondition cond, FloatRegister lhs,
-+                                  FloatRegister rhs, Label* label) {
-+  as_fcmpu(lhs, rhs);
-+  ma_b(cond, label);
-+}
-+
-+// Truncate the double in |src| toward zero to a 64-bit integer, keep its
-+// low 32 bits in |dest| (i.e. the value modulo 2^32), and jump to |fail|
-+// when the conversion saturated (NaN or out of int64 range).
-+void MacroAssembler::branchTruncateDoubleMaybeModUint32(FloatRegister src,
-+                                                        Register dest,
-+                                                        Label* fail) {
-+  // Convert double to int64 (truncating toward zero), fail on NaN/overflow.
-+  as_fctidz(ScratchDoubleReg, src);
-+  as_mfvsrd(dest, ScratchDoubleReg);
-+  // PPC64 fctidz saturates to INT64_MIN on negative overflow/NaN,
-+  // and to INT64_MAX on positive overflow. Check both.
-+  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MIN)), fail);
-+  asMasm().branchPtr(Assembler::Equal, dest, ImmWord(int64_t(INT64_MAX)), fail);
-+  // Keep the low 32 bits, sign-extended into the full register. The 32-bit
-+  // helpers in this file (rotateLeft, branchAdd32, branchTest32, ...) produce
-+  // and expect int32 values in sign-extended form; a zero-extended result
-+  // with bit 31 set would, for example, trip branchAdd32's extsw-based
-+  // overflow check spuriously.
-+  as_extsw(dest, dest);
-+}
-+
-+// Convert the double in |src| to an int32 in |dest|, branching to |fail|
-+// when convertDoubleToInt32 rejects the value. The final |false| argument
-+// is forwarded unchanged to that helper.
-+void MacroAssembler::branchTruncateDoubleToInt32(FloatRegister src,
-+                                                 Register dest, Label* fail) {
-+  convertDoubleToInt32(src, dest, fail, false);
-+}
-+
-+// Emits nothing: every int64 fits in a pointer-sized register on this
-+// 64-bit target, so the branch to |label| can never be taken.
-+void MacroAssembler::branchInt64NotInPtrRange(Register64 src, Label* label) {
-+  // No-op on 64-bit.
-+}
-+
-+// Branch to |label| when the unsigned 64-bit |src| does not fit in a signed
-+// pointer-sized integer, i.e. when its top bit is set. Implemented as a
-+// signed compare against zero.
-+void MacroAssembler::branchUInt64NotInPtrRange(Register64 src, Label* label) {
-+  // Branch if src >= 2^63 (sign bit set = out of signed ptr range).
-+  as_cmpdi(src.reg, 0);
-+  ma_b(Assembler::LessThan, label);
-+}
-+
-+// dest += src as a 32-bit add, then branch to |overflow| when |cond| holds
-+// for the result. Supported conditions: Overflow, Zero/NonZero,
-+// Signed/NotSigned, CarrySet/CarryClear; anything else crashes with "NYI".
-+template <typename T>
-+void MacroAssembler::branchAdd32(Condition cond, T src, Register dest,
-+                                 Label* overflow) {
-+  switch (cond) {
-+    case Overflow: {
-+      // Do raw 64-bit add (no sign extension) so we can detect 32-bit overflow.
-+      // Both inputs should already be sign-extended 32-bit values, so the
-+      // 64-bit result is mathematically correct. If extsw(result) != result,
-+      // the 32-bit add overflowed.
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      addPtr(src, dest);
-+      as_extsw(scratch, dest);
-+      as_cmpd(dest, scratch);
-+      // Normalize dest to a sign-extended int32 after the compare has
-+      // captured the overflow state and before the branch is emitted.
-+      as_extsw(dest, dest);
-+      ma_b(NotEqual, overflow);
-+      break;
-+    }
-+    case NonZero:
-+    case Zero:
-+      add32(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == NonZero ? NotEqual : Equal, overflow);
-+      break;
-+    case Signed:
-+    case NotSigned:
-+      add32(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, overflow);
-+      break;
-+    case CarryClear:
-+    case CarrySet: {
-+      // Unsigned 32-bit carry detection: save dest, do 32-bit add,
-+      // then unsigned-compare result with original. If result < original
-+      // (unsigned), a carry occurred.
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      move32(dest, scratch);
-+      add32(src, dest);
-+      as_cmplw(dest, scratch);
-+      ma_b(cond == CarrySet ? LessThan : GreaterThanOrEqual, overflow);
-+      break;
-+    }
-+    default:
-+      MOZ_CRASH("NYI");
-+  }
-+}
-+
-+// dest -= src as a 32-bit subtract, then branch to |overflow| when |cond|
-+// holds for the result. Supported conditions: Overflow, Zero/NonZero,
-+// Signed/NotSigned; anything else crashes with "NYI".
-+template <typename T>
-+void MacroAssembler::branchSub32(Condition cond, T src, Register dest,
-+                                 Label* overflow) {
-+  switch (cond) {
-+    case Overflow: {
-+      // Do raw 64-bit sub (no sign extension) so we can detect 32-bit overflow.
-+      // If extsw(result) != result, the difference does not fit in 32 bits.
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      subPtr(src, dest);
-+      as_extsw(scratch, dest);
-+      as_cmpd(dest, scratch);
-+      // Normalize dest to a sign-extended int32 after the compare has
-+      // captured the overflow state and before the branch is emitted.
-+      as_extsw(dest, dest);
-+      ma_b(NotEqual, overflow);
-+      break;
-+    }
-+    case NonZero:
-+    case Zero:
-+      sub32(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == NonZero ? NotEqual : Equal, overflow);
-+      break;
-+    case Signed:
-+    case NotSigned:
-+      sub32(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, overflow);
-+      break;
-+    default:
-+      MOZ_CRASH("NYI");
-+  }
-+}
-+
-+// dest *= src as a 32-bit multiply, branching to |overflow| when the product
-+// does not fit in an int32. Only the Overflow condition is supported
-+// (asserted).
-+template <typename T>
-+void MacroAssembler::branchMul32(Condition cond, T src, Register dest,
-+                                 Label* overflow) {
-+  MOZ_ASSERT(cond == Overflow);
-+  // Do raw 64-bit multiply (no sign extension) so we can detect 32-bit
-+  // overflow. as_mulld gives full 64-bit low result; if extsw(result) !=
-+  // result, overflow. scratch is dead after the mulld (consumed as RB),
-+  // so the sign-extension round-trip reuses it instead of acquiring a
-+  // second scratch.
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  move32(src, scratch);
-+  as_mulld(dest, dest, scratch);
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  // Normalize dest to a sign-extended int32 before the branch is emitted.
-+  as_extsw(dest, dest);
-+  ma_b(NotEqual, overflow);
-+}
-+
-+// dest >>= src (32-bit logical shift via rshift32), then branch to |label|
-+// depending on whether the result is zero. Only Zero and NonZero are
-+// accepted (asserted).
-+template <typename T>
-+void MacroAssembler::branchRshift32(Condition cond, T src, Register dest,
-+                                    Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero);
-+  rshift32(src, dest);
-+  branch32(cond == Zero ? Equal : NotEqual, dest, Imm32(0), label);
-+}
-+
-+// reg = -reg (32-bit), branching to |label| on overflow. Negation only
-+// overflows for INT32_MIN, which negates to itself, so the check is a
-+// compare of the result against INT32_MIN. Only Overflow is accepted.
-+void MacroAssembler::branchNeg32(Condition cond, Register reg, Label* label) {
-+  MOZ_ASSERT(cond == Overflow);
-+  neg32(reg);
-+  branch32(Equal, reg, Imm32(INT32_MIN), label);
-+}
-+
-+// dest += src at pointer width, then branch to |label| when |cond| holds for
-+// the result. Supported conditions: Overflow, Zero/NonZero, Signed/NotSigned,
-+// CarrySet/CarryClear; anything else crashes with "NYI".
-+template <typename T>
-+void MacroAssembler::branchAddPtr(Condition cond, T src, Register dest,
-+                                  Label* label) {
-+  switch (cond) {
-+    case Overflow: {
-+      // NOTE(review): this path uses SecondScratchReg directly next to a
-+      // scope-acquired scratch; confirm the two can never be the same
-+      // register and that no caller keeps a live value in SecondScratchReg.
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      movePtr(dest, scratch);  // scratch = old_dest
-+      addPtr(src, dest);       // dest = result = old_dest + src
-+      as_xor_(SecondScratchReg, dest,
-+              scratch);  // SecondScratch = result ^ old_dest
-+      as_subf(scratch, scratch,
-+              dest);  // scratch = result - old_dest = src_value
-+      as_xor_(scratch, scratch, dest);  // scratch = src_value ^ result
-+      // (old_dest ^ result) & (src_value ^ result): bit 63 set iff overflow.
-+      // and. record form sets CR0[lt]=(bit 63 set), folding the cmpdi.
-+      as_and__rc(scratch, scratch, SecondScratchReg);
-+      ma_b(LessThan, label);
-+      break;
-+    }
-+    case NonZero:
-+    case Zero:
-+      addPtr(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == NonZero ? NotEqual : Equal, label);
-+      break;
-+    case Signed:
-+    case NotSigned:
-+      addPtr(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, label);
-+      break;
-+    case CarryClear:
-+    case CarrySet: {
-+      // Unsigned 64-bit carry detection: save dest, do 64-bit add,
-+      // then unsigned-compare result with original. If result < original
-+      // (unsigned), a carry occurred.
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      movePtr(dest, scratch);
-+      addPtr(src, dest);
-+      as_cmpld(dest, scratch);
-+      ma_b(cond == CarrySet ? LessThan : GreaterThanOrEqual, label);
-+      break;
-+    }
-+    default:
-+      MOZ_CRASH("NYI");
-+  }
-+}
-+
-+// dest -= src at pointer width, then branch to |label| when |cond| holds for
-+// the result. Supported conditions: Overflow, Zero/NonZero, Signed/NotSigned;
-+// anything else crashes with "NYI".
-+template <typename T>
-+void MacroAssembler::branchSubPtr(Condition cond, T src, Register dest,
-+                                  Label* label) {
-+  switch (cond) {
-+    case Overflow: {
-+      // NOTE(review): this path uses SecondScratchReg directly next to a
-+      // scope-acquired scratch; confirm the two can never be the same
-+      // register and that no caller keeps a live value in SecondScratchReg.
-+      UseScratchRegisterScope temps(asMasm());
-+      Register scratch = temps.Acquire();
-+      movePtr(dest, scratch);  // scratch = old_dest
-+      subPtr(src, dest);       // dest = result = old_dest - src
-+      // Overflow if (old_dest ^ src_value) & (old_dest ^ result) has bit 63
-+      // set.
-+      as_subf(SecondScratchReg, dest,
-+              scratch);  // SecondScratch = old_dest - result = src_value
-+      as_xor_(SecondScratchReg, scratch,
-+              SecondScratchReg);        // old_dest ^ src_value
-+      as_xor_(scratch, scratch, dest);  // old_dest ^ result
-+      // Record-form AND sets CR0 to the signed compare of the result vs 0,
-+      // so a separate cmpdi is unnecessary; LessThan reads CR0.LT.
-+      as_and__rc(scratch, scratch, SecondScratchReg);
-+      ma_b(LessThan, label);
-+      break;
-+    }
-+    case NonZero:
-+    case Zero:
-+      subPtr(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == NonZero ? NotEqual : Equal, label);
-+      break;
-+    case Signed:
-+    case NotSigned:
-+      subPtr(src, dest);
-+      as_cmpdi(dest, 0);
-+      ma_b(cond == Signed ? LessThan : GreaterThanOrEqual, label);
-+      break;
-+    default:
-+      MOZ_CRASH("NYI");
-+  }
-+}
-+
-+// dest *= src at pointer width, branching to |label| on signed overflow.
-+// Uses the overflow-recording multiply (mulldo) and then branches on the
-+// Overflow condition. Only Overflow is accepted (asserted).
-+void MacroAssembler::branchMulPtr(Condition cond, Register src, Register dest,
-+                                  Label* label) {
-+  MOZ_ASSERT(cond == Assembler::Overflow);
-+  as_mulldo(dest, dest, src);
-+  ma_b(Overflow, label);
-+}
-+
-+// reg = -reg at pointer width, branching to |label| on overflow. Negation
-+// only overflows for INTPTR_MIN, which negates to itself, so the check is a
-+// compare of the result against INTPTR_MIN. Only Overflow is accepted.
-+void MacroAssembler::branchNegPtr(Condition cond, Register reg, Label* label) {
-+  MOZ_ASSERT(cond == Overflow);
-+  negPtr(reg);
-+  branchPtr(Assembler::Equal, reg, ImmWord(intptr_t(INTPTR_MIN)), label);
-+}
-+
-+// lhs -= rhs, then branch to |label| when the updated |lhs| compared against
-+// zero satisfies |cond|.
-+void MacroAssembler::decBranchPtr(Condition cond, Register lhs, Imm32 rhs,
-+                                  Label* label) {
-+  subPtr(rhs, lhs);
-+  branchPtr(cond, lhs, Imm32(0), label);
-+}
-+
-+// branchTest32 family: compute (lhs & rhs) as a 32-bit value and branch to
-+// |label| on Zero / NonZero / Signed / NotSigned. The condition handed to
-+// ma_b is |cond| with the ConditionZero bit cleared.
-+
-+// Register / register.
-+void MacroAssembler::branchTest32(Condition cond, Register lhs, Register rhs,
-+                                  Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  if (lhs == rhs) {
-+    // x & x == x: only the record-form sign extension is needed.
-+    as_extsw_rc(masked, lhs);
-+  } else {
-+    as_and_(masked, lhs, rhs);
-+    // Record-form extsw sets CR0 from the sign-extended int32, so no
-+    // separate compare against zero is required.
-+    as_extsw_rc(masked, masked);
-+  }
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// Register / immediate.
-+void MacroAssembler::branchTest32(Condition cond, Register lhs, Imm32 rhs,
-+                                  Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  if (!is_uintN(rhs.value, 16)) {
-+    move32(rhs, masked);
-+    as_and_(masked, lhs, masked);
-+    // Record-form extsw sets CR0 from the sign-extended int32, so no
-+    // separate compare against zero is required.
-+    as_extsw_rc(masked, masked);
-+  } else {
-+    // andi. sets CR0 from the masked value. With a 16-bit unsigned mask the
-+    // result can never be negative, so CR0[lt] is always clear: the outcome
-+    // of Signed/NotSigned is fixed, and CR0[eq] is right for Zero/NonZero.
-+    as_andi_rc(masked, lhs, rhs.value);
-+  }
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// Memory (base + offset) / immediate.
-+void MacroAssembler::branchTest32(Condition cond, const Address& lhs, Imm32 rhs,
-+                                  Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  load32(lhs, masked);
-+  // Going through and32 picks up its rlwinm fast path for immediates that do
-+  // not fit in 16 bits but form a contiguous run of 1-bits (tag masks,
-+  // header bit-fields); and32 also emits the trailing extsw.
-+  and32(rhs, masked);
-+  as_cmpdi(masked, 0);
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// Absolute address / immediate. The scratch first holds the address, then
-+// the loaded and masked value.
-+void MacroAssembler::branchTest32(Condition cond, const AbsoluteAddress& lhs,
-+                                  Imm32 rhs, Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  movePtr(ImmWord((uintptr_t)lhs.addr), masked);
-+  load32(Address(masked, 0), masked);
-+  and32(rhs, masked);
-+  as_cmpdi(masked, 0);
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// branchTestPtr family: compute (lhs & rhs) at pointer width and branch to
-+// |label| on Zero / NonZero / Signed / NotSigned. The condition handed to
-+// ma_b is |cond| with the ConditionZero bit cleared.
-+
-+// Register / register.
-+void MacroAssembler::branchTestPtr(Condition cond, Register lhs, Register rhs,
-+                                   Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  if (lhs != rhs) {
-+    UseScratchRegisterScope scope(asMasm());
-+    Register masked = scope.Acquire();
-+    // The record-form AND sets CR0 itself; no compare needs to follow.
-+    as_and__rc(masked, lhs, rhs);
-+  } else {
-+    // x & x == x, so testing the register against zero is enough.
-+    as_cmpdi(lhs, 0);
-+  }
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// Register / 32-bit immediate.
-+void MacroAssembler::branchTestPtr(Condition cond, Register lhs, Imm32 rhs,
-+                                   Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  if (!is_uintN(rhs.value, 16)) {
-+    move32(rhs, masked);
-+    // The record-form AND sets CR0 itself; no compare needs to follow.
-+    as_and__rc(masked, lhs, masked);
-+  } else {
-+    // The mask fits the 16-bit unsigned immediate of andi.
-+    as_andi_rc(masked, lhs, rhs.value);
-+  }
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// Register / word immediate.
-+void MacroAssembler::branchTestPtr(Condition cond, Register lhs, ImmWord rhs,
-+                                   Label* label) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  movePtr(rhs, masked);
-+  as_and__rc(masked, lhs, masked);
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+// Memory (base + offset) / 32-bit immediate.
-+void MacroAssembler::branchTestPtr(Condition cond, const Address& lhs,
-+                                   Imm32 rhs, Label* label) {
-+  UseScratchRegisterScope scope(asMasm());
-+  Register masked = scope.Acquire();
-+  loadPtr(lhs, masked);
-+  // Going through andPtr picks up its rlwinm fast path for immediates that
-+  // do not fit in 16 bits but form a contiguous run of 1-bits.
-+  andPtr(rhs, masked);
-+  as_cmpdi(masked, 0);
-+  Condition flag = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_b(flag, label);
-+}
-+
-+void MacroAssembler::branchTest64(Condition cond, Register64 lhs,
-+                                  Register64 rhs, Register temp, Label* success,
-+                                  Label* fail) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  as_and__rc(scratch, lhs.reg, rhs.reg);
-+  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  if (fail) {
-+    ma_b(base, success);
-+    jump(fail);
-+  } else {
-+    ma_b(base, success);
-+  }
-+}
-+
-+void MacroAssembler::branchTest64(Condition cond, Register64 lhs, Imm64 rhs,
-+                                  Label* success, Label* fail) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero || cond == Signed ||
-+             cond == NotSigned);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  movePtr(ImmWord(uint64_t(rhs.value)), scratch);
-+  as_and__rc(scratch, lhs.reg, scratch);
-+  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  if (fail) {
-+    ma_b(base, success);
-+    jump(fail);
-+  } else {
-+    ma_b(base, success);
-+  }
-+}
-+
-+// ===============================================================
-+// Value-type branch functions
-+
-+void MacroAssembler::branchTestUndefined(Condition cond, Register tag,
-+                                         Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_UNDEFINED), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestUndefined(Condition cond,
-+                                         const ValueOperand& value,
-+                                         Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_UNDEFINED), label);
-+}
-+
-+void MacroAssembler::branchTestUndefined(Condition cond, const Address& address,
-+                                         Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_UNDEFINED), label);
-+}
-+
-+void MacroAssembler::branchTestUndefined(Condition cond,
-+                                         const BaseIndex& address,
-+                                         Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_UNDEFINED), label);
-+}
-+
-+void MacroAssembler::branchTestInt32(Condition cond, Register tag,
-+                                     Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_INT32), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestInt32(Condition cond, const ValueOperand& value,
-+                                     Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_INT32), label);
-+}
-+
-+void MacroAssembler::branchTestInt32(Condition cond, const Address& address,
-+                                     Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_INT32), label);
-+}
-+
-+void MacroAssembler::branchTestInt32(Condition cond, const BaseIndex& address,
-+                                     Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_INT32), label);
-+}
-+
-+void MacroAssembler::branchTestInt32Truthy(bool b, const ValueOperand& value,
-+                                           Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  unboxInt32(value, scratch);
-+  as_cmpwi(scratch, 0);
-+  ma_b(b ? NotEqual : Equal, label);
-+}
-+
-+void MacroAssembler::branchTestDouble(Condition cond, Register tag,
-+                                      Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition actual = (cond == Equal) ? BelowOrEqual : Above;
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_MAX_DOUBLE), actual);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestDouble(Condition cond, const ValueOperand& value,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestDouble(cond, scratch, label);
-+}
-+
-+void MacroAssembler::branchTestDouble(Condition cond, const Address& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestDouble(cond, tag, label);
-+}
-+
-+void MacroAssembler::branchTestDouble(Condition cond, const BaseIndex& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestDouble(cond, tag, label);
-+}
-+
-+void MacroAssembler::branchTestDoubleTruthy(bool b, FloatRegister value,
-+                                            Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  xs_li(scratch, 0);
-+  as_mtvsrd(ScratchDoubleReg, scratch);
-+  as_fcmpu(value, ScratchDoubleReg);
-+  DoubleCondition cond = b ? DoubleNotEqual : DoubleEqualOrUnordered;
-+  ma_b(cond, label);
-+}
-+
-+void MacroAssembler::branchTestNumber(Condition cond, Register tag,
-+                                      Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition actual = (cond == Equal) ? BelowOrEqual : Above;
-+  Condition c = ma_cmp(tag, Imm32(JS::detail::ValueUpperInclNumberTag), actual);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestNumber(Condition cond, const ValueOperand& value,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestNumber(cond, scratch, label);
-+}
-+
-+void MacroAssembler::branchTestBoolean(Condition cond, Register tag,
-+                                       Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BOOLEAN), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestBoolean(Condition cond,
-+                                       const ValueOperand& value,
-+                                       Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_BOOLEAN), label);
-+}
-+
-+void MacroAssembler::branchTestBoolean(Condition cond, const Address& address,
-+                                       Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BOOLEAN), label);
-+}
-+
-+void MacroAssembler::branchTestBoolean(Condition cond, const BaseIndex& address,
-+                                       Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BOOLEAN), label);
-+}
-+
-+void MacroAssembler::branchTestBooleanTruthy(bool b, const ValueOperand& value,
-+                                             Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  unboxBoolean(value, scratch);
-+  as_cmpwi(scratch, 0);
-+  ma_b(b ? NotEqual : Equal, label);
-+}
-+
-+void MacroAssembler::branchTestString(Condition cond, Register tag,
-+                                      Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_STRING), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestString(Condition cond, const ValueOperand& value,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_STRING), label);
-+}
-+
-+void MacroAssembler::branchTestString(Condition cond, const Address& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_STRING), label);
-+}
-+
-+void MacroAssembler::branchTestString(Condition cond, const BaseIndex& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_STRING), label);
-+}
-+
-+void MacroAssembler::branchTestStringTruthy(bool b, const ValueOperand& value,
-+                                            Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  unboxString(value, scratch);
-+  load32(Address(scratch, JSString::offsetOfLength()), scratch);
-+  as_cmpwi(scratch, 0);
-+  ma_b(b ? NotEqual : Equal, label);
-+}
-+
-+void MacroAssembler::branchTestSymbol(Condition cond, Register tag,
-+                                      Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_SYMBOL), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestSymbol(Condition cond, const ValueOperand& value,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_SYMBOL), label);
-+}
-+
-+void MacroAssembler::branchTestSymbol(Condition cond, const BaseIndex& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_SYMBOL), label);
-+}
-+
-+void MacroAssembler::branchTestSymbol(Condition cond, const Address& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_SYMBOL), label);
-+}
-+
-+void MacroAssembler::branchTestBigInt(Condition cond, Register tag,
-+                                      Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BIGINT), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestBigInt(Condition cond, const ValueOperand& value,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_BIGINT), label);
-+}
-+
-+void MacroAssembler::branchTestBigInt(Condition cond, const Address& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BIGINT), label);
-+}
-+
-+void MacroAssembler::branchTestBigInt(Condition cond, const BaseIndex& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_BIGINT), label);
-+}
-+
-+void MacroAssembler::branchTestBigIntTruthy(bool b, const ValueOperand& value,
-+                                            Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  unboxBigInt(value, scratch);
-+  load32(Address(scratch, BigInt::offsetOfDigitLength()), scratch);
-+  as_cmpwi(scratch, 0);
-+  ma_b(b ? NotEqual : Equal, label);
-+}
-+
-+void MacroAssembler::branchTestNull(Condition cond, Register tag,
-+                                    Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_NULL), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestNull(Condition cond, const ValueOperand& value,
-+                                    Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_NULL), label);
-+}
-+
-+void MacroAssembler::branchTestNull(Condition cond, const Address& address,
-+                                    Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_NULL), label);
-+}
-+
-+void MacroAssembler::branchTestNull(Condition cond, const BaseIndex& address,
-+                                    Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_NULL), label);
-+}
-+
-+void MacroAssembler::branchTestObject(Condition cond, Register tag,
-+                                      Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_OBJECT), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestObject(Condition cond, const ValueOperand& value,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_OBJECT), label);
-+}
-+
-+void MacroAssembler::branchTestObject(Condition cond, const Address& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_OBJECT), label);
-+}
-+
-+void MacroAssembler::branchTestObject(Condition cond, const BaseIndex& address,
-+                                      Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_OBJECT), label);
-+}
-+
-+void MacroAssembler::branchTestPrimitive(Condition cond,
-+                                         const ValueOperand& value,
-+                                         Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestPrimitive(cond, scratch, label);
-+}
-+
-+void MacroAssembler::branchTestGCThing(Condition cond, const Address& address,
-+                                       Label* label) {
-+  branchTestGCThingImpl(cond, address, label);
-+}
-+
-+void MacroAssembler::branchTestGCThing(Condition cond, const BaseIndex& address,
-+                                       Label* label) {
-+  branchTestGCThingImpl(cond, address, label);
-+}
-+
-+void MacroAssembler::branchTestGCThing(Condition cond,
-+                                       const ValueOperand& address,
-+                                       Label* label) {
-+  branchTestGCThingImpl(cond, address, label);
-+}
-+
-+template <typename T>
-+void MacroAssembler::branchTestGCThingImpl(Condition cond, const T& address,
-+                                           Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  Condition actual = (cond == Equal) ? AboveOrEqual : Below;
-+  Condition c =
-+      ma_cmp(tag, Imm32(JS::detail::ValueLowerInclGCThingTag), actual);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestPrimitive(Condition cond, Register tag,
-+                                         Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition actual = (cond == Equal) ? Below : AboveOrEqual;
-+  Condition c =
-+      ma_cmp(tag, Imm32(JS::detail::ValueUpperExclPrimitiveTag), actual);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestMagic(Condition cond, Register tag,
-+                                     Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_MAGIC), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestMagic(Condition cond, const Address& address,
-+                                     Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_MAGIC), label);
-+}
-+
-+void MacroAssembler::branchTestMagic(Condition cond, const BaseIndex& address,
-+                                     Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(address, scratch);
-+  branchTestTag(cond, tag, ImmTag(JSVAL_TAG_MAGIC), label);
-+}
-+
-+void MacroAssembler::branchTestMagic(Condition cond, const ValueOperand& value,
-+                                     Label* label) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  splitTag(value, scratch);
-+  branchTestTag(cond, scratch, ImmTag(JSVAL_TAG_MAGIC), label);
-+}
-+
-+void MacroAssembler::branchTestMagic(Condition cond, const Address& valaddr,
-+                                     JSWhyMagic why, Label* label) {
-+  uint64_t magic = MagicValue(why).asRawBits();
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(valaddr, scratch);
-+  Condition c = ma_cmp(scratch, ImmWord(magic), cond);
-+  ma_b(c, label);
-+}
-+
-+void MacroAssembler::branchTestMagic(Condition cond, const BaseIndex& valaddr,
-+                                     JSWhyMagic why, Label* label) {
-+  uint64_t magic = MagicValue(why).asRawBits();
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(valaddr, scratch);
-+  Condition c = ma_cmp(scratch, ImmWord(magic), cond);
-+  ma_b(c, label);
-+}
-+
-+template <typename T>
-+void MacroAssembler::branchTestValue(Condition cond, const T& lhs,
-+                                     const ValueOperand& rhs, Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(lhs, scratch);
-+  Condition c = ma_cmp(scratch, rhs.valueReg(), cond);
-+  ma_b(c, label);
-+}
-+
-+// ===============================================================
-+// Test-set functions
-+
-+template <typename T>
-+void MacroAssembler::testNumberSet(Condition cond, const T& src,
-+                                   Register dest) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(src, scratch);
-+  Condition actual = (cond == Equal) ? BelowOrEqual : Above;
-+  Condition c = ma_cmp(tag, Imm32(JS::detail::ValueUpperInclNumberTag), actual);
-+  ma_cmp_set(dest, c);
-+}
-+
-+template <typename T>
-+void MacroAssembler::testBooleanSet(Condition cond, const T& src,
-+                                    Register dest) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(src, scratch);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BOOLEAN), cond);
-+  ma_cmp_set(dest, c);
-+}
-+
-+template <typename T>
-+void MacroAssembler::testStringSet(Condition cond, const T& src,
-+                                   Register dest) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(src, scratch);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_STRING), cond);
-+  ma_cmp_set(dest, c);
-+}
-+
-+template <typename T>
-+void MacroAssembler::testSymbolSet(Condition cond, const T& src,
-+                                   Register dest) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(src, scratch);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_SYMBOL), cond);
-+  ma_cmp_set(dest, c);
-+}
-+
-+template <typename T>
-+void MacroAssembler::testBigIntSet(Condition cond, const T& src,
-+                                   Register dest) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  Register tag = extractTag(src, scratch);
-+  Condition c = ma_cmp(tag, ImmTag(JSVAL_TAG_BIGINT), cond);
-+  ma_cmp_set(dest, c);
-+}
-+
-+// ===============================================================
-+// Computed address / conditional move / conditional load
-+
-+void MacroAssembler::branchToComputedAddress(const BaseIndex& addr) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(addr, scratch);
-+  branch(scratch);
-+}
-+
-+void MacroAssembler::cmp32Move32(Condition cond, Register lhs, Imm32 rhs,
-+                                 Register src, Register dest) {
-+  Condition c = ma_cmp(lhs, rhs, cond, true);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmp32Move32(Condition cond, Register lhs, Register rhs,
-+                                 Register src, Register dest) {
-+  Condition c = ma_cmp(lhs, rhs, cond, true);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmp32Move32(Condition cond, Register lhs,
-+                                 const Address& rhs, Register src,
-+                                 Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(rhs, scratch);
-+  Condition c = ma_cmp(lhs, scratch, cond, true);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmp32MovePtr(Condition cond, Register lhs, Imm32 rhs,
-+                                  Register src, Register dest) {
-+  Condition c = ma_cmp(lhs, rhs, cond, true);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs, Imm32 rhs,
-+                                   Register src, Register dest) {
-+  Condition c = ma_cmp(lhs, ImmWord(int64_t(rhs.value)), cond);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs, Register rhs,
-+                                   Register src, Register dest) {
-+  Condition c = ma_cmp(lhs, rhs, cond);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmpPtrMovePtr(Condition cond, Register lhs,
-+                                   const Address& rhs, Register src,
-+                                   Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  loadPtr(rhs, scratch);
-+  Condition c = ma_cmp(lhs, scratch, cond);
-+  ma_cmp_move(dest, src, c);
-+}
-+
-+void MacroAssembler::cmp32Load32(Condition cond, Register lhs,
-+                                 const Address& rhs, const Address& src,
-+                                 Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(rhs, scratch);
-+  Condition c = ma_cmp(lhs, scratch, cond, true);
-+  // Conditional load: load into scratch, then isel.
-+  load32(src, scratch);
-+  ma_cmp_move(dest, scratch, c);
-+}
-+
-+void MacroAssembler::cmp32Load32(Condition cond, Register lhs, Register rhs,
-+                                 const Address& src, Register dest) {
-+  Condition c = ma_cmp(lhs, rhs, cond, true);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(src, scratch);
-+  ma_cmp_move(dest, scratch, c);
-+}
-+
-+void MacroAssembler::cmp32Load32(Condition cond, Register lhs, Imm32 rhs,
-+                                 const Address& src, Register dest) {
-+  Condition c = ma_cmp(lhs, rhs, cond, true);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(src, scratch);
-+  ma_cmp_move(dest, scratch, c);
-+}
-+
-+void MacroAssembler::cmp32LoadPtr(Condition cond, const Address& lhs, Imm32 rhs,
-+                                  const Address& src, Register dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(lhs, scratch);
-+  Condition c = ma_cmp(scratch, rhs, cond, true);
-+  loadPtr(src, scratch);
-+  ma_cmp_move(dest, scratch, c);
-+}
-+
-+void MacroAssembler::test32LoadPtr(Condition cond, const Address& addr,
-+                                   Imm32 mask, const Address& src,
-+                                   Register dest) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(addr, scratch);
-+  if (is_uintN(mask.value, 16)) {
-+    as_andi_rc(scratch, scratch, mask.value);
-+  } else {
-+    // Use a nested scope so scratch2 is released before loadPtr below.
-+    UseScratchRegisterScope temps2(asMasm());
-+    Register scratch2 = temps2.Acquire();
-+    move32(mask, scratch2);
-+    as_and__rc(scratch, scratch, scratch2);  // record form folds the cmpdi
-+  }
-+  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  loadPtr(src, scratch);
-+  ma_cmp_move(dest, scratch, base);
-+}
-+
-+void MacroAssembler::test32MovePtr(Condition cond, Register operand, Imm32 mask,
-+                                   Register src, Register dest) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  if (is_uintN(mask.value, 16)) {
-+    as_andi_rc(scratch, operand, mask.value);
-+  } else {
-+    move32(mask, scratch);
-+    as_and__rc(scratch, operand, scratch);  // record form folds the cmpdi
-+  }
-+  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_cmp_move(dest, src, base);
-+}
-+
-+void MacroAssembler::test32MovePtr(Condition cond, const Address& addr,
-+                                   Imm32 mask, Register src, Register dest) {
-+  MOZ_ASSERT(cond == Zero || cond == NonZero);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(addr, scratch);
-+  and32(mask, scratch);
-+  as_cmpdi(scratch, 0);
-+  Condition base = static_cast<Condition>(cond & ~Assembler::ConditionZero);
-+  ma_cmp_move(dest, src, base);
-+}
-+
-+// ===============================================================
-+// Spectre mitigations
-+
-+void MacroAssembler::spectreMovePtr(Condition cond, Register src,
-+                                    Register dest) {
-+  // Assumes compare already issued.
-+  Condition base = static_cast<Condition>(
-+      cond & ~(Assembler::ConditionUnsigned | Assembler::ConditionZero));
-+  ma_cmp_move(dest, src, base);
-+}
-+
-+void MacroAssembler::spectreZeroRegister(Condition cond, Register scratch,
-+                                         Register dest) {
-+  // Assumes compare already issued. Zero dest if condition is true.
-+  Condition origBase = static_cast<Condition>(
-+      cond & ~(Assembler::ConditionUnsigned | Assembler::ConditionZero));
-+  // If original condition is true, we want dest=0.
-+  // isel: if condition true, select zero; else keep dest.
-+  xs_li(scratch, 0);
-+  ma_cmp_move(dest, scratch, origBase);
-+}
-+
-+void MacroAssembler::spectreBoundsCheck32(Register index, Register length,
-+                                          Register maybeScratch,
-+                                          Label* failure) {
-+  Condition c = ma_cmp(index, length, Below, true);
-+  if (failure) {
-+    ma_b(InvertCondition(c), failure);
-+  }
-+  if (maybeScratch != InvalidReg) {
-+    xs_li(maybeScratch, 0);
-+    ma_cmp_move(index, maybeScratch, InvertCondition(c));
-+  }
-+}
-+
-+void MacroAssembler::spectreBoundsCheck32(Register index, const Address& length,
-+                                          Register maybeScratch,
-+                                          Label* failure) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  load32(length, scratch);
-+  spectreBoundsCheck32(index, scratch, maybeScratch, failure);
-+}
-+
-+void MacroAssembler::spectreBoundsCheckPtr(Register index, Register length,
-+                                           Register maybeScratch,
-+                                           Label* failure) {
-+  Condition c = ma_cmp(index, length, Below);
-+  if (failure) {
-+    ma_b(InvertCondition(c), failure);
-+  }
-+  if (maybeScratch != InvalidReg) {
-+    xs_li(maybeScratch, 0);
-+    ma_cmp_move(index, maybeScratch, InvertCondition(c));
-+  }
-+}
-+
-+void MacroAssembler::spectreBoundsCheckPtr(Register index,
-+                                           const Address& length,
-+                                           Register maybeScratch,
-+                                           Label* failure) {
-+  // Fetch the pointer-sized length into a scratch GPR, then defer to the
-+  // register/register overload.
-+  UseScratchRegisterScope temps(asMasm());
-+  const Register lengthReg = temps.Acquire();
-+  loadPtr(length, lengthReg);
-+  spectreBoundsCheckPtr(index, lengthReg, maybeScratch, failure);
-+}
-+
-+// ===============================================================
-+// Memory access primitives
-+
-+FaultingCodeOffset MacroAssembler::storeFloat32(FloatRegister src,
-+                                                const Address& addr) {
-+  MOZ_ASSERT(addr.base != r0);
-+  // Choose the store form by displacement size: 16-bit D-form, POWER10
-+  // prefixed form (34-bit), otherwise an indexed store via a scratch GPR.
-+  FaultingCodeOffset fco;
-+  if (is_intN(addr.offset, 16)) {
-+    fco = FaultingCodeOffset(as_stfs(src, addr.base, addr.offset).getOffset());
-+  } else if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
-+    fco = FaultingCodeOffset(
-+        as_pstfs(src, addr.base, (int64_t)addr.offset, /*R=*/false)
-+            .getOffset());
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register offsetReg = temps.Acquire();
-+    movePtr(ImmWord(addr.offset), offsetReg);
-+    fco = FaultingCodeOffset(as_stfsx(src, addr.base, offsetReg).getOffset());
-+  }
-+  return fco;
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeFloat32(FloatRegister src,
-+                                                const BaseIndex& addr) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register effAddr = temps.Acquire();
-+  computeEffectiveAddress(addr, effAddr);
-+  // The whole address is now in effAddr; store with a zero displacement.
-+  auto storeInsn = as_stfs(src, effAddr, 0);
-+  return FaultingCodeOffset(storeInsn.getOffset());
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeDouble(FloatRegister src,
-+                                               const Address& addr) {
-+  MOZ_ASSERT(addr.base != r0);
-+  // Choose the store form by displacement size: 16-bit D-form, POWER10
-+  // prefixed form (34-bit), otherwise an indexed store via a scratch GPR.
-+  FaultingCodeOffset fco;
-+  if (is_intN(addr.offset, 16)) {
-+    fco = FaultingCodeOffset(as_stfd(src, addr.base, addr.offset).getOffset());
-+  } else if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
-+    fco = FaultingCodeOffset(
-+        as_pstfd(src, addr.base, (int64_t)addr.offset, /*R=*/false)
-+            .getOffset());
-+  } else {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register offsetReg = temps.Acquire();
-+    movePtr(ImmWord(addr.offset), offsetReg);
-+    fco = FaultingCodeOffset(as_stfdx(src, addr.base, offsetReg).getOffset());
-+  }
-+  return fco;
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeDouble(FloatRegister src,
-+                                               const BaseIndex& addr) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register effAddr = temps.Acquire();
-+  computeEffectiveAddress(addr, effAddr);
-+  // The whole address is now in effAddr; store with a zero displacement.
-+  auto storeInsn = as_stfd(src, effAddr, 0);
-+  return FaultingCodeOffset(storeInsn.getOffset());
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeFloat16(FloatRegister src,
-+                                                const Address& dest,
-+                                                Register temp) {
-+  // Stores through the indexed stxsihx form only, so any non-zero
-+  // displacement is first folded into (or loaded into) `temp`.
-+  MOZ_ASSERT(HasPOWER9());
-+  if (dest.offset == 0) {
-+    // Base sits in the RB slot here, so r0 as a base is read as a register.
-+    return FaultingCodeOffset(as_stxsihx(src, r0, dest.base).getOffset());
-+  }
-+  // From here on dest.base occupies an RA slot, where r0 encodes the
-+  // literal 0 rather than the register contents (same guard as
-+  // storeFloat32/storeDouble above).
-+  MOZ_ASSERT(dest.base != r0);
-+  if (is_intN(dest.offset, 16)) {
-+    as_addi(temp, dest.base, dest.offset);
-+    return FaultingCodeOffset(as_stxsihx(src, r0, temp).getOffset());
-+  }
-+  // movePtr overwrites temp before the store reads dest.base, so the two
-+  // must be distinct registers on this path.
-+  MOZ_ASSERT(temp != dest.base);
-+  movePtr(ImmWord(dest.offset), temp);
-+  return FaultingCodeOffset(as_stxsihx(src, dest.base, temp).getOffset());
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeFloat16(FloatRegister src,
-+                                                const BaseIndex& dest,
-+                                                Register temp) {
-+  MOZ_ASSERT(HasPOWER9());
-+  // Resolve base + index + offset into temp, then store through it alone.
-+  computeEffectiveAddress(dest, temp);
-+  auto storeInsn = as_stxsihx(src, r0, temp);
-+  return FaultingCodeOffset(storeInsn.getOffset());
-+}
-+
-+void MacroAssembler::memoryBarrier(MemoryBarrier barrier) {
-+  if (barrier.isNone()) {
-+    return;  // nothing to emit
-+  }
-+  // Store-load ordering or an explicit sync request gets the full `sync`;
-+  // every other non-empty barrier is emitted as `lwsync`.
-+  const bool needsFullSync = barrier.hasStoreLoad() || barrier.hasSync();
-+  if (needsFullSync) {
-+    as_sync();
-+  } else {
-+    as_lwsync();
-+  }
-+}
-+
-+// ===============================================================
-+// Clamping functions
-+
-+void MacroAssembler::clampIntToUint8(Register reg) {
-+  // Saturate reg to [0, 255] with two compare-and-skip sequences.
-+  {
-+    // Upper bound: values above 255 become 255.
-+    Label withinUpper;
-+    as_cmpwi(reg, 255);
-+    ma_b(LessThanOrEqual, &withinUpper);
-+    move32(Imm32(255), reg);
-+    bind(&withinUpper);
-+  }
-+  {
-+    // Lower bound: negative values become 0.
-+    Label nonNegative;
-+    as_cmpwi(reg, 0);
-+    ma_b(GreaterThanOrEqual, &nonNegative);
-+    move32(Imm32(0), reg);
-+    bind(&nonNegative);
-+  }
-+}
-+
-+// ===============================================================
-+// Unboxing
-+
-+void MacroAssembler::fallibleUnboxPtr(const ValueOperand& src, Register dest,
-+                                      JSValueType type, Label* fail) {
-+  // Restricted to the four value types listed in the assert.
-+  MOZ_ASSERT(type == JSVAL_TYPE_OBJECT || type == JSVAL_TYPE_STRING ||
-+             type == JSVAL_TYPE_SYMBOL || type == JSVAL_TYPE_BIGINT);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register tagReg = temps.Acquire();
-+  splitTag(src, tagReg);
-+  // Jump to `fail` when the tag differs from the one expected for `type`.
-+  const ImmTag expectedTag(JSVAL_TYPE_TO_TAG(type));
-+  const Condition mismatch = ma_cmp(tagReg, expectedTag, NotEqual);
-+  ma_b(mismatch, fail);
-+  unboxNonDouble(src, dest, type);
-+}
-+
-+void MacroAssembler::fallibleUnboxPtr(const Address& src, Register dest,
-+                                      JSValueType type, Label* fail) {
-+  // Load the boxed value into dest, then unbox it in place.
-+  const ValueOperand boxed(dest);
-+  loadValue(src, boxed);
-+  fallibleUnboxPtr(boxed, dest, type, fail);
-+}
-+
-+void MacroAssembler::fallibleUnboxPtr(const BaseIndex& src, Register dest,
-+                                      JSValueType type, Label* fail) {
-+  // Load the boxed value into dest, then unbox it in place.
-+  const ValueOperand boxed(dest);
-+  loadValue(src, boxed);
-+  fallibleUnboxPtr(boxed, dest, type, fail);
-+}
-+
-+void MacroAssembler::wasmAddSubI128HI64(Register lhsLo, Register lhsHi,
-+                                        Register rhsLo, Register rhsHi,
-+                                        Register output, bool isAdd) {
-+  // output is written by the low-half step and then overwritten by the
-+  // high-half step, so it must not alias any input.
-+  MOZ_RELEASE_ASSERT(output != lhsLo && output != lhsHi && output != rhsLo &&
-+                     output != rhsHi);
-+  if (!isAdd) {
-+    // subfc/subfe compute RB - RA, hence the reversed operand order. The
-+    // low-half result is discarded; only its CA (= no borrow) is consumed.
-+    as_subfc(output, rhsLo, lhsLo);
-+    as_subfe(output, rhsHi, lhsHi);  // lhsHi + ~rhsHi + CA
-+    return;
-+  }
-+  // The low-half add is issued only for its carry-out; adde folds CA into
-+  // the high-half sum, which is the value left in output.
-+  as_addc(output, lhsLo, rhsLo);
-+  as_adde(output, lhsHi, rhsHi);
-+}
-+
-+void MacroAssembler::wasmMulI64WideHI64(Register lhs, Register rhs,
-+                                        Register output, bool isSigned) {
-+  // Emit the multiply-high-doubleword variant matching the signedness.
-+  if (!isSigned) {
-+    as_mulhdu(output, lhs, rhs);
-+    return;
-+  }
-+  as_mulhd(output, lhs, rhs);
-+}
-+
-+//}}} check_macroassembler_style
-+
-+void MacroAssemblerPPC64Compat::incrementInt32Value(const Address& addr) {
-+  // 32-bit add of the constant 1 to the word at addr.
-+  const Imm32 one(1);
-+  asMasm().add32(one, addr);
-+}
-+
-+void MacroAssemblerPPC64Compat::retn(Imm32 n) {
-+  // The return address is at [SP, 0] right now, so it has to be fetched
-+  // before SP moves; afterwards SP is advanced by n and control returns
-+  // through LR.
-+  UseScratchRegisterScope temps(*this);
-+  Register returnAddr = temps.Acquire();
-+  as_ld(returnAddr, StackPointer, 0);
-+  const bool hasBytesToPop = n.value != 0;
-+  if (hasBytesToPop) {
-+    asMasm().addPtr(Imm32(n.value), StackPointer);
-+  }
-+  xs_mtlr(returnAddr);
-+  as_blr();
-+}
-+
-+// ===============================================================
-+// Template specializations (outside check_macroassembler_style)
-+
-+template <>
-+inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Address lhs,
-+                                      ImmPtr rhs, Register dest) {
-+  // Memory operand goes through a scratch GPR; the condition returned by
-+  // ma_cmp is then handed to ma_cmp_set to produce dest.
-+  UseScratchRegisterScope temps(*this);
-+  Register lhsValue = temps.Acquire();
-+  loadPtr(lhs, lhsValue);
-+  const Assembler::Condition resolved = ma_cmp(lhsValue, rhs, cond);
-+  ma_cmp_set(dest, resolved);
-+}
-+
-+template <>
-+inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Register lhs,
-+                                      Address rhs, Register dest) {
-+  // Memory operand goes through a scratch GPR; the condition returned by
-+  // ma_cmp is then handed to ma_cmp_set to produce dest.
-+  UseScratchRegisterScope temps(*this);
-+  Register rhsValue = temps.Acquire();
-+  loadPtr(rhs, rhsValue);
-+  const Assembler::Condition resolved = ma_cmp(lhs, rhsValue, cond);
-+  ma_cmp_set(dest, resolved);
-+}
-+
-+template <>
-+inline void MacroAssembler::cmpPtrSet(Assembler::Condition cond, Address lhs,
-+                                      Register rhs, Register dest) {
-+  // Memory operand goes through a scratch GPR; the condition returned by
-+  // ma_cmp is then handed to ma_cmp_set to produce dest.
-+  UseScratchRegisterScope temps(*this);
-+  Register lhsValue = temps.Acquire();
-+  loadPtr(lhs, lhsValue);
-+  const Assembler::Condition resolved = ma_cmp(lhsValue, rhs, cond);
-+  ma_cmp_set(dest, resolved);
-+}
-+
-+template <>
-+inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Register lhs,
-+                                     Address rhs, Register dest) {
-+  // 32-bit load of the memory operand into a scratch GPR, compare (with
-+  // the trailing `true` flag), then ma_cmp_set produces dest.
-+  UseScratchRegisterScope temps(*this);
-+  Register rhsValue = temps.Acquire();
-+  load32(rhs, rhsValue);
-+  const Assembler::Condition resolved = ma_cmp(lhs, rhsValue, cond, true);
-+  ma_cmp_set(dest, resolved);
-+}
-+
-+template <>
-+inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Address lhs,
-+                                     Register rhs, Register dest) {
-+  // 32-bit load of the memory operand into a scratch GPR, compare (with
-+  // the trailing `true` flag), then ma_cmp_set produces dest.
-+  UseScratchRegisterScope temps(*this);
-+  Register lhsValue = temps.Acquire();
-+  load32(lhs, lhsValue);
-+  const Assembler::Condition resolved = ma_cmp(lhsValue, rhs, cond, true);
-+  ma_cmp_set(dest, resolved);
-+}
-+
-+template <>
-+inline void MacroAssembler::cmp32Set(Assembler::Condition cond, Address lhs,
-+                                     Imm32 rhs, Register dest) {
-+  // 32-bit load of the memory operand into a scratch GPR, compare against
-+  // the immediate (with the trailing `true` flag), then set dest.
-+  UseScratchRegisterScope temps(*this);
-+  Register lhsValue = temps.Acquire();
-+  load32(lhs, lhsValue);
-+  const Assembler::Condition resolved = ma_cmp(lhsValue, rhs, cond, true);
-+  ma_cmp_set(dest, resolved);
-+}
-+
-+//{{{ check_macroassembler_style
-+// ===============================================================
-+// SIMD load/store (128-bit)
-+
-+FaultingCodeOffset MacroAssembler::loadUnalignedSimd128(const Address& src,
-+                                                        FloatRegister dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  const bool zeroOffset = src.offset == 0;
-+  if (HasPOWER10() && is_intN((intptr_t)src.offset, 34)) {
-+    // Prefixed plxv carries the displacement itself; no GPR scratch.
-+    auto insn =
-+        as_plxv(dest.encoding(), src.base, (int64_t)src.offset, /*R=*/false);
-+    return FaultingCodeOffset(insn.getOffset());
-+  }
-+  // Both remaining paths are indexed (X-form) and take the scratch GPR.
-+  Register offsetReg = temps.Acquire();
-+  if (HasPOWER9()) {
-+    // lxvx yields the 128 bits in the right LE order directly.
-+    if (zeroOffset) {
-+      // With RA encoded as r0 the base operand is 0, so the real base
-+      // register is passed in the index slot.
-+      return FaultingCodeOffset(as_lxvx(dest, r0, src.base).getOffset());
-+    }
-+    movePtr(ImmWord(src.offset), offsetReg);
-+    return FaultingCodeOffset(as_lxvx(dest, src.base, offsetReg).getOffset());
-+  }
-+  // POWER8: lxvd2x delivers the doublewords swapped on LE; the faulting
-+  // offset is that of the load, and xxpermdi afterwards undoes the swap.
-+  FaultingCodeOffset fco;
-+  if (zeroOffset) {
-+    fco = FaultingCodeOffset(as_lxvd2x(dest, r0, src.base).getOffset());
-+  } else {
-+    movePtr(ImmWord(src.offset), offsetReg);
-+    fco = FaultingCodeOffset(as_lxvd2x(dest, src.base, offsetReg).getOffset());
-+  }
-+  as_xxpermdi(dest, dest, dest, 2);
-+  return fco;
-+}
-+
-+FaultingCodeOffset MacroAssembler::loadUnalignedSimd128(const BaseIndex& src,
-+                                                        FloatRegister dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register effAddr = temps.Acquire();
-+  computeScaledAddress(src, effAddr);
-+  if (src.offset != 0) {
-+    // Fold the displacement into the scaled address.
-+    addPtr(ImmWord(src.offset), effAddr);
-+  }
-+  if (!HasPOWER9()) {
-+    // POWER8: lxvd2x delivers the doublewords swapped on LE; swap back.
-+    FaultingCodeOffset fco(as_lxvd2x(dest, r0, effAddr).getOffset());
-+    as_xxpermdi(dest, dest, dest, 2);
-+    return fco;
-+  }
-+  return FaultingCodeOffset(as_lxvx(dest, r0, effAddr).getOffset());
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeUnalignedSimd128(FloatRegister src,
-+                                                         const Address& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  const bool zeroOffset = dest.offset == 0;
-+  if (HasPOWER10() && is_intN((intptr_t)dest.offset, 34)) {
-+    // Prefixed pstxv carries the displacement itself; no GPR scratch.
-+    auto insn =
-+        as_pstxv(src.encoding(), dest.base, (int64_t)dest.offset, /*R=*/false);
-+    return FaultingCodeOffset(insn.getOffset());
-+  }
-+  if (HasPOWER9()) {
-+    Register offsetReg = temps.Acquire();
-+    if (zeroOffset) {
-+      return FaultingCodeOffset(as_stxvx(src, r0, dest.base).getOffset());
-+    }
-+    movePtr(ImmWord(dest.offset), offsetReg);
-+    return FaultingCodeOffset(as_stxvx(src, dest.base, offsetReg).getOffset());
-+  }
-+  // POWER8: stxvd2x stores the doublewords swapped on LE, so store a
-+  // pre-swapped copy held in the SIMD scratch. src itself is never
-+  // written, so nothing needs to be swapped back afterwards.
-+  ScratchSimd128Scope swapped(*this);
-+  as_xxpermdi(swapped, src, src, 2);
-+  Register offsetReg = temps.Acquire();
-+  if (zeroOffset) {
-+    return FaultingCodeOffset(as_stxvd2x(swapped, r0, dest.base).getOffset());
-+  }
-+  movePtr(ImmWord(dest.offset), offsetReg);
-+  return FaultingCodeOffset(
-+      as_stxvd2x(swapped, dest.base, offsetReg).getOffset());
-+}
-+
-+FaultingCodeOffset MacroAssembler::storeUnalignedSimd128(
-+    FloatRegister src, const BaseIndex& dest) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register effAddr = temps.Acquire();
-+  computeScaledAddress(dest, effAddr);
-+  if (dest.offset != 0) {
-+    // Fold the displacement into the scaled address.
-+    addPtr(ImmWord(dest.offset), effAddr);
-+  }
-+  if (!HasPOWER9()) {
-+    // POWER8: store a doubleword-swapped copy so stxvd2x lands it in the
-+    // right order; src is left untouched.
-+    ScratchSimd128Scope swapped(*this);
-+    as_xxpermdi(swapped, src, src, 2);
-+    return FaultingCodeOffset(as_stxvd2x(swapped, r0, effAddr).getOffset());
-+  }
-+  return FaultingCodeOffset(as_stxvx(src, r0, effAddr).getOffset());
-+}
-+
-+// ===============================================================
-+// SIMD operations
-+//
-+// Scratch register conventions for SIMD helpers (read this before writing
-+// a new one):
-+//
-+// 1. `ScratchSimd128Scope scratch(*this)` — acquires v0 (= VR0 = VSR32,
-+//    non-allocatable). Constructed as {FloatRegisters::f0, Simd128} so
-+//    encoding() = 0 + 32 = 32 (per Architecture-ppc64.h). Default temp.
-+//    One scope at a time per helper. Safe to pass to any VMX/VSX
-+//    instruction; the allocator never places a live v128 in v0.
-+//
-+// 2. **Do NOT** write to VR1..VR31 (= VSR33..VSR63) without a Lowering
-+//    temp. VR1..VR31 are allocatable; a live wasm v128 may be sitting in
-+//    any of them. Use `ScratchSimd128Scope` (rule 1) or a Lowering temp.
-+//
-+// 3. **Red-zone stash** — use `RedZoneStashSimd128` / `RedZoneRestoreSimd128`
-+//    (declared just below) when a helper genuinely needs >1 SIMD scratch
-+//    AND adding a Lowering temp would require LIR + MIR + CodeGen changes.
-+//    ELFv2 reserves 288 bytes below SP; we use at most 32 (two 16-byte
-+//    slots). Live users: `extAddPairwiseInt*` (2 slots), `swizzleInt8x16`
-+//    (1 slot), `dotInt8x16Int7x16ThenAdd` 4-arg (1 slot). If you find
-+//    yourself wanting a 3rd slot or nested save/restore, prefer a Lowering
-+//    temp instead — the red-zone approach is tolerable because it's
-+//    self-contained to a single helper. The slot-range `MOZ_ASSERT` inside
-+//    the helpers (slot < kRedZoneSimd128MaxSlots) enforces this at test time.
-+//
-+// Simd128 lives in VR-namespace (VSR32-63), so VMX ops address Simd128
-+// FloatRegisters directly with no staging. Encoding is 32-63; the VMX
-+// VR field is 5-bit (0-31), so we mask with `& 31`.
-+
-+// Number of 16-byte Simd128 spill slots the red-zone helpers below accept
-+// (ELFv2 red zone; see point 3 of the SIMD conventions preamble above).
-+static constexpr int kRedZoneSimd128MaxSlots = 2;
-+
-+static inline void RedZoneStashSimd128(MacroAssembler& masm, FloatRegister src,
-+                                       int slot) {
-+  MOZ_ASSERT(slot >= 0 && slot < kRedZoneSimd128MaxSlots);
-+  // Slot k is the 16 bytes starting at SP - 16 * (k + 1).
-+  const int spOffset = -16 * (slot + 1);
-+  masm.storeUnalignedSimd128(src, Address(StackPointer, spOffset));
-+}
-+
-+static inline void RedZoneRestoreSimd128(MacroAssembler& masm, int slot,
-+                                         FloatRegister dest) {
-+  MOZ_ASSERT(slot >= 0 && slot < kRedZoneSimd128MaxSlots);
-+  // Slot k is the 16 bytes starting at SP - 16 * (k + 1).
-+  const int spOffset = -16 * (slot + 1);
-+  masm.loadUnalignedSimd128(Address(StackPointer, spOffset), dest);
-+}
-+
-+// Emitter callback: (assembler, VRT, VRA, VRB) as 5-bit VR numbers.
-+using VmxBinaryFn = void (*)(Assembler&, uint8_t, uint8_t, uint8_t);
-+
-+static void EmitVmxBinary(MacroAssembler& masm, VmxBinaryFn vmxOp,
-+                          FloatRegister lhs, FloatRegister rhs,
-+                          FloatRegister dest) {
-+  // Reduce each register encoding to its low five bits before emitting.
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  vmxOp(static_cast<Assembler&>(masm), vrt, vra, vrb);
-+}
-+
-+// Expands to a captureless lambda that forwards three VR numbers to the
-+// assembler's as_<vmxInst> emitter.
-+#define VMX_BINARY_WRAPPER(vmxInst)                                  \
-+  [](Assembler& assembler, uint8_t vrt, uint8_t vra, uint8_t vrb) {  \
-+    assembler.as_##vmxInst(vrt, vra, vrb);                           \
-+  }
-+
-+// Emits the binary op into dest, then complements dest in place with
-+// xxlnor(dest, dest, dest).
-+template <typename VmxBinaryFnT>
-+static void EmitVmxBinaryNot(MacroAssembler& masm, VmxBinaryFnT vmxOp,
-+                             FloatRegister lhs, FloatRegister rhs,
-+                             FloatRegister dest) {
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  vmxOp(static_cast<Assembler&>(masm), vrt, vra, vrb);
-+  masm.as_xxlnor(dest, dest, dest);
-+}
-+
-+// Integer SIMD compare helper. VMX compare instructions produce all-ones
-+// for true, all-zeros for false per element.
-+// Available VMX compares: vcmpequ* (eq), vcmpgts* (signed gt), vcmpgtu*
-+// (unsigned gt). Other conditions derived by swapping operands or
-+// complementing.
-+template <typename EqFn, typename GtsFn, typename GtuFn>
-+static void EmitVmxCompare(MacroAssembler& masm, Assembler::Condition cond,
-+                           FloatRegister lhs, FloatRegister rhs,
-+                           FloatRegister dest, EqFn eqFn, GtsFn gtsFn,
-+                           GtuFn gtuFn) {
-+  // Every condition reduces to one of the three supplied compares plus two
-+  // optional adjustments: swapping the operands, complementing the result.
-+  VmxBinaryFn op = nullptr;
-+  bool swapOperands = false;
-+  bool complement = false;
-+  switch (cond) {
-+    case Assembler::Equal:
-+      op = eqFn;
-+      break;
-+    case Assembler::NotEqual:
-+      op = eqFn;
-+      complement = true;
-+      break;
-+    case Assembler::GreaterThan:
-+      op = gtsFn;
-+      break;
-+    case Assembler::GreaterThanOrEqual:  // !(rhs > lhs)
-+      op = gtsFn;
-+      swapOperands = true;
-+      complement = true;
-+      break;
-+    case Assembler::LessThan:  // rhs > lhs
-+      op = gtsFn;
-+      swapOperands = true;
-+      break;
-+    case Assembler::LessThanOrEqual:  // !(lhs > rhs)
-+      op = gtsFn;
-+      complement = true;
-+      break;
-+    case Assembler::Above:
-+      op = gtuFn;
-+      break;
-+    case Assembler::AboveOrEqual:  // !(rhs >u lhs)
-+      op = gtuFn;
-+      swapOperands = true;
-+      complement = true;
-+      break;
-+    case Assembler::Below:  // rhs >u lhs
-+      op = gtuFn;
-+      swapOperands = true;
-+      break;
-+    case Assembler::BelowOrEqual:  // !(lhs >u rhs)
-+      op = gtuFn;
-+      complement = true;
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected SIMD integer condition");
-+  }
-+  const FloatRegister first = swapOperands ? rhs : lhs;
-+  const FloatRegister second = swapOperands ? lhs : rhs;
-+  if (complement) {
-+    EmitVmxBinaryNot(masm, op, first, second, dest);
-+  } else {
-+    EmitVmxBinary(masm, op, first, second, dest);
-+  }
-+}
-+
-+// Emitter callback for four-register VMX ops; the four uint8_t arguments
-+// receive the low five bits of dest, a, b and c, in that order.
-+using VmxTernaryFn = void (*)(Assembler&, uint8_t, uint8_t, uint8_t, uint8_t);
-+
-+static void EmitVmxTernary(MacroAssembler& masm, VmxTernaryFn vmxOp,
-+                           FloatRegister a, FloatRegister b, FloatRegister c,
-+                           FloatRegister dest) {
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = a.encoding() & 31;
-+  const uint8_t vrb = b.encoding() & 31;
-+  const uint8_t vrc = c.encoding() & 31;
-+  vmxOp(static_cast<Assembler&>(masm), vrt, vra, vrb, vrc);
-+}
-+
-+// Emitter callback for two-register VMX ops; the two uint8_t arguments
-+// receive the low five bits of dest and src, in that order.
-+using VmxUnaryFn = void (*)(Assembler&, uint8_t, uint8_t);
-+
-+static void EmitVmxUnary(MacroAssembler& masm, VmxUnaryFn vmxOp,
-+                         FloatRegister src, FloatRegister dest) {
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vrb = src.encoding() & 31;
-+  vmxOp(static_cast<Assembler&>(masm), vrt, vrb);
-+}
-+
-+// Helper: zero all 128 bits of `dest` by XOR-ing it with itself (xxlxor).
-+static void ZeroSimd128(MacroAssembler& masm, FloatRegister dest) {
-+  masm.as_xxlxor(dest, dest, dest);  // dest ^ dest == 0; no other register used
-+}
-+
-+void MacroAssembler::moveSimd128(FloatRegister src, FloatRegister dest) {
-+  if (src != dest) {  // a self-move emits nothing
-+    as_xxlor(dest, src, src);  // src | src == src, i.e. a register copy
-+  }
-+}
-+
-+void MacroAssembler::loadConstantSimd128(const SimdConstant& v,
-+                                         FloatRegister dest) {
-+  // Materialize the 128-bit constant `v` in dest via the inline constant pool.
-+  // NOTE: the pool load clobbers SecondScratchReg (r12).
-+  loadFromPoolSimd128(dest, v);
-+}
-+
-+// PPC64 LE lane mapping:
-+// Wasm lane K = memory byte K = register byte (15-K).
-+// mfvsrd extracts register bits[0:63] = BE dword 0 = Wasm lanes 8-15 (bytes).
-+// For VMX byte ops, BE byte index = 15 - wasm_lane.
-+// For VMX halfword ops, BE halfword index = 7 - wasm_halfword.
-+// For VSX word ops (xxspltw), BE word index = 3 - wasm_word.
-+// For doubleword ops, BE dword index = 1 - wasm_dword.
-+
-+void MacroAssembler::splatX16(Register src, FloatRegister dest) {
-+  // Stage src in BE bits 0..63 of dest; its low byte ends up at BE byte 7.
-+  as_mtvsrd(dest, src);
-+  // Broadcast that byte to all 16 lanes in place. dest doubles as the
-+  // staging register, so no ScratchSimd128Scope is taken here and callers
-+  // that already hold one (extAddPairwise*, var-shift narrow forms) are
-+  // unaffected.
-+  constexpr int kLowByteBE = 7;
-+  as_vspltb(dest, dest, kLowByteBE);
-+}
-+
-+void MacroAssembler::splatX8(Register src, FloatRegister dest) {
-+  // Halfword counterpart of splatX16: mtvsrd leaves the low 16 bits of src
-+  // at BE halfword 3 (BE bytes 6..7) of dest.
-+  as_mtvsrd(dest, src);
-+  // vsplth reads only the selected halfword, so the upper bits of src
-+  // (e.g. the sign extension of a negative i32) need no pre-masking.
-+  constexpr int kLowHalfBE = 3;
-+  as_vsplth(dest, dest, kLowHalfBE);
-+}
-+
-+void MacroAssembler::splatX4(Register src, FloatRegister dest) {
-+  if (!HasPOWER9()) {
-+    // Pre-POWER9: move the GPR into the vector register, then splat BE
-+    // word 1 (the low 32 bits of the moved doubleword) to all four lanes.
-+    as_mtvsrd(dest, src);
-+    as_xxspltw(dest, dest, 1);
-+    return;
-+  }
-+  // POWER9: mtvsrws moves and splats the word in a single instruction.
-+  as_mtvsrws(dest, src);
-+}
-+
-+void MacroAssembler::splatX4(FloatRegister src, FloatRegister dest) {
-+  // The float arrives in double-precision register form. xscvdpspn
-+  // converts it to single precision and leaves the result in BE word 0
-+  // (bits[0:31]) of dest.
-+  as_xscvdpspn(dest, src);
-+  // Replicate BE word 0 across all four lanes.
-+  constexpr int kConvertedWordBE = 0;
-+  as_xxspltw(dest, dest, kConvertedWordBE);
-+}
-+
-+void MacroAssembler::splatX2(FloatRegister src, FloatRegister dest) {
-+  // The scalar double sits in BE dword 0 (bits[0:63]) of src. With DM=0,
-+  // xxpermdi takes dword 0 from both sources: dest = [src.dw0, src.dw0].
-+  constexpr int kSelectDw0Twice = 0;
-+  as_xxpermdi(dest, src, src, kSelectDw0Twice);
-+}
-+
-+// Helpers: splat Imm32 into SIMD register at various element widths.
-+// VMX shift instructions read the shift count from EACH element independently,
-+// so the count must be replicated to every byte/halfword/word as appropriate.
-+//
-+// Fast path for small constants: vspltis{b,h,w} (POWER7+) splats a 5-bit
-+// signed immediate to all lanes in 1 insn with no pool entry. Outside
-+// [-16, 15]: bytes use xxspltib on POWER9, all else the inline-pool path.
-+static void SplatImm8(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
-+  // Only the low 8 bits of the immediate are used.
-+  const int8_t byteVal = static_cast<int8_t>(imm.value);
-+  const bool fitsVspltis = byteVal >= -16 && byteVal <= 15;
-+  if (fitsVspltis) {
-+    masm.as_vspltisb(dest.encoding() & 31, byteVal);
-+    return;
-+  }
-+  if (HasPOWER9()) {
-+    // xxspltib covers the full 8-bit range in a single instruction.
-+    masm.as_xxspltib(dest, static_cast<uint8_t>(byteVal));
-+    return;
-+  }
-+  // Otherwise build the replicated constant and load it from the pool.
-+  int8_t bytes[16];
-+  for (int8_t& b : bytes) {
-+    b = byteVal;
-+  }
-+  masm.loadConstantSimd128(SimdConstant::CreateX16(bytes), dest);
-+}
-+
-+static void SplatImm16(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
-+  // Only the low 16 bits of the immediate are used.
-+  const int16_t halfVal = static_cast<int16_t>(imm.value);
-+  if (halfVal < -16 || halfVal > 15) {
-+    // Too wide for vspltish: replicate by hand and load from the pool.
-+    int16_t halfs[8];
-+    for (int16_t& h : halfs) {
-+      h = halfVal;
-+    }
-+    masm.loadConstantSimd128(SimdConstant::CreateX8(halfs), dest);
-+    return;
-+  }
-+  masm.as_vspltish(dest.encoding() & 31, static_cast<int8_t>(halfVal));
-+}
-+
-+static void SplatImm32(MacroAssembler& masm, Imm32 imm, FloatRegister dest) {
-+  const int32_t wordVal = imm.value;
-+  if (wordVal < -16 || wordVal > 15) {
-+    // Too wide for vspltisw: replicate by hand and load from the pool.
-+    int32_t words[4];
-+    for (int32_t& w : words) {
-+      w = wordVal;
-+    }
-+    masm.loadConstantSimd128(SimdConstant::CreateX4(words), dest);
-+    return;
-+  }
-+  masm.as_vspltisw(dest.encoding() & 31, static_cast<int8_t>(wordVal));
-+}
-+
-+// ===============================================================
-+// Extract lane
-+
-+static void ExtractLaneToGPR(MacroAssembler& masm, uint32_t lane,
-+                             FloatRegister src, Register dest,
-+                             unsigned laneWidthBytes, unsigned laneWidthBits) {
-+  // Moves the 64-bit half of src that contains the requested Wasm lane
-+  // into dest and shifts the lane down to bit 0. Bits above the lane are
-+  // NOT cleared here; callers mask afterwards.
-+  //
-+  // Lanes at index >= 8/laneWidthBytes live in BE dword 0, which mfvsrd
-+  // reads directly. Lower lanes live in BE dword 1, read with mfvsrld on
-+  // POWER9 or by swapping the dwords first on POWER8.
-+  const unsigned lanesPerDword = 8 / laneWidthBytes;
-+  const bool inBeDword0 = lane >= lanesPerDword;
-+  const unsigned laneInDword = inBeDword0 ? lane - lanesPerDword : lane;
-+
-+  if (inBeDword0) {
-+    masm.as_mfvsrd(dest, src);
-+  } else if (HasPOWER9()) {
-+    masm.as_mfvsrld(dest, src);
-+  } else {
-+    // POWER8: bring dword 1 into the scalar position. ScratchSimd128Reg
-+    // (v0/VSR32, non-allocatable) is written directly rather than through
-+    // a ScratchSimd128Scope, because callers may already hold that scope.
-+    masm.as_xxpermdi(ScratchSimd128Reg, src, src, 2);
-+    masm.as_mfvsrd(dest, ScratchSimd128Reg);
-+  }
-+
-+  const unsigned shift = laneWidthBits * laneInDword;
-+  if (shift != 0) {
-+    masm.x_srdi(dest, dest, shift);
-+  }
-+}
-+
-+void MacroAssembler::unsignedExtractLaneInt8x16(uint32_t lane,
-+                                                FloatRegister src,
-+                                                Register dest) {
-+  MOZ_ASSERT(lane < 16);
-+  if (!HasPOWER9()) {
-+    // Generic path: shift the lane down to bit 0, then keep the low 8 bits.
-+    ExtractLaneToGPR(*this, lane, src, dest, 1, 8);
-+    as_rldicl(dest, dest, 0, 56);
-+    return;
-+  }
-+  // POWER9: vextractub places BE byte (15 - lane) of src at BE byte 7 of
-+  // the scratch with the rest zeroed, so mfvsrd yields the zero-extended
-+  // byte with no further masking.
-+  as_vextractub(ScratchSimd128Reg, src, 15 - lane);
-+  as_mfvsrd(dest, ScratchSimd128Reg);
-+}
-+
-+void MacroAssembler::unsignedExtractLaneInt16x8(uint32_t lane,
-+                                                FloatRegister src,
-+                                                Register dest) {
-+  MOZ_ASSERT(lane < 8);
-+  if (!HasPOWER9()) {
-+    // Generic path: shift the lane down to bit 0, then keep the low 16 bits.
-+    ExtractLaneToGPR(*this, lane, src, dest, 2, 16);
-+    as_rldicl(dest, dest, 0, 48);
-+    return;
-+  }
-+  // POWER9: extract the halfword at BE byte offset (14 - 2 * lane) into
-+  // the scratch, then move it to the GPR.
-+  as_vextractuh(ScratchSimd128Reg, src, 14 - 2 * lane);
-+  as_mfvsrd(dest, ScratchSimd128Reg);
-+}
-+
-+void MacroAssembler::extractLaneFloat32x4(uint32_t lane, FloatRegister src,
-+                                          FloatRegister dest) {
-+  MOZ_ASSERT(lane < 4);
-+  // Wasm word lane L lives at BE word (3 - L). Splatting that word with
-+  // xxspltw also places it in BE word 0 (bits[0:31]), which is where
-+  // xscvspdpn reads its single-precision input; the convert then widens
-+  // it to the double-precision scalar form.
-+  const uint32_t beWord = 3 - lane;
-+  as_xxspltw(dest, src, beWord);
-+  as_xscvspdpn(dest, dest);
-+}
-+
-+void MacroAssembler::extractLaneFloat64x2(uint32_t lane, FloatRegister src,
-+                                          FloatRegister dest) {
-+  MOZ_ASSERT(lane < 2);
-+  if (lane == 0) {
-+    // Lane 0 is the LE low dword = BE dword 1; swap the dwords so it lands
-+    // in the scalar position.
-+    as_xxpermdi(dest, src, src, 2);
-+    return;
-+  }
-+  // Lane 1 is the LE high dword = BE dword 0, already in the scalar
-+  // position; a plain copy suffices (and nothing at all if src is dest).
-+  if (src != dest) {
-+    as_xxlor(dest, src, src);
-+  }
-+}
-+
-+// ===============================================================
-+// Replace lane
-+
-+void MacroAssembler::replaceLaneInt8x16(unsigned lane, Register rhs,
-+                                        FloatRegister lhsDest) {
-+  MOZ_ASSERT(lane < 16);
-+  if (HasPOWER10()) {
-+    // vinsbrx takes the right-indexed (LE-natural) byte position in a GPR
-+    // and masks it with 0xF, so the lane number is loaded as-is.
-+    UseScratchRegisterScope temps(asMasm());
-+    Register laneIndex = temps.Acquire();
-+    xs_li(laneIndex, int16_t(lane));
-+    as_vinsbrx(lhsDest, laneIndex, rhs);
-+    return;
-+  }
-+  if (HasPOWER9()) {
-+    // mtvsrd leaves the low byte of rhs at BE byte 7 of the staging
-+    // register; vinsertb copies that byte to BE byte (15 - lane) of
-+    // lhsDest, i.e. the requested wasm lane.
-+    ScratchSimd128Scope staging(*this);
-+    as_mtvsrd(staging, rhs);
-+    as_vinsertb(lhsDest, staging, 15 - lane);
-+    return;
-+  }
-+  // POWER8: pull the affected doubleword into a GPR, splice the byte in
-+  // with rldimi, and merge the doubleword back.
-+  UseScratchRegisterScope temps(asMasm());
-+  ScratchSimd128Scope staging(*this);
-+  Register dwordBits = temps.Acquire();
-+  const bool inBeDword0 = (lane / 8) == 1;  // wasm lanes 8..15
-+  const unsigned bitPos = 8 * (lane % 8);   // offset of the byte from the LSB
-+  if (inBeDword0) {
-+    as_mfvsrd(dwordBits, lhsDest);
-+  } else {
-+    as_xxpermdi(staging, lhsDest, lhsDest, 2);
-+    as_mfvsrd(dwordBits, staging);
-+  }
-+  // rldimi RT,RS,SH,MB inserts the rotated RS under the mask MB..63-SH;
-+  // SH = bitPos with MB = 56 - bitPos selects exactly the target byte.
-+  as_rldimi(dwordBits, rhs, bitPos, 56 - bitPos);
-+  as_mtvsrd(staging, dwordBits);
-+  // INVARIANT: mtvsrd defines only staging.dw0; staging.dw1 is undefined.
-+  // Both xxpermdi forms below read staging.dw0 only:
-+  //   DM=0b01 -> [staging.dw0, lhsDest.dw1]
-+  //   DM=0b00 -> [lhsDest.dw0, staging.dw0]
-+  // Changing either DM literal requires zeroing staging.dw1 first (or a
-+  // different staging scheme), otherwise undefined bits reach the output.
-+  if (inBeDword0) {
-+    as_xxpermdi(lhsDest, staging, lhsDest, 1);
-+  } else {
-+    as_xxpermdi(lhsDest, lhsDest, staging, 0);
-+  }
-+}
-+
-+void MacroAssembler::replaceLaneInt16x8(unsigned lane, Register rhs,
-+                                        FloatRegister lhsDest) {
-+  MOZ_ASSERT(lane < 8);
-+  if (HasPOWER10()) {
-+    // vinshrx takes a byte position in a GPR: lane * 2.
-+    UseScratchRegisterScope temps(asMasm());
-+    Register bytePos = temps.Acquire();
-+    xs_li(bytePos, int16_t(lane * 2));
-+    as_vinshrx(lhsDest, bytePos, rhs);
-+    return;
-+  }
-+  if (HasPOWER9()) {
-+    // mtvsrd leaves the low 16 bits of rhs at BE bytes 6..7 of the staging
-+    // register; vinserth copies them to BE bytes (14 - 2L)..(15 - 2L) of
-+    // lhsDest, i.e. wasm lane L.
-+    ScratchSimd128Scope staging(*this);
-+    as_mtvsrd(staging, rhs);
-+    as_vinserth(lhsDest, staging, 14 - 2 * lane);
-+    return;
-+  }
-+  // POWER8: pull the affected doubleword into a GPR, splice the halfword
-+  // in with rldimi, and merge the doubleword back.
-+  UseScratchRegisterScope temps(asMasm());
-+  ScratchSimd128Scope staging(*this);
-+  Register dwordBits = temps.Acquire();
-+  const bool inBeDword0 = (lane / 4) == 1;  // wasm lanes 4..7
-+  const unsigned bitPos = 16 * (lane % 4);  // offset of the halfword from LSB
-+  if (inBeDword0) {
-+    as_mfvsrd(dwordBits, lhsDest);
-+  } else {
-+    as_xxpermdi(staging, lhsDest, lhsDest, 2);
-+    as_mfvsrd(dwordBits, staging);
-+  }
-+  as_rldimi(dwordBits, rhs, bitPos, 48 - bitPos);
-+  as_mtvsrd(staging, dwordBits);
-+  // As in replaceLaneInt8x16: only staging.dw0 is defined after mtvsrd,
-+  // and both xxpermdi forms below read staging.dw0 only.
-+  if (inBeDword0) {
-+    as_xxpermdi(lhsDest, staging, lhsDest, 1);
-+  } else {
-+    as_xxpermdi(lhsDest, lhsDest, staging, 0);
-+  }
-+}
-+
-+// Replace 32-bit lane `lane` (0..3) of lhsDest with the low word of rhs.
-+// POWER10 uses vinsw, POWER9 uses mtvsrd + xxinsertw, and POWER8 edits the
-+// containing doubleword in a GPR.
-+void MacroAssembler::replaceLaneInt32x4(unsigned lane, Register rhs,
-+                                        FloatRegister lhsDest) {
-+  MOZ_ASSERT(lane < 4);
-+  if (HasPOWER10()) {
-+    // 1 insn, no scratch VSR. UIM is the BE byte offset.
-+    as_vinsw(lhsDest, rhs, (3 - lane) * 4);
-+    return;
-+  }
-+  if (HasPOWER9()) {
-+    // POWER9: xxinsertw inserts word from bits[32:63] of XB at BE byte
-+    // offset UIM in XT. mtvsrd puts GPR into bits[0:63]; low 32 bits
-+    // land at bits[32:63]. BE byte offset of Wasm word lane = (3-lane)*4.
-+    ScratchSimd128Scope scratch(*this);
-+    as_mtvsrd(scratch, rhs);
-+    as_xxinsertw(lhsDest, scratch, (3 - lane) * 4);
-+    return;
-+  }
-+  // POWER8: extract dword, rldimi to insert word, write back.
-+  // Modeled on replaceLaneInt16x8 above.
-+  UseScratchRegisterScope temps(asMasm());
-+  ScratchSimd128Scope scratch128(*this);
-+  Register tmp = temps.Acquire();
-+  unsigned dword = lane / 2;        // 0 = lanes 0,1; 1 = lanes 2,3.
-+  unsigned wordInDword = lane % 2;  // 0 = low LE word; 1 = high LE word.
-+  if (dword == 1) {
-+    as_mfvsrd(tmp, lhsDest);
-+  } else {
-+    // Swap the doublewords into the scratch (xxpermdi DM=2) before the move
-+    // to the GPR.
-+    as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
-+    as_mfvsrd(tmp, scratch128);
-+  }
-+  // SH = 32*wordInDword, MB = 32 - 32*wordInDword: overwrite only the
-+  // selected word of tmp with the low word of rhs.
-+  as_rldimi(tmp, rhs, 32 * wordInDword, 32 - 32 * wordInDword);
-+  as_mtvsrd(scratch128, tmp);
-+  // Merge the edited doubleword back, keeping lhsDest's other doubleword.
-+  if (dword == 1) {
-+    as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
-+  } else {
-+    as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
-+  }
-+}
-+
-+// Replace f32 lane `lane` (0..3) of lhsDest with the scalar held in rhs.
-+// rhs is first converted with xscvdpspn; POWER9 then inserts it with
-+// xxinsertw, while POWER8 moves the bits to a GPR and reuses the integer
-+// doubleword-edit sequence.
-+void MacroAssembler::replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
-+                                          FloatRegister lhsDest) {
-+  MOZ_ASSERT(lane < 4);
-+  if (HasPOWER9()) {
-+    // NOTE(review): xxinsertw is documented elsewhere in this file as reading
-+    // bits[32:63] of its source; this relies on xscvdpspn also populating
-+    // that word — confirm against the ISA.
-+    ScratchSimd128Scope scratch(*this);
-+    as_xscvdpspn(scratch, rhs);
-+    as_xxinsertw(lhsDest, scratch, (3 - lane) * 4);
-+    return;
-+  }
-+  // POWER8: convert double rhs to single (lands in BE bits 0..31 of FPR),
-+  // extract bits to a GPR, then route through the integer insert path.
-+  UseScratchRegisterScope temps(asMasm());
-+  Register rhsBits = temps.Acquire();
-+  {
-+    ScratchSimd128Scope scratch(*this);
-+    as_xscvdpspn(scratch, rhs);
-+    as_mfvsrd(rhsBits, scratch);   // single is in high 32 bits of GPR
-+    x_srdi(rhsBits, rhsBits, 32);  // single → low 32 bits
-+  }
-+  // Inline the int-insert sequence (can't call replaceLaneInt32x4 from
-+  // here because we're already inside a UseScratchRegisterScope and
-+  // need to acquire a separate tmp).
-+  ScratchSimd128Scope scratch128(*this);
-+  Register tmp = temps.Acquire();
-+  unsigned dword = lane / 2;
-+  unsigned wordInDword = lane % 2;
-+  if (dword == 1) {
-+    as_mfvsrd(tmp, lhsDest);
-+  } else {
-+    // Swap the doublewords into the scratch (xxpermdi DM=2) before the move
-+    // to the GPR.
-+    as_xxpermdi(scratch128, lhsDest, lhsDest, 2);
-+    as_mfvsrd(tmp, scratch128);
-+  }
-+  // SH = 32*wordInDword, MB = 32 - 32*wordInDword: overwrite only the
-+  // selected word of tmp with rhsBits.
-+  as_rldimi(tmp, rhsBits, 32 * wordInDword, 32 - 32 * wordInDword);
-+  as_mtvsrd(scratch128, tmp);
-+  // Merge the edited doubleword back, keeping lhsDest's other doubleword.
-+  if (dword == 1) {
-+    as_xxpermdi(lhsDest, scratch128, lhsDest, 1);
-+  } else {
-+    as_xxpermdi(lhsDest, lhsDest, scratch128, 0);
-+  }
-+}
-+
-+// Overwrite f64 lane `lane` (0 or 1) of lhsDest with the scalar double that
-+// rhs holds in dw0. A single xxpermdi does the work; only the operand order
-+// and DM value depend on the lane.
-+void MacroAssembler::replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
-+                                          FloatRegister lhsDest) {
-+  MOZ_ASSERT(lane < 2);
-+  if (lane != 0) {
-+    // Lane 1 is the LE high dword (= dw0); lhsDest.dw1 (lane 0) survives.
-+    // DM=0b01 selects [rhs.dw0, lhsDest.dw1].
-+    as_xxpermdi(lhsDest, rhs, lhsDest, 1);
-+    return;
-+  }
-+  // Lane 0 is the LE low dword (= dw1); lhsDest.dw0 (lane 1) survives.
-+  // DM=0b00 selects [lhsDest.dw0, rhs.dw0].
-+  as_xxpermdi(lhsDest, lhsDest, rhs, 0);
-+}
-+
-+// Two-operand form: forwards to the four-operand shuffleInt8x16 with
-+// lhsDest serving as both the first source and the destination.
-+void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
-+                                    FloatRegister lhsDest) {
-+  shuffleInt8x16(lanes, lhsDest, rhs, lhsDest);
-+}
-+
-+// Byte shuffle of lhs/rhs into dest. `lanes[i]` names the source byte for
-+// Wasm lane i: 0..15 pick from lhs, 16..31 pick from rhs.
-+void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister lhs,
-+                                    FloatRegister rhs, FloatRegister dest) {
-+  // vperm indexes bytes big-endian (VRA[0] is the MSB, VRB follows at
-+  // 16..31), so each Wasm LE index is mirrored: lhs lane N becomes 15 - N
-+  // and rhs lane N (encoded as N + 16) becomes 31 - N = 47 - (N + 16).
-+  int8_t control[16];
-+  for (unsigned lane = 0; lane < 16; lane++) {
-+    const uint8_t selected = lanes[lane];
-+    control[lane] = (selected < 16 ? 15 : 47) - selected;
-+  }
-+  ScratchSimd128Scope permControl(*this);
-+  loadConstantSimd128(SimdConstant::CreateX16(control), permControl);
-+  // vperm takes 5-bit VR numbers, hence the `& 31` on each encoding.
-+  as_vperm(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31,
-+           permControl.encoding() & 31);
-+}
-+
-+// Bitwise select: dest takes bits of lhs where mask is 1 and bits of rhs
-+// where mask is 0, emitted as a single xxsel.
-+void MacroAssembler::laneSelectSimd128(FloatRegister mask, FloatRegister lhs,
-+                                       FloatRegister rhs, FloatRegister dest) {
-+  // xxsel: XC=0→XA, XC=1→XB → XT = (XA & ~XC) | (XB & XC)
-+  // laneSelect: dest = (lhs & mask) | (rhs & ~mask)
-+  // Need XA=rhs, XB=lhs, XC=mask.
-+  as_xxsel(dest, rhs, lhs, mask);
-+}
-+
-+// Interleave the high byte lanes of lhs and rhs into dest via vmrghb.
-+void MacroAssembler::interleaveHighInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  // On LE, vmrghb(rhs, lhs) gives Wasm interleave_high.
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghb), rhs, lhs, dest);
-+}
-+
-+// Interleave the high halfword lanes of lhs and rhs into dest via vmrghh.
-+// Operands are passed as (rhs, lhs), matching the byte variant.
-+void MacroAssembler::interleaveHighInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghh), rhs, lhs, dest);
-+}
-+
-+// Interleave the high word lanes of lhs and rhs into dest via vmrghw.
-+// Operands are passed as (rhs, lhs), matching the byte variant.
-+void MacroAssembler::interleaveHighInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrghw), rhs, lhs, dest);
-+}
-+
-+// Combine the high 64-bit lanes of lhs and rhs into dest with one xxpermdi.
-+void MacroAssembler::interleaveHighInt64x2(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  // xxpermdi DM=0: [XA.dw0, XB.dw0] = merge high dwords.
-+  // On LE: dw0 = high Wasm lane (lane 1).
-+  as_xxpermdi(dest, rhs, lhs, 0);
-+}
-+
-+// Interleave the low byte lanes of lhs and rhs into dest via vmrglb.
-+// Operands are passed as (rhs, lhs).
-+void MacroAssembler::interleaveLowInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                          FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglb), rhs, lhs, dest);
-+}
-+
-+// Interleave the low halfword lanes of lhs and rhs into dest via vmrglh.
-+// Operands are passed as (rhs, lhs).
-+void MacroAssembler::interleaveLowInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                          FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglh), rhs, lhs, dest);
-+}
-+
-+// Interleave the low word lanes of lhs and rhs into dest via vmrglw.
-+// Operands are passed as (rhs, lhs).
-+void MacroAssembler::interleaveLowInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                          FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmrglw), rhs, lhs, dest);
-+}
-+
-+// Combine the low 64-bit lanes of lhs and rhs into dest with one xxpermdi.
-+void MacroAssembler::interleaveLowInt64x2(FloatRegister lhs, FloatRegister rhs,
-+                                          FloatRegister dest) {
-+  // xxpermdi DM=3: [XA.dw1, XB.dw1] = merge low dwords.
-+  as_xxpermdi(dest, rhs, lhs, 3);
-+}
-+
-+// Concatenate lhs:rhs (rhs is the low half), shift the 32-byte value right
-+// by `shift` bytes and store the low 16 bytes in dest. `shift` must be < 16.
-+void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs,
-+                                                FloatRegister rhs,
-+                                                FloatRegister dest,
-+                                                uint32_t shift) {
-+  // Wasm semantic, with i a byte index in LE lane order:
-+  //   result[i] = (i + shift < 16) ? rhs[i + shift]
-+  //                                : lhs[i + shift - 16]
-+  // i.e. the low 16 bytes of ((lhs << 128) | rhs) >> (8 * shift).
-+  //
-+  // vsldoi VRT,VRA,VRB,SH selects 16 bytes starting at big-endian byte SH
-+  // of VRA||VRB, i.e. the high 16 bytes of ((VRA << 128) | VRB) << (8 * SH).
-+  // Keeping 16 bytes of a 32-byte value, a right shift by `shift` bytes is
-+  // the same window as a left shift by (16 - shift) bytes, so the required
-+  // instruction is vsldoi(lhs, rhs, 16 - shift). Passing `shift` itself
-+  // would yield mostly-lhs for shift == 1, which disagrees with the
-+  // shift == 0 case below (result == rhs).
-+  MOZ_ASSERT(shift < 16);
-+  if (shift == 0) {
-+    // SH is a 4-bit field, so 16 is not encodable; the result is just rhs.
-+    moveSimd128(rhs, dest);
-+    return;
-+  }
-+  // NOTE(review): assumes as_vsldoi passes SH through unmodified and only
-+  // masks the register encodings to their 5-bit VR fields — confirm.
-+  as_vsldoi(dest, lhs, rhs, 16 - shift);
-+}
-+
-+// Shift the whole 128-bit vector left by `count` bytes (count < 16).
-+void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src,
-+                                      FloatRegister dest) {
-+  MOZ_ASSERT(count.value < 16);
-+  if (count.value != 0) {
-+    // vslo shifts by whole bytes and reads its count from bits 121-124 of
-+    // VRB (byte 15, bits 1-4); vsl would shift by bits instead (count in
-+    // bits 125-127). Splatting count*8 into the scratch therefore places
-+    // the byte count where vslo expects it.
-+    ScratchSimd128Scope shiftBits(*this);
-+    SplatImm32(*this, Imm32(count.value * 8), shiftBits);
-+    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslo), src, shiftBits, dest);
-+    return;
-+  }
-+  // Zero-byte shift degenerates to a plain move.
-+  moveSimd128(src, dest);
-+}
-+
-+// Shift the whole 128-bit vector right by `count` bytes (count < 16).
-+void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
-+                                       FloatRegister dest) {
-+  MOZ_ASSERT(count.value < 16);
-+  if (count.value != 0) {
-+    // The scratch receives count*8 splatted per word, then vsro performs
-+    // the shift (mirror image of the vslo sequence in leftShiftSimd128).
-+    ScratchSimd128Scope shiftBits(*this);
-+    SplatImm32(*this, Imm32(count.value * 8), shiftBits);
-+    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsro), src, shiftBits, dest);
-+    return;
-+  }
-+  // Zero-byte shift degenerates to a plain move.
-+  moveSimd128(src, dest);
-+}
-+
-+// Unsigned widen of the low eight byte lanes of src into eight 16-bit lanes
-+// of dest.
-+void MacroAssembler::zeroExtend8x16To16x8(FloatRegister src,
-+                                          FloatRegister dest) {
-+  // Interleaving the low bytes of src with zero bytes performs the zero
-+  // extension: on LE, vmrglb(zero, src) pairs each low byte of src with a
-+  // zero byte. The scratch register supplies the zero. vmrglb reads both
-+  // sources and writes its target in one instruction, so dest may alias
-+  // src.
-+  ScratchSimd128Scope zeroes(*this);
-+  as_xxlxor(zeroes, zeroes, zeroes);
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = zeroes.encoding() & 31;
-+  const uint8_t vrb = src.encoding() & 31;
-+  as_vmrglb(vrt, vra, vrb);
-+}
-+
-+// Two-step unsigned widen: bytes → halfwords, then halfwords → words. The
-+// second step runs in place on dest.
-+void MacroAssembler::zeroExtend8x16To32x4(FloatRegister src,
-+                                          FloatRegister dest) {
-+  zeroExtend8x16To16x8(src, dest);
-+  zeroExtend16x8To32x4(dest, dest);
-+}
-+
-+// Unsigned widen bytes → doublewords by chaining zeroExtend8x16To32x4 and
-+// zeroExtend32x4To64x2; the last step runs in place on dest.
-+void MacroAssembler::zeroExtend8x16To64x2(FloatRegister src,
-+                                          FloatRegister dest) {
-+  zeroExtend8x16To32x4(src, dest);
-+  zeroExtend32x4To64x2(dest, dest);
-+}
-+
-+// Unsigned widen of the low four halfword lanes of src into four 32-bit
-+// lanes of dest: vmrglh interleaves them with zero halfwords taken from the
-+// scratch register.
-+void MacroAssembler::zeroExtend16x8To32x4(FloatRegister src,
-+                                          FloatRegister dest) {
-+  ScratchSimd128Scope zeroes(*this);
-+  as_xxlxor(zeroes, zeroes, zeroes);
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = zeroes.encoding() & 31;
-+  const uint8_t vrb = src.encoding() & 31;
-+  as_vmrglh(vrt, vra, vrb);
-+}
-+
-+// Unsigned widen halfwords → doublewords by chaining zeroExtend16x8To32x4
-+// and zeroExtend32x4To64x2; the second step runs in place on dest.
-+void MacroAssembler::zeroExtend16x8To64x2(FloatRegister src,
-+                                          FloatRegister dest) {
-+  zeroExtend16x8To32x4(src, dest);
-+  zeroExtend32x4To64x2(dest, dest);
-+}
-+
-+// Unsigned widen of the low two word lanes of src into two 64-bit lanes of
-+// dest: vmrglw interleaves them with zero words taken from the scratch
-+// register.
-+void MacroAssembler::zeroExtend32x4To64x2(FloatRegister src,
-+                                          FloatRegister dest) {
-+  ScratchSimd128Scope zeroes(*this);
-+  as_xxlxor(zeroes, zeroes, zeroes);
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = zeroes.encoding() & 31;
-+  const uint8_t vrb = src.encoding() & 31;
-+  as_vmrglw(vrt, vra, vrb);
-+}
-+
-+// Reverse the bytes inside every 16-bit lane of src (per-lane byte swap);
-+// the lanes themselves stay in place. This is the contract the shared
-+// MacroAssembler interface gives for reverseIntNxM ("reverse bytes in
-+// lanes"); the previous table reversed the order of the lanes instead.
-+void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) {
-+  // lanes[i] = source byte for result byte i, in Wasm LE byte numbering.
-+  const uint8_t lanes[] = {1, 0, 3,  2,  5,  4,  7,  6,
-+                           9, 8, 11, 10, 13, 12, 15, 14};
-+  shuffleInt8x16(lanes, src, src, dest);
-+}
-+
-+// Reverse the bytes inside every 32-bit lane of src (per-lane byte swap);
-+// the lanes themselves stay in place. This is the contract the shared
-+// MacroAssembler interface gives for reverseIntNxM ("reverse bytes in
-+// lanes"); the previous table reversed the order of the lanes instead.
-+void MacroAssembler::reverseInt32x4(FloatRegister src, FloatRegister dest) {
-+  // lanes[i] = source byte for result byte i, in Wasm LE byte numbering.
-+  const uint8_t lanes[] = {3,  2,  1, 0, 7,  6,  5,  4,
-+                           11, 10, 9, 8, 15, 14, 13, 12};
-+  shuffleInt8x16(lanes, src, src, dest);
-+}
-+
-+// Reverse the bytes inside each 64-bit lane of src (per-lane byte swap);
-+// the two lanes stay in place. The previous xxpermdi DM=2 swapped the two
-+// doublewords, which is a lane swap, not the byte reversal the shared
-+// MacroAssembler interface specifies for reverseIntNxM.
-+// NOTE(review): a single-instruction byte-reverse could replace the
-+// constant load on POWER9+ if the assembler exposes one — not visible here.
-+void MacroAssembler::reverseInt64x2(FloatRegister src, FloatRegister dest) {
-+  // lanes[i] = source byte for result byte i, in Wasm LE byte numbering.
-+  const uint8_t lanes[] = {7,  6,  5,  4,  3,  2,  1, 0,
-+                           15, 14, 13, 12, 11, 10, 9, 8};
-+  shuffleInt8x16(lanes, src, src, dest);
-+}
-+
-+// The relaxed swizzle has no dedicated sequence here; it forwards to the
-+// regular swizzleInt8x16.
-+void MacroAssembler::swizzleInt8x16Relaxed(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  swizzleInt8x16(lhs, rhs, dest);
-+}
-+
-+// extMul{Low,High}Int{8x16,16x8} use POWER8+ widening multiplies
-+// (vmul{e,o}{s,u}{b,h}) plus a halfword/word merge to map BE-indexed
-+// even/odd products into Wasm lane order on PPC64 LE.
-+//
-+// Lane mapping:
-+//   For Low (Wasm lanes from LE bytes/HW 0..N/2-1 = BE 15..N/2):
-+//     vmrgl{h,w}(even_products, odd_products) places the right products
-+//     at BE result indices, which on LE map to Wasm lanes 0..N/2-1.
-+//   For High (Wasm lanes from LE indices N/2..N-1 = BE N/2-1..0):
-+//     vmrgh{h,w} takes the upper-half BE indices instead.
-+//
-+// Aliasing safety: vmul* reads both operands before writing, so
-+// `dest = vmulo* lhs, rhs` is safe even when dest aliases lhs/rhs.
-+// We use one scratch for the even-product half because vmrgl{h,w}
-+// reads dest after the odd multiply.
-+
-+// Signed widening multiply of the low eight byte lanes into eight i16 lanes.
-+void MacroAssembler::extMulLowInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmulesb(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmulosb(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrglh(vrt, vtmp, vrt);   // merge the low halfwords of (even, odd)
-+}
-+
-+// Signed widening multiply of the high eight byte lanes into eight i16
-+// lanes.
-+void MacroAssembler::extMulHighInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmulesb(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmulosb(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrghh(vrt, vtmp, vrt);   // merge the high halfwords of (even, odd)
-+}
-+
-+// Unsigned widening multiply of the low eight byte lanes into eight i16
-+// lanes.
-+void MacroAssembler::unsignedExtMulLowInt8x16(FloatRegister lhs,
-+                                              FloatRegister rhs,
-+                                              FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmuleub(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmuloub(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrglh(vrt, vtmp, vrt);   // merge the low halfwords of (even, odd)
-+}
-+
-+// Unsigned widening multiply of the high eight byte lanes into eight i16
-+// lanes.
-+void MacroAssembler::unsignedExtMulHighInt8x16(FloatRegister lhs,
-+                                               FloatRegister rhs,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmuleub(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmuloub(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrghh(vrt, vtmp, vrt);   // merge the high halfwords of (even, odd)
-+}
-+
-+// Signed widening multiply of the low four halfword lanes into four i32
-+// lanes.
-+void MacroAssembler::extMulLowInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmulesh(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmulosh(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrglw(vrt, vtmp, vrt);   // merge the low words of (even, odd)
-+}
-+
-+// Signed widening multiply of the high four halfword lanes into four i32
-+// lanes.
-+void MacroAssembler::extMulHighInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmulesh(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmulosh(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrghw(vrt, vtmp, vrt);   // merge the high words of (even, odd)
-+}
-+
-+// Unsigned widening multiply of the low four halfword lanes into four i32
-+// lanes.
-+void MacroAssembler::unsignedExtMulLowInt16x8(FloatRegister lhs,
-+                                              FloatRegister rhs,
-+                                              FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmuleuh(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmulouh(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrglw(vrt, vtmp, vrt);   // merge the low words of (even, odd)
-+}
-+
-+// Unsigned widening multiply of the high four halfword lanes into four i32
-+// lanes.
-+void MacroAssembler::unsignedExtMulHighInt16x8(FloatRegister lhs,
-+                                               FloatRegister rhs,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope evenProducts(*this);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  as_vmuleuh(vtmp, vra, vrb);  // even-element products → scratch
-+  as_vmulouh(vrt, vra, vrb);   // odd-element products → dest
-+  as_vmrghw(vrt, vtmp, vrt);   // merge the high words of (even, odd)
-+}
-+
-+// ExtMul{Low,High}Int32x4 use vmul{e,o}{s,u}w (POWER8+) plus xxpermdi
-+// to combine the two i64 partial products into Wasm lane order on PPC64
-+// LE. xxpermdi accepts the full 6-bit VSR encoding so it works directly
-+// on Simd128 regs (encoding 32-63) without any VR staging.
-+//
-+// Aliasing safe: both vmul* reads complete before the second one writes
-+// dest, and xxpermdi reads both inputs before writing.
-+
-+// Shared body of the four i32x4 → i64x2 widening multiplies. `mulEven` and
-+// `mulOdd` emit the even/odd word multiplies (into the scratch and dest
-+// respectively); `dm` is the xxpermdi selector that combines one
-+// doubleword from each partial result into dest.
-+static void EmitExtMulInt32x4(
-+    MacroAssembler& masm, FloatRegister lhs, FloatRegister rhs,
-+    FloatRegister dest, void (*mulEven)(Assembler&, uint8_t, uint8_t, uint8_t),
-+    void (*mulOdd)(Assembler&, uint8_t, uint8_t, uint8_t), uint8_t dm) {
-+  ScratchSimd128Scope evenProducts(masm);
-+  Assembler& assembler = static_cast<Assembler&>(masm);
-+  const uint8_t vra = lhs.encoding() & 31;
-+  const uint8_t vrb = rhs.encoding() & 31;
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vtmp = evenProducts.encoding() & 31;
-+  mulEven(assembler, vtmp, vra, vrb);
-+  mulOdd(assembler, vrt, vra, vrb);
-+  // xxpermdi takes the full register objects rather than 5-bit VR numbers.
-+  masm.as_xxpermdi(dest, evenProducts, dest, dm);
-+}
-+
-+// Signed low-half widening multiply: vmulesw / vmulosw partial products
-+// combined by EmitExtMulInt32x4 with xxpermdi DM=3.
-+void MacroAssembler::extMulLowInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                      FloatRegister dest) {
-+  EmitExtMulInt32x4(
-+      *this, lhs, rhs, dest,
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmulesw(t, x, y);
-+      },
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmulosw(t, x, y);
-+      },
-+      3);
-+}
-+
-+// Signed high-half widening multiply: vmulesw / vmulosw partial products
-+// combined by EmitExtMulInt32x4 with xxpermdi DM=0.
-+void MacroAssembler::extMulHighInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  EmitExtMulInt32x4(
-+      *this, lhs, rhs, dest,
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmulesw(t, x, y);
-+      },
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmulosw(t, x, y);
-+      },
-+      0);
-+}
-+
-+// Unsigned low-half widening multiply: vmuleuw / vmulouw partial products
-+// combined by EmitExtMulInt32x4 with xxpermdi DM=3.
-+void MacroAssembler::unsignedExtMulLowInt32x4(FloatRegister lhs,
-+                                              FloatRegister rhs,
-+                                              FloatRegister dest) {
-+  EmitExtMulInt32x4(
-+      *this, lhs, rhs, dest,
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmuleuw(t, x, y);
-+      },
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmulouw(t, x, y);
-+      },
-+      3);
-+}
-+
-+// Unsigned high-half widening multiply: vmuleuw / vmulouw partial products
-+// combined by EmitExtMulInt32x4 with xxpermdi DM=0.
-+void MacroAssembler::unsignedExtMulHighInt32x4(FloatRegister lhs,
-+                                               FloatRegister rhs,
-+                                               FloatRegister dest) {
-+  EmitExtMulInt32x4(
-+      *this, lhs, rhs, dest,
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmuleuw(t, x, y);
-+      },
-+      [](Assembler& a, uint8_t t, uint8_t x, uint8_t y) {
-+        a.as_vmulouw(t, x, y);
-+      },
-+      0);
-+}
-+
-+// Q15 rounding, saturating multiply of i16x8 lanes, emitted as vmhraddshs
-+// with a zeroed scratch register as the addend.
-+void MacroAssembler::q15MulrSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  // Q15 multiply-round-saturate: vmhraddshs(a, b, zero) computes
-+  // saturate((a[i]*b[i] + 0x4000) >> 15) for each halfword.
-+  ScratchSimd128Scope scratch(*this);
-+  ZeroSimd128(*this, scratch);
-+  EmitVmxTernary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc) {
-+        a.as_vmhraddshs(vrt, vra, vrb, vrc);
-+      },
-+      lhs, rhs, scratch, dest);
-+}
-+
-+// neg = 0 - src. Use ScratchSimd128Reg (= VR0, non-allocatable) as the
-+// zero source so the register allocator sees no clobbered VRs.
-+// 2 insns: xxlxor scratch + vsubuXm dest, scratch, src. vneg{b,h}
-+// doesn't exist in any POWER ISA, hence the subtract.
-+// Per-byte negate: dest = 0 - src, with the zero held in the scratch vector.
-+void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
-+  ScratchSimd128Scope zero(*this);
-+  ZeroSimd128(*this, zero);
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = zero.encoding() & 31;
-+  const uint8_t vrb = src.encoding() & 31;
-+  as_vsububm(vrt, vra, vrb);
-+}
-+
-+// Per-halfword negate: dest = 0 - src, with the zero held in the scratch
-+// vector.
-+void MacroAssembler::negInt16x8(FloatRegister src, FloatRegister dest) {
-+  ScratchSimd128Scope zero(*this);
-+  ZeroSimd128(*this, zero);
-+  const uint8_t vrt = dest.encoding() & 31;
-+  const uint8_t vra = zero.encoding() & 31;
-+  const uint8_t vrb = src.encoding() & 31;
-+  as_vsubuhm(vrt, vra, vrb);
-+}
-+
-+// Per-word negate. Uses vnegw when HasPOWER9() holds, otherwise subtracts
-+// src from a zeroed scratch vector.
-+void MacroAssembler::negInt32x4(FloatRegister src, FloatRegister dest) {
-+  if (!HasPOWER9()) {
-+    // POWER8 fallback: 0 - src via the scratch vector.
-+    ScratchSimd128Scope zero(*this);
-+    ZeroSimd128(*this, zero);
-+    as_vsubuwm(dest.encoding() & 31, zero.encoding() & 31,
-+               src.encoding() & 31);
-+    return;
-+  }
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vnegw(vrt, vrb); },
-+      src, dest);
-+}
-+
-+// Per-doubleword negate. Uses vnegd when HasPOWER9() holds, otherwise
-+// subtracts src from a zeroed scratch vector.
-+void MacroAssembler::negInt64x2(FloatRegister src, FloatRegister dest) {
-+  if (!HasPOWER9()) {
-+    // POWER8 fallback: 0 - src via the scratch vector.
-+    ScratchSimd128Scope zero(*this);
-+    ZeroSimd128(*this, zero);
-+    as_vsubudm(dest.encoding() & 31, zero.encoding() & 31,
-+               src.encoding() & 31);
-+    return;
-+  }
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vnegd(vrt, vrb); },
-+      src, dest);
-+}
-+#undef DEF_NEG_INTNxM_VSUB
-+
-+// Unsigned saturating add on byte lanes: emits vaddubs(lhs, rhs) into dest.
-+void MacroAssembler::unsignedAddSatInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddubs), lhs, rhs, dest);
-+}
-+
-+// Unsigned saturating add on halfword lanes: emits vadduhs(lhs, rhs) into
-+// dest.
-+void MacroAssembler::unsignedAddSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduhs), lhs, rhs, dest);
-+}
-+
-+// Unsigned saturating subtract on byte lanes: emits vsububs(lhs, rhs) into
-+// dest.
-+void MacroAssembler::unsignedSubSatInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsububs), lhs, rhs, dest);
-+}
-+
-+// Unsigned saturating subtract on halfword lanes: emits vsubuhs(lhs, rhs)
-+// into dest.
-+void MacroAssembler::unsignedSubSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuhs), lhs, rhs, dest);
-+}
-+
-+// Unsigned minimum on byte lanes: emits vminub(lhs, rhs) into dest.
-+void MacroAssembler::unsignedMinInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminub), lhs, rhs, dest);
-+}
-+
-+// Unsigned minimum on halfword lanes: emits vminuh(lhs, rhs) into dest.
-+void MacroAssembler::unsignedMinInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminuh), lhs, rhs, dest);
-+}
-+
-+// Unsigned minimum on word lanes: emits vminuw(lhs, rhs) into dest.
-+void MacroAssembler::unsignedMinInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminuw), lhs, rhs, dest);
-+}
-+
-+// Unsigned maximum on byte lanes: emits vmaxub(lhs, rhs) into dest.
-+void MacroAssembler::unsignedMaxInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxub), lhs, rhs, dest);
-+}
-+
-+// Unsigned maximum on halfword lanes: emits vmaxuh(lhs, rhs) into dest.
-+void MacroAssembler::unsignedMaxInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxuh), lhs, rhs, dest);
-+}
-+
-+// Unsigned maximum on word lanes: emits vmaxuw(lhs, rhs) into dest.
-+void MacroAssembler::unsignedMaxInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxuw), lhs, rhs, dest);
-+}
-+
-+// Unsigned average on byte lanes: emits vavgub(lhs, rhs) into dest.
-+void MacroAssembler::unsignedAverageInt8x16(FloatRegister lhs,
-+                                            FloatRegister rhs,
-+                                            FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vavgub), lhs, rhs, dest);
-+}
-+
-+// Unsigned average on halfword lanes: emits vavguh(lhs, rhs) into dest.
-+void MacroAssembler::unsignedAverageInt16x8(FloatRegister lhs,
-+                                            FloatRegister rhs,
-+                                            FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vavguh), lhs, rhs, dest);
-+}
-+
-+// abs(x) = max(x, -x) per signed lane. No vabs{b,h,w,d} exists in any ISA.
-+// vneg{w,d} exist only on POWER9 and later.
-+// We use ScratchSimd128Reg as a temp for -src. Order matters: compute
-+// -src into temp first (reads src), then max(src, temp) into dest (reads
-+// src + temp, writes dest). Safe even when dest == src because src is
-+// read before dest is written by vmaxsX.
-+
-+// Per-byte absolute value: dest = max(src, 0 - src) on signed lanes.
-+void MacroAssembler::absInt8x16(FloatRegister src, FloatRegister dest) {
-+  ScratchSimd128Scope negated(*this);
-+  const uint8_t vsrc = src.encoding() & 31;
-+  const uint8_t vneg = negated.encoding() & 31;
-+  as_xxlxor(negated, negated, negated);          // negated = 0
-+  as_vsububm(vneg, vneg, vsrc);                  // negated = -src
-+  as_vmaxsb(dest.encoding() & 31, vsrc, vneg);   // dest = max(src, -src)
-+}
-+
-+// Per-halfword absolute value: dest = max(src, 0 - src) on signed lanes.
-+void MacroAssembler::absInt16x8(FloatRegister src, FloatRegister dest) {
-+  ScratchSimd128Scope negated(*this);
-+  const uint8_t vsrc = src.encoding() & 31;
-+  const uint8_t vneg = negated.encoding() & 31;
-+  as_xxlxor(negated, negated, negated);          // negated = 0
-+  as_vsubuhm(vneg, vneg, vsrc);                  // negated = -src
-+  as_vmaxsh(dest.encoding() & 31, vsrc, vneg);   // dest = max(src, -src)
-+}
-+
-+// Per-word absolute value: dest = max(src, -src) on signed lanes. The
-+// negation uses vnegw when HasPOWER9() holds, else a subtract from zero.
-+void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {
-+  ScratchSimd128Scope negated(*this);
-+  const uint8_t vsrc = src.encoding() & 31;
-+  const uint8_t vneg = negated.encoding() & 31;
-+  if (!HasPOWER9()) {
-+    as_xxlxor(negated, negated, negated);  // negated = 0
-+    as_vsubuwm(vneg, vneg, vsrc);          // negated = -src
-+  } else {
-+    as_vnegw(vneg, vsrc);                  // negated = -src
-+  }
-+  as_vmaxsw(dest.encoding() & 31, vsrc, vneg);
-+}
-+
-+// Per-doubleword absolute value: dest = max(src, -src) on signed lanes. The
-+// negation uses vnegd when HasPOWER9() holds, else a subtract from zero.
-+void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
-+  ScratchSimd128Scope negated(*this);
-+  const uint8_t vsrc = src.encoding() & 31;
-+  const uint8_t vneg = negated.encoding() & 31;
-+  if (!HasPOWER9()) {
-+    as_xxlxor(negated, negated, negated);  // negated = 0
-+    as_vsubudm(vneg, vneg, vsrc);          // negated = -src
-+  } else {
-+    as_vnegd(vneg, vsrc);                  // negated = -src
-+  }
-+  as_vmaxsd(dest.encoding() & 31, vsrc, vneg);
-+}
-+
-+// Shift every byte lane left by an immediate: the count is splatted into
-+// the scratch vector with SplatImm8, then vslb is emitted.
-+void MacroAssembler::leftShiftInt8x16(Imm32 count, FloatRegister src,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm8(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslb), src, scratch, dest);
-+}
-+
-+// Shift every halfword lane left by an immediate: the count is splatted
-+// into the scratch vector with SplatImm16, then vslh is emitted.
-+void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm16(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslh), src, scratch, dest);
-+}
-+
-+// Shift every word lane left by an immediate: the count is splatted into
-+// the scratch vector with SplatImm32, then vslw is emitted.
-+void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm32(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslw), src, scratch, dest);
-+}
-+
-+// Shift every doubleword lane left by an immediate via vsld.
-+// NOTE(review): the count is splatted per 32-bit word (SplatImm32), not per
-+// doubleword; presumably vsld only reads the low bits of each doubleword
-+// of the count vector, which a word splat also populates — confirm.
-+void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm32(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsld), src, scratch, dest);
-+}
-+
-+// Arithmetic right shift of every byte lane by an immediate: SplatImm8 the
-+// count into the scratch vector, then emit vsrab.
-+void MacroAssembler::rightShiftInt8x16(Imm32 count, FloatRegister src,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm8(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrab), src, scratch, dest);
-+}
-+
-+// Logical right shift of every byte lane by an immediate: SplatImm8 the
-+// count into the scratch vector, then emit vsrb.
-+void MacroAssembler::unsignedRightShiftInt8x16(Imm32 count, FloatRegister src,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm8(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrb), src, scratch, dest);
-+}
-+
-+// Arithmetic right shift of every halfword lane by an immediate: SplatImm16
-+// the count into the scratch vector, then emit vsrah.
-+void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm16(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrah), src, scratch, dest);
-+}
-+
-+// Logical right shift of every halfword lane by an immediate: SplatImm16
-+// the count into the scratch vector, then emit vsrh.
-+void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm16(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrh), src, scratch, dest);
-+}
-+
-+// Arithmetic right shift of every word lane by an immediate: SplatImm32 the
-+// count into the scratch vector, then emit vsraw.
-+void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm32(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsraw), src, scratch, dest);
-+}
-+
-+// Logical right shift of every word lane by an immediate: SplatImm32 the
-+// count into the scratch vector, then emit vsrw.
-+void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm32(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrw), src, scratch, dest);
-+}
-+
-+// Arithmetic right shift of every doubleword lane by an immediate via vsrad.
-+// NOTE(review): the count is splatted per 32-bit word (SplatImm32);
-+// presumably vsrad only reads the low bits of each doubleword of the count
-+// vector — confirm.
-+void MacroAssembler::rightShiftInt64x2(Imm32 count, FloatRegister src,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm32(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrad), src, scratch, dest);
-+}
-+
-+// Logical right shift of every doubleword lane by an immediate via vsrd.
-+// NOTE(review): the count is splatted per 32-bit word (SplatImm32);
-+// presumably vsrd only reads the low bits of each doubleword of the count
-+// vector — confirm.
-+void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  SplatImm32(*this, count, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrd), src, scratch, dest);
-+}
-+
-+// In-place 128-bit AND: lhsDest = lhsDest & rhs (xxland).
-+void MacroAssembler::bitwiseAndSimd128(FloatRegister rhs,
-+                                       FloatRegister lhsDest) {
-+  as_xxland(lhsDest, lhsDest, rhs);
-+}
-+
-+// Three-operand 128-bit AND: dest = lhs & rhs (xxland).
-+void MacroAssembler::bitwiseAndSimd128(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  as_xxland(dest, lhs, rhs);
-+}
-+
-+// In-place 128-bit OR: lhsDest = lhsDest | rhs (xxlor).
-+void MacroAssembler::bitwiseOrSimd128(FloatRegister rhs,
-+                                      FloatRegister lhsDest) {
-+  as_xxlor(lhsDest, lhsDest, rhs);
-+}
-+
-+// Three-operand 128-bit OR: dest = lhs | rhs (xxlor).
-+void MacroAssembler::bitwiseOrSimd128(FloatRegister lhs, FloatRegister rhs,
-+                                      FloatRegister dest) {
-+  as_xxlor(dest, lhs, rhs);
-+}
-+
-+// In-place 128-bit XOR: lhsDest = lhsDest ^ rhs (xxlxor).
-+void MacroAssembler::bitwiseXorSimd128(FloatRegister rhs,
-+                                       FloatRegister lhsDest) {
-+  as_xxlxor(lhsDest, lhsDest, rhs);
-+}
-+
-+// Three-operand 128-bit XOR: dest = lhs ^ rhs (xxlxor).
-+void MacroAssembler::bitwiseXorSimd128(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  as_xxlxor(dest, lhs, rhs);
-+}
-+
-+// 128-bit NOT, emitted as xxlnor with src as both inputs.
-+void MacroAssembler::bitwiseNotSimd128(FloatRegister src, FloatRegister dest) {
-+  as_xxlnor(dest, src, src);
-+}
-+
-+// In-place and-not: lhsDest = ~lhsDest & rhs, emitted as a single xxlandc
-+// with rhs as the first operand.
-+void MacroAssembler::bitwiseNotAndSimd128(FloatRegister rhs,
-+                                          FloatRegister lhsDest) {
-+  // notand(lhs, rhs) = ~lhs & rhs = xxlandc(rhs, lhs)
-+  as_xxlandc(lhsDest, rhs, lhsDest);
-+}
-+
-+// Set dest to 1 if any bit of src is set, else 0.
-+void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
-+  // vcmpequd. (POWER8+) against a zero vector records its summary in CR6:
-+  //   - CR6.LT (BE bit 24) = 1 iff every doubleword of src equals zero,
-+  //     i.e. src is all-zero.
-+  //   - CR6.EQ (BE bit 26) = 1 iff no doubleword compared equal.
-+  // any-true is "not all-zero", i.e. the inverse of CR6.LT.
-+  ScratchSimd128Scope zero(*this);
-+  const uint8_t vzero = zero.encoding() & 31;
-+  as_xxlxor(zero, zero, zero);
-+  as_vcmpequd_rc(vzero, src.encoding() & 31, vzero);
-+  if (!HasPOWER10()) {
-+    as_mfocrf(dest, cr6);
-+    // CR6.LT sits at BE bit 24 of the GPR. Rotating left by 25 moves it to
-+    // bit (24 - 25) mod 32 = 31 (the LSB); the 31..31 mask keeps only it.
-+    as_rlwinm(dest, dest, 25, 31, 31);
-+    // Invert: LT == 1 means all-zero, which is any-true == 0.
-+    as_xori(dest, dest, 1);
-+    return;
-+  }
-+  // setbcr materialises (CR[BI] == 0) ? 1 : 0 directly, so
-+  // dest = (CR6.LT == 0) = "not all-zero" = any-true.
-+  as_setbcr(dest, Assembler::LessThan, cr6);
-+}
-+
-+// vcmpequX. against zero sets CR6: LT = all input lanes were zero,
-+// EQ = no input lane was zero. The latter is exactly "all-true".
-+// mfocrf places CR6 at bits 24-27 of the low 32-bit half (LT=24, EQ=26).
-+// rlwinm rd,rd,27,31,31 extracts bit 26 (CR6.EQ) right-justified.
-+template <typename VmxCmpRcFn>
-+static void EmitAllTrueInt(MacroAssembler& masm, FloatRegister src,
-+                           Register dest, VmxCmpRcFn vmxCmpRc) {
-+  ScratchSimd128Scope scratch(masm);
-+  ZeroSimd128(masm, scratch);
-+  uint8_t s = scratch.encoding() & 31;
-+  vmxCmpRc(static_cast<Assembler&>(masm), s, src.encoding() & 31, s);
-+  if (HasPOWER10()) {
-+    // setbc materialises CR6.EQ directly into dest (1 insn vs the 2-insn
-+    // mfocrf + rlwinm extract). Already wired in ma_cmp_set.
-+    masm.as_setbc(dest, Assembler::Equal, cr6);
-+    return;
-+  }
-+  masm.as_mfocrf(dest, cr6);
-+  masm.as_rlwinm(dest, dest, 27, 31, 31);
-+}
-+
-+void MacroAssembler::allTrueInt8x16(FloatRegister src, Register dest) {
-+  EmitAllTrueInt(*this, src, dest,
-+                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
-+                   a.as_vcmpequb_rc(t, r, b);
-+                 });
-+}
-+
-+void MacroAssembler::allTrueInt16x8(FloatRegister src, Register dest) {
-+  EmitAllTrueInt(*this, src, dest,
-+                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
-+                   a.as_vcmpequh_rc(t, r, b);
-+                 });
-+}
-+
-+void MacroAssembler::allTrueInt32x4(FloatRegister src, Register dest) {
-+  EmitAllTrueInt(*this, src, dest,
-+                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
-+                   a.as_vcmpequw_rc(t, r, b);
-+                 });
-+}
-+
-+void MacroAssembler::allTrueInt64x2(FloatRegister src, Register dest) {
-+  EmitAllTrueInt(*this, src, dest,
-+                 [](Assembler& a, uint8_t t, uint8_t r, uint8_t b) {
-+                   a.as_vcmpequd_rc(t, r, b);
-+                 });
-+}
-+
-+void MacroAssembler::compareInt8x16(Assembler::Condition cond,
-+                                    FloatRegister rhs, FloatRegister lhsDest) {
-+  compareInt8x16(cond, lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::compareInt8x16(Assembler::Condition cond,
-+                                    FloatRegister lhs, FloatRegister rhs,
-+                                    FloatRegister dest) {
-+  if (cond == Assembler::NotEqual && HasPOWER9()) {
-+    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpneb), lhs, rhs, dest);
-+    return;
-+  }
-+  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequb),
-+                 VMX_BINARY_WRAPPER(vcmpgtsb), VMX_BINARY_WRAPPER(vcmpgtub));
-+}
-+
-+void MacroAssembler::compareInt16x8(Assembler::Condition cond,
-+                                    FloatRegister rhs, FloatRegister lhsDest) {
-+  compareInt16x8(cond, lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::compareInt16x8(Assembler::Condition cond,
-+                                    FloatRegister lhs, FloatRegister rhs,
-+                                    FloatRegister dest) {
-+  if (cond == Assembler::NotEqual && HasPOWER9()) {
-+    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpneh), lhs, rhs, dest);
-+    return;
-+  }
-+  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequh),
-+                 VMX_BINARY_WRAPPER(vcmpgtsh), VMX_BINARY_WRAPPER(vcmpgtuh));
-+}
-+
-+void MacroAssembler::compareInt32x4(Assembler::Condition cond,
-+                                    FloatRegister rhs, FloatRegister lhsDest) {
-+  compareInt32x4(cond, lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::compareInt32x4(Assembler::Condition cond,
-+                                    FloatRegister lhs, FloatRegister rhs,
-+                                    FloatRegister dest) {
-+  if (cond == Assembler::NotEqual && HasPOWER9()) {
-+    EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vcmpnew), lhs, rhs, dest);
-+    return;
-+  }
-+  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequw),
-+                 VMX_BINARY_WRAPPER(vcmpgtsw), VMX_BINARY_WRAPPER(vcmpgtuw));
-+}
-+
-+void MacroAssembler::compareFloat32x4(Assembler::Condition cond,
-+                                      FloatRegister rhs,
-+                                      FloatRegister lhsDest) {
-+  compareFloat32x4(cond, lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::compareFloat32x4(Assembler::Condition cond,
-+                                      FloatRegister lhs, FloatRegister rhs,
-+                                      FloatRegister dest) {
-+  switch (cond) {
-+    case Assembler::Equal:
-+      as_xvcmpeqsp(dest, lhs, rhs);
-+      break;
-+    case Assembler::NotEqual:
-+      as_xvcmpeqsp(dest, lhs, rhs);
-+      bitwiseNotSimd128(dest, dest);
-+      break;
-+    case Assembler::GreaterThan:
-+      as_xvcmpgtsp(dest, lhs, rhs);
-+      break;
-+    case Assembler::GreaterThanOrEqual:
-+      as_xvcmpgesp(dest, lhs, rhs);
-+      break;
-+    case Assembler::LessThan:
-+      as_xvcmpgtsp(dest, rhs, lhs);
-+      break;
-+    case Assembler::LessThanOrEqual:
-+      as_xvcmpgesp(dest, rhs, lhs);
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected SIMD float condition");
-+  }
-+}
-+
-+void MacroAssembler::compareFloat64x2(Assembler::Condition cond,
-+                                      FloatRegister rhs,
-+                                      FloatRegister lhsDest) {
-+  compareFloat64x2(cond, lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::compareFloat64x2(Assembler::Condition cond,
-+                                      FloatRegister lhs, FloatRegister rhs,
-+                                      FloatRegister dest) {
-+  switch (cond) {
-+    case Assembler::Equal:
-+      as_xvcmpeqdp(dest, lhs, rhs);
-+      break;
-+    case Assembler::NotEqual:
-+      as_xvcmpeqdp(dest, lhs, rhs);
-+      bitwiseNotSimd128(dest, dest);
-+      break;
-+    case Assembler::GreaterThan:
-+      as_xvcmpgtdp(dest, lhs, rhs);
-+      break;
-+    case Assembler::GreaterThanOrEqual:
-+      as_xvcmpgedp(dest, lhs, rhs);
-+      break;
-+    case Assembler::LessThan:
-+      as_xvcmpgtdp(dest, rhs, lhs);
-+      break;
-+    case Assembler::LessThanOrEqual:
-+      as_xvcmpgedp(dest, rhs, lhs);
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected SIMD float condition");
-+  }
-+}
-+
-+void MacroAssembler::negFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvnegsp(dest, src);
-+}
-+
-+void MacroAssembler::negFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvnegdp(dest, src);
-+}
-+
-+void MacroAssembler::absFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvabssp(dest, src);
-+}
-+
-+void MacroAssembler::absFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvabsdp(dest, src);
-+}
-+
-+// Per spec:
-+//   result[k] = (s|u)ext_widen(src[2k]) + (s|u)ext_widen(src[2k+1])
-+// POWER lacks pairwise multiply-add. Emulate via vmulX{e,o}X(src, splat(1))
-+// + vadd. Both vmuls need `src` AND `splat(1)` available simultaneously.
-+//
-+// Available SIMD slots without involving Lowering:
-+//   - ScratchSimd128Reg (VR0, non-allocatable)
-+//   - dest, src
-+// That's 3 regs when dest != src — enough for {src, splat, intermediate}.
-+// When dest == src we stash src and the even product to the 288-byte ELFv2
-+// red zone and rebuild splat(1).
-+//
-+// (Earlier implementations of these helpers routed through hardcoded
-+// VR1/VR2/VR3 via xxlor_vsr — faster but stomped allocator-managed VRs
-+// and silently corrupted any live wasm v128 the allocator had placed
-+// there. ScratchSimd128Reg + red-zone stash is the safe contract.)
-+// Always-safe pattern: stash src to red zone so dest can be freely overwritten,
-+// stash even to red zone after first vmul so we can rebuild splat(1) for the
-+// second vmul. The splat-of-1 is now `vspltis{b,h}` (5-bit signed immediate
-+// splat) — 1 insn vs the 3-insn movePtr+mtvsrd+vsplt sequence the previous
-+// path used.
-+// Pattern: stash src to red zone slot 0 so dest can be freely overwritten;
-+// vmul-even (signed/unsigned) of src with splat(1) produces sign/zero-extended
-+// even-lane products into dest; stash that to slot 1 and rebuild scratch=src
-+// (slot 0) and dest=splat(1); vmul-odd produces the odd products; restore
-+// even from slot 1 and pairwise-add.
-+void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src,
-+                                           FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  uint8_t s = scratch.encoding() & 31;
-+  uint8_t srcEnc = src.encoding() & 31;
-+  uint8_t destEnc = dest.encoding() & 31;
-+  RedZoneStashSimd128(*this, src, 0);
-+  as_vspltisb(s, 1);
-+  as_vmulesb(destEnc, srcEnc, s);
-+  RedZoneStashSimd128(*this, dest, 1);
-+  RedZoneRestoreSimd128(*this, 0, scratch);
-+  as_vspltisb(destEnc, 1);
-+  as_vmulosb(destEnc, s, destEnc);
-+  RedZoneRestoreSimd128(*this, 1, scratch);
-+  as_vadduhm(destEnc, destEnc, s);
-+}
-+
-+void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src,
-+                                                   FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  uint8_t s = scratch.encoding() & 31;
-+  uint8_t srcEnc = src.encoding() & 31;
-+  uint8_t destEnc = dest.encoding() & 31;
-+  RedZoneStashSimd128(*this, src, 0);
-+  as_vspltisb(s, 1);
-+  as_vmuleub(destEnc, srcEnc, s);
-+  RedZoneStashSimd128(*this, dest, 1);
-+  RedZoneRestoreSimd128(*this, 0, scratch);
-+  as_vspltisb(destEnc, 1);
-+  as_vmuloub(destEnc, s, destEnc);
-+  RedZoneRestoreSimd128(*this, 1, scratch);
-+  as_vadduhm(destEnc, destEnc, s);
-+}
-+
-+// vmsumshm/vmsumuhm collapse the i16x8 → i32x4 pairwise-add into a single
-+// multiply-sum: VT.i32[k] = VRA.i16[2k]*VRB.i16[2k] +
-+// VRA.i16[2k+1]*VRB.i16[2k+1]
-+// + VRC.i32[k]. With VRB = splat(1) and VRC = 0 this is exactly the wasm
-+// i32x4.extadd_pairwise_i16x8_{s,u} contract. 3 insns when dest != src;
-+// LWasmUnarySimd128 uses useRegisterAtStart so dest may alias src — in that
-+// case we put splat(1) into scratch (preserving src in dest) and use a
-+// red-zone slot for the zero VRC operand.
-+void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src,
-+                                           FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  if (dest != src) {
-+    as_xxlxor(scratch, scratch, scratch);  // scratch = 0 (VRC addend)
-+    as_vspltish(dest.encoding() & 31, 1);  // dest = splat(1) (VRB multiplier)
-+    as_vmsumshm(dest.encoding() & 31, src.encoding() & 31, dest.encoding() & 31,
-+                scratch.encoding() & 31);
-+    return;
-+  }
-+  // dest == src: load splat(1) into scratch instead, stash zero to the red
-+  // zone, restore zero into scratch after the splat is consumed... actually
-+  // simpler: use vmule/vmulo + vadd trio with red zone. Same shape as the
-+  // pre-vmsumshm fallback for i8x16.
-+  uint8_t s = scratch.encoding() & 31;
-+  uint8_t srcEnc = src.encoding() & 31;
-+  uint8_t destEnc = dest.encoding() & 31;
-+  RedZoneStashSimd128(*this, src, 0);
-+  as_vspltish(s, 1);
-+  as_vmulesh(destEnc, srcEnc, s);
-+  RedZoneStashSimd128(*this, dest, 1);
-+  RedZoneRestoreSimd128(*this, 0, scratch);
-+  as_vspltish(destEnc, 1);
-+  as_vmulosh(destEnc, s, destEnc);
-+  RedZoneRestoreSimd128(*this, 1, scratch);
-+  as_vadduwm(destEnc, destEnc, s);
-+}
-+
-+void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src,
-+                                                   FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  if (dest != src) {
-+    as_xxlxor(scratch, scratch, scratch);
-+    as_vspltish(dest.encoding() & 31, 1);
-+    as_vmsumuhm(dest.encoding() & 31, src.encoding() & 31, dest.encoding() & 31,
-+                scratch.encoding() & 31);
-+    return;
-+  }
-+  uint8_t s = scratch.encoding() & 31;
-+  uint8_t srcEnc = src.encoding() & 31;
-+  uint8_t destEnc = dest.encoding() & 31;
-+  RedZoneStashSimd128(*this, src, 0);
-+  as_vspltish(s, 1);
-+  as_vmuleuh(destEnc, srcEnc, s);
-+  RedZoneStashSimd128(*this, dest, 1);
-+  RedZoneRestoreSimd128(*this, 0, scratch);
-+  as_vspltish(destEnc, 1);
-+  as_vmulouh(destEnc, s, destEnc);
-+  RedZoneRestoreSimd128(*this, 1, scratch);
-+  as_vadduwm(destEnc, destEnc, s);
-+}
-+
-+void MacroAssembler::sqrtFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvsqrtsp(dest, src);
-+}
-+
-+void MacroAssembler::sqrtFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvsqrtdp(dest, src);
-+}
-+
-+void MacroAssembler::convertInt32x4ToFloat32x4(FloatRegister src,
-+                                               FloatRegister dest) {
-+  as_xvcvsxwsp(dest, src);
-+}
-+
-+void MacroAssembler::unsignedConvertInt32x4ToFloat32x4(FloatRegister src,
-+                                                       FloatRegister dest) {
-+  as_xvcvuxwsp(dest, src);
-+}
-+
-+// i32x4 (low 2 lanes) → f64x2. Wasm `f64x2.convert_low_i32x4_{s,u}`.
-+// xvcv{s,u}xwdp converts BE word 0 and BE word 2 of source to doubles in
-+// BE dwords 0 and 1. vmrglw places src.word_BE[2,3] at the read positions,
-+// matching the f32→f64 promote shape:
-+//   vmrglw    scratch, src, src    ; BE words 2,3 of src → BE words 0,2 of
-+//   scratch xvcv*xwdp dest, scratch        ; convert both, place in BE dwords
-+//   0,1
-+// Output BE dwords land as [convert(input lane 1), convert(input lane 0)],
-+// which on PPC64LE storage IS the wasm output layout.
-+//
-+// 2 insns each, single ScratchSimd128 scope, no GPR or FPR scratch.
-+// All ops POWER7+. dest==src aliasing safe (vmrglw consumes src into
-+// scratch before dest is written).
-+void MacroAssembler::convertInt32x4ToFloat64x2(FloatRegister src,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
-+  as_xvcvsxwdp(dest, scratch);
-+}
-+
-+void MacroAssembler::unsignedConvertInt32x4ToFloat64x2(FloatRegister src,
-+                                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
-+  as_xvcvuxwdp(dest, scratch);
-+}
-+
-+void MacroAssembler::truncSatFloat32x4ToInt32x4(FloatRegister src,
-+                                                FloatRegister dest) {
-+  // xvcvspsxws gives INT32_MIN for NaN, but Wasm requires 0.
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpeqsp(scratch, src, src);  // ~0 for non-NaN, 0 for NaN
-+  as_xvcvspsxws(dest, src);
-+  as_xxland(dest, dest, scratch);  // zero NaN lanes
-+}
-+
-+// Pack the two "interesting" 32-bit results that xvcv*xws / xvcvdpsp leaves
-+// at scratch.word_BE[0] (= A) and scratch.word_BE[2] (= B) into a zeroed dest
-+// as dest.word_BE = [0, 0, A, B]. This is the layout wasm requires for
-+// f64x2 → {i32x4 trunc_sat, f32x4 demote}. Writes dest, consumes scratch.
-+//
-+// POWER9 path (4 insns) uses xxinsertw/xxextractuw. POWER8 path (7 insns)
-+// goes via two GPR round-trips: extract A and B with mfvsrd, splice them
-+// into a single dword with rldimi, mtvsrd back into a SIMD reg, and
-+// xxpermdi the result into dest.dw1 while keeping dest.dw0 zero.
-+static inline void PackTwoWordsToLowHalf(MacroAssembler& masm,
-+                                         FloatRegister scratch,
-+                                         FloatRegister dest) {
-+  if (HasPOWER9()) {
-+    masm.as_xxinsertw(dest, scratch,
-+                      8);  // dest.word_BE[2] ← scratch.word_BE[1] (= A)
-+    masm.as_xxextractuw(scratch, scratch,
-+                        8);  // scratch.word_BE[1] ← scratch.word_BE[2] (= B)
-+    masm.as_xxinsertw(dest, scratch,
-+                      12);  // dest.word_BE[3] ← scratch.word_BE[1] (= B)
-+    return;
-+  }
-+  // POWER8: xxinsertw/xxextractuw are ISA 3.0. Take a GPR detour instead.
-+  // scratch.dw_BE[0] = (A << 32) | A, scratch.dw_BE[1] = (B << 32) | B.
-+  UseScratchRegisterScope temps(masm);
-+  Register tmpA = temps.Acquire();
-+  Register tmpB = temps.Acquire();
-+  masm.as_mfvsrd(tmpA, scratch);  // tmpA = (A << 32) | A
-+  masm.as_xxpermdi(scratch, scratch, scratch,
-+                   2);            // swap dwords: now dw0 = (B<<32)|B
-+  masm.as_mfvsrd(tmpB, scratch);  // tmpB = (B << 32) | B
-+  masm.x_srdi(tmpA, tmpA, 32);    // tmpA = 0x00000000_AAAAAAAA
-+  masm.as_rldimi(tmpB, tmpA, 32,
-+                 0);              // tmpB[0..31] = A; tmpB[32..63] = B (kept)
-+  masm.as_mtvsrd(scratch, tmpB);  // scratch.dw_BE[0] = (A << 32) | B; dw1 = 0
-+  masm.as_xxpermdi(dest, dest, scratch,
-+                   0);  // dest = {dest.dw0=0, scratch.dw0} = [0, 0, A, B]
-+}
-+
-+// fctiwz / fcmpu / fctiduz are X-form scalar FP instructions that only
-+// encode 5-bit FRT/FRB fields, so emitting them on a Simd128 reg
-+// (encoding 32+) would corrupt the opcode. Bridge through
-+// ScratchDoubleReg (FPR f0) for the conversion. Extract both lanes' GPR
-+// results before writing dest so that dest == src is safe.
-+//
-+// Avoid replaceLaneInt32x4 on the tail: on POWER8 it needs an extra
-+// GPR scratch, but r11 and r12 are already held as a/b here. Pack both
-+// int32s into `a` with rldimi, transfer via mtvsrd, then xxpermdi the
-+// DWs into the low half so wasm lane 0 (BE W3) holds a, lane 1 (W2) b.
-+void MacroAssembler::truncSatFloat64x2ToInt32x4(FloatRegister src,
-+                                                FloatRegister dest,
-+                                                FloatRegister temp) {
-+  // Wasm `i32x4.trunc_sat_f64x2_s_zero`. xvcvdpsxws saturates to INT32_MIN
-+  // on overflow/NaN (per ISA); wasm requires NaN → 0, so a per-dword NaN
-+  // mask via xvcmpeqdp clamps NaN lanes to 0 before laying out the result.
-+  // Output BE word positions need wasm lane order: lane 1 → BE word 2,
-+  // lane 0 → BE word 3. xvcvdpsxws lands its results at BE words 0 and 2
-+  // (with replication into 1/3); PackTwoWordsToLowHalf moves them into
-+  // the right positions while zeroing the rest.
-+  // dest==src safe: src is consumed by xvcvdpsxws and xvcmpeqdp before
-+  // dest is zeroed.
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcvdpsxws(scratch, src);
-+  as_xvcmpeqdp(dest, src,
-+               src);  // NaN-mask: 0xFF...F per dword for non-NaN, 0 for NaN
-+  as_xxland(scratch, scratch, dest);
-+  as_xxlxor(dest, dest, dest);
-+  PackTwoWordsToLowHalf(*this, scratch, dest);
-+}
-+
-+void MacroAssembler::unsignedTruncSatFloat64x2ToInt32x4(FloatRegister src,
-+                                                        FloatRegister dest,
-+                                                        FloatRegister temp) {
-+  // Wasm `i32x4.trunc_sat_f64x2_u_zero`. xvcvdpuxws semantics already
-+  // match the wasm spec without any masking: NaN → 0, negative → 0,
-+  // positive overflow → UINT32_MAX. So no NaN mask needed; just position
-+  // the saturated results into BE words 2,3 with zeros at words 0,1.
-+  // dest==src safe: src consumed by xvcvdpuxws before dest is zeroed.
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcvdpuxws(scratch, src);
-+  as_xxlxor(dest, dest, dest);
-+  PackTwoWordsToLowHalf(*this, scratch, dest);
-+}
-+
-+void MacroAssembler::truncFloat32x4ToInt32x4Relaxed(FloatRegister src,
-+                                                    FloatRegister dest) {
-+  truncSatFloat32x4ToInt32x4(src, dest);
-+}
-+
-+void MacroAssembler::unsignedTruncFloat32x4ToInt32x4Relaxed(
-+    FloatRegister src, FloatRegister dest) {
-+  unsignedTruncSatFloat32x4ToInt32x4(src, dest);
-+}
-+
-+void MacroAssembler::truncFloat64x2ToInt32x4Relaxed(FloatRegister src,
-+                                                    FloatRegister dest) {
-+  truncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
-+}
-+
-+void MacroAssembler::unsignedTruncFloat64x2ToInt32x4Relaxed(
-+    FloatRegister src, FloatRegister dest) {
-+  unsignedTruncSatFloat64x2ToInt32x4(src, dest, ScratchSimd128Reg);
-+}
-+
-+// f64x2 → f32x4 (low 2 lanes; high lanes zero). Wasm `f32x4.demote_f64x2_zero`.
-+// xvcvdpsp converts both doubles in one shot, replicating each result across
-+// its dword: BE word lanes = [s(in.dw0), s(in.dw0), s(in.dw1), s(in.dw1)].
-+// On PPC64LE wasm storage (lxvx-loaded), input.dw_BE[0] = wasm lane 1 and
-+// input.dw_BE[1] = wasm lane 0, so we get [s(l1), s(l1), s(l0), s(l0)] in
-+// BE word order. We then zero dest and pack s(l1) into BE word 2 (wasm
-+// output lane 1) and s(l0) into BE word 3 (wasm output lane 0) via the
-+// shared PackTwoWordsToLowHalf helper, which has POWER9 and POWER8 paths.
-+//
-+// dest==src aliasing safe: src is consumed by xvcvdpsp before dest is zeroed.
-+void MacroAssembler::convertFloat64x2ToFloat32x4(FloatRegister src,
-+                                                 FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcvdpsp(scratch, src);
-+  ZeroSimd128(*this, dest);
-+  PackTwoWordsToLowHalf(*this, scratch, dest);
-+}
-+
-+// f32x4 (low 2 lanes) → f64x2. Wasm `f64x2.promote_low_f32x4`. xvcvspdp
-+// converts both BE word 0 and BE word 2 of its source to doubles in BE
-+// dwords 0 and 1 respectively. To get wasm lanes 0 and 1 (= input BE
-+// words 3 and 2) into those source positions, vmrglw merges low words:
-+// VRT.word[0] = VRA.word[2] = wasm lane 1, VRT.word[2] = VRA.word[3] =
-+// wasm lane 0 (with replicated copies in odd word slots that xvcvspdp
-+// ignores). Output BE dwords land as [double(lane1), double(lane0)],
-+// which on PPC64LE storage is exactly the wasm f64x2 output layout.
-+//
-+// dest==src aliasing safe: vmrglw consumes src into a separate scratch
-+// before dest is written.
-+//
-+// 2 insns, single ScratchSimd128 scope. All ops POWER7+.
-+void MacroAssembler::convertFloat32x4ToFloat64x2(FloatRegister src,
-+                                                 FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_vmrglw(scratch.encoding() & 31, src.encoding() & 31, src.encoding() & 31);
-+  as_xvcvspdp(dest, scratch);
-+}
-+
-+void MacroAssembler::unsignedNarrowInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  // On LE, VMX pack swaps operand order vs Wasm convention.
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkshus), rhs, lhs, dest);
-+}
-+
-+void MacroAssembler::unsignedNarrowInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  // On LE, VMX pack swaps operand order vs Wasm convention.
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkswus), rhs, lhs, dest);
-+}
-+
-+void MacroAssembler::widenLowInt8x16(FloatRegister src, FloatRegister dest) {
-+  // On PPC64 LE, raw vupklsb unpacks the LOW Wasm lanes (not vupkhsb).
-+  // GCC vec_unpackh maps to vupklsb on LE (swapped from BE naming).
-+  // Raw vupklsb([1..8,-1..-8]) = [1,2,3,4,5,6,7,8].
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsb(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::widenHighInt8x16(FloatRegister src, FloatRegister dest) {
-+  // On PPC64 LE, raw vupkhsb unpacks the HIGH Wasm lanes.
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsb(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::unsignedWidenLowInt8x16(FloatRegister src,
-+                                             FloatRegister dest) {
-+  zeroExtend8x16To16x8(src, dest);
-+}
-+
-+void MacroAssembler::unsignedWidenHighInt8x16(FloatRegister src,
-+                                              FloatRegister dest) {
-+  // vmrghb(zero, src) interleaves zero bytes with the BE-high half of src,
-+  // producing zero-extended halfwords of the LE-high (Wasm-high) lanes.
-+  ScratchSimd128Scope scratch(*this);
-+  as_xxlxor(scratch, scratch, scratch);
-+  as_vmrghb(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
-+}
-+
-+void MacroAssembler::widenLowInt16x8(FloatRegister src, FloatRegister dest) {
-+  // On PPC64 LE, raw vupklsh unpacks LOW Wasm lanes (GCC swaps h/l on LE).
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsh(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::widenHighInt16x8(FloatRegister src, FloatRegister dest) {
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsh(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::unsignedWidenLowInt16x8(FloatRegister src,
-+                                             FloatRegister dest) {
-+  zeroExtend16x8To32x4(src, dest);
-+}
-+
-+void MacroAssembler::unsignedWidenHighInt16x8(FloatRegister src,
-+                                              FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xxlxor(scratch, scratch, scratch);
-+  as_vmrghh(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
-+}
-+
-+void MacroAssembler::widenLowInt32x4(FloatRegister src, FloatRegister dest) {
-+  // On PPC64 LE, raw vupklsw unpacks LOW Wasm lanes.
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupklsw(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src,
-+                                             FloatRegister dest) {
-+  zeroExtend32x4To64x2(src, dest);
-+}
-+
-+void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vupkhsw(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
-+                                              FloatRegister dest) {
-+  // i64x2.extend_high_i32x4_u: take high 2 i32 lanes of src, zero-extend
-+  // to i64 each. Use vmrghw to interleave a zero VR with src — same shape
-+  // as the (already-correct) unsignedWidenHighInt16x8 sibling above.
-+  ScratchSimd128Scope scratch(*this);
-+  ZeroSimd128(*this, scratch);
-+  as_vmrghw(dest.encoding() & 31, scratch.encoding() & 31, src.encoding() & 31);
-+}
-+
-+void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
-+                                        FloatRegister lhsOrLhsDest) {
-+  // pmin: result[i] = rhs[i] < lhs[i] ? rhs[i] : lhs[i]
-+  // xvcmpgtsp(mask, lhs, rhs) → 1 where lhs > rhs (i.e., rhs < lhs)
-+  // xxsel: mask=1 → XB=rhs. mask=0 → XA=lhs.
-+  // Result goes to lhsOrLhsDest (second param).
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtsp(scratch, lhsOrLhsDest, rhsOrRhsDest);
-+  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
-+}
-+
-+void MacroAssembler::pseudoMinFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  // pmin(lhs, rhs) = rhs < lhs ? rhs : lhs
-+  // Inline to handle dest aliasing with either operand.
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtsp(scratch, lhs, rhs);
-+  // mask=1 where lhs > rhs. XC=1 → select XB=rhs. XC=0 → select XA=lhs.
-+  as_xxsel(dest, lhs, rhs, scratch);
-+}
-+
-+void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
-+                                        FloatRegister lhsOrLhsDest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtdp(scratch, lhsOrLhsDest, rhsOrRhsDest);
-+  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
-+}
-+
-+void MacroAssembler::pseudoMinFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtdp(scratch, lhs, rhs);
-+  as_xxsel(dest, lhs, rhs, scratch);
-+}
-+
-+void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
-+                                        FloatRegister lhsOrLhsDest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtsp(scratch, rhsOrRhsDest, lhsOrLhsDest);
-+  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
-+}
-+
-+void MacroAssembler::pseudoMaxFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  // pmax(lhs, rhs) = lhs < rhs ? rhs : lhs
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtsp(scratch, rhs, lhs);
-+  // mask=1 where rhs > lhs (lhs < rhs). XC=1 → select XB=rhs. XC=0 → select
-+  // XA=lhs.
-+  as_xxsel(dest, lhs, rhs, scratch);
-+}
-+
-+void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
-+                                        FloatRegister lhsOrLhsDest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtdp(scratch, rhsOrRhsDest, lhsOrLhsDest);
-+  as_xxsel(lhsOrLhsDest, lhsOrLhsDest, rhsOrRhsDest, scratch);
-+}
-+
-+void MacroAssembler::pseudoMaxFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                        FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  as_xvcmpgtdp(scratch, rhs, lhs);
-+  as_xxsel(dest, lhs, rhs, scratch);
-+}
-+
-+void MacroAssembler::dotInt8x16Int7x16(FloatRegister lhs, FloatRegister rhs,
-+                                       FloatRegister dest) {
-+  // result[k] = lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1] for k=0..7.
-+  // vmulesb/vmulosb produce even/odd byte products as i16 in matching
-+  // halfword lanes; vadduhm sums them pairwise.
-+  ScratchSimd128Scope scratch(*this);
-+  uint8_t l = lhs.encoding() & 31;
-+  uint8_t r = rhs.encoding() & 31;
-+  uint8_t d = dest.encoding() & 31;
-+  uint8_t s = scratch.encoding() & 31;
-+  as_vmulesb(s, l, r);
-+  as_vmulosb(d, l, r);
-+  as_vadduhm(d, s, d);
-+}
-+
-+void MacroAssembler::ceilFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvrspip(dest, src);
-+}
-+
-+void MacroAssembler::ceilFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvrdpip(dest, src);
-+}
-+
-+void MacroAssembler::floorFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvrspim(dest, src);
-+}
-+
-+void MacroAssembler::floorFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvrdpim(dest, src);
-+}
-+
-+void MacroAssembler::truncFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvrspiz(dest, src);
-+}
-+
-+void MacroAssembler::truncFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvrdpiz(dest, src);
-+}
-+
-+void MacroAssembler::nearestFloat32x4(FloatRegister src, FloatRegister dest) {
-+  as_xvrspic(dest, src);
-+}
-+
-+void MacroAssembler::nearestFloat64x2(FloatRegister src, FloatRegister dest) {
-+  as_xvrdpic(dest, src);
-+}
-+
-+void MacroAssembler::fnmaFloat32x4(FloatRegister src1, FloatRegister src2,
-+                                   FloatRegister srcDest) {
-+  as_xvnmsubasp(srcDest, src1, src2);
-+}
-+
-+void MacroAssembler::fnmaFloat64x2(FloatRegister src1, FloatRegister src2,
-+                                   FloatRegister srcDest) {
-+  as_xvnmsubadp(srcDest, src1, src2);
-+}
-+
-+void MacroAssembler::minFloat32x4Relaxed(FloatRegister src,
-+                                         FloatRegister srcDest) {
-+  as_xvminsp(srcDest, srcDest, src);
-+}
-+
-+void MacroAssembler::minFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
-+                                         FloatRegister dest) {
-+  as_xvminsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::maxFloat32x4Relaxed(FloatRegister src,
-+                                         FloatRegister srcDest) {
-+  as_xvmaxsp(srcDest, srcDest, src);
-+}
-+
-+void MacroAssembler::maxFloat32x4Relaxed(FloatRegister lhs, FloatRegister rhs,
-+                                         FloatRegister dest) {
-+  as_xvmaxsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::minFloat64x2Relaxed(FloatRegister src,
-+                                         FloatRegister srcDest) {
-+  as_xvmindp(srcDest, srcDest, src);
-+}
-+
-+void MacroAssembler::minFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
-+                                         FloatRegister dest) {
-+  as_xvmindp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::maxFloat64x2Relaxed(FloatRegister src,
-+                                         FloatRegister srcDest) {
-+  as_xvmaxdp(srcDest, srcDest, src);
-+}
-+
-+void MacroAssembler::maxFloat64x2Relaxed(FloatRegister lhs, FloatRegister rhs,
-+                                         FloatRegister dest) {
-+  as_xvmaxdp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::q15MulrInt16x8Relaxed(FloatRegister lhs, FloatRegister rhs,
-+                                           FloatRegister dest) {
-+  q15MulrSatInt16x8(lhs, rhs, dest);
-+}
-+
-+// SIMD overloads accepting an extra FloatRegister temp (shared-header signature
-+// used by x86; on PPC64 the temp is unused for most of these).
-+void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest,
-+                                   FloatRegister temp) {
-+  popcntInt8x16(src, dest);
-+}
-+
-+void MacroAssembler::unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
-+                                                        FloatRegister dest,
-+                                                        FloatRegister temp) {
-+  unsignedTruncSatFloat32x4ToInt32x4(src, dest);
-+}
-+
-+void MacroAssembler::dotInt8x16Int7x16ThenAdd(FloatRegister lhs,
-+                                              FloatRegister rhs,
-+                                              FloatRegister dest,
-+                                              FloatRegister temp) {
-+  // dest += pairwise_widen_i16_to_i32(dot_i8x16(lhs, rhs)).
-+  //
-+  // Step 1: i16x8 dot of i8 byte pairs (vmulesb/vmulosb/vadduhm). Keeps
-+  // the existing signed-byte multiply semantics that match ARM64 sdot
-+  // and x86 vpdpbssd (vmsummbm would be signed×unsigned and diverge for
-+  // i7 lanes that bit-pattern as negative).
-+  //
-+  // Step 2: vmsumshm dest, dot, splat_hw(1), dest computes
-+  //   dest.i32[k] = dest.i32[k] + dot.i16[2k]*1 + dot.i16[2k+1]*1
-+  // which is exactly pairwise widen + accumulate in a single insn.
-+  // splat_hw(1) is a single vspltish (5-bit SIMM splat to all 8 halfwords).
-+  ScratchSimd128Scope scratch(*this);
-+  uint8_t l = lhs.encoding() & 31;
-+  uint8_t r = rhs.encoding() & 31;
-+  uint8_t d = dest.encoding() & 31;
-+  uint8_t s = scratch.encoding() & 31;
-+  uint8_t t = temp.encoding() & 31;
-+
-+  as_vmulesb(s, l, r);
-+  as_vmulosb(t, l, r);
-+  as_vadduhm(t, s, t);
-+  as_vspltish(s, 1);
-+  as_vmsumshm(d, t, s, d);
-+}
-+
-+// SIMD ops ported from arm64- and x86/x64-shaped signatures.
-+
-+void MacroAssembler::permuteInt16x8(const uint16_t lanes[8], FloatRegister src,
-+                                    FloatRegister dest) {
-+  uint8_t shuffleLanes[16];
-+  for (unsigned i = 0; i < 8; i++) {
-+    shuffleLanes[i * 2] = lanes[i] * 2;
-+    shuffleLanes[i * 2 + 1] = lanes[i] * 2 + 1;
-+  }
-+  shuffleInt8x16(shuffleLanes, src, src, dest);
-+}
-+
-+void MacroAssembler::rotateRightSimd128(FloatRegister src, FloatRegister dest,
-+                                        uint32_t shift) {
-+  MOZ_ASSERT(shift < 16);
-+  if (shift == 0) {
-+    moveSimd128(src, dest);
-+    return;
-+  }
-+  // vsldoi VRT,VRA,VRB,SH: concatenate VRA||VRB, take bytes [SH..SH+15].
-+  // Rotate right by N = vsldoi(src, src, 16-N).
-+  as_vsldoi(dest, src, src, 16 - shift);
-+}
-+
-+void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest, FloatRegister temp1,
-+                                FloatRegister temp2) {
-+  // POWER10 collapses the entire i64x2 multiply to a single vmulld.
-+  // POWER9/POWER8 fall back to the GPR round-trip path: extract each
-+  // lane pair into GPRs (mfvsrld for LE-dw0/Wasm-lane-0, mfvsrd for
-+  // LE-dw1/lane-1), multiply, and reassemble via mtvsrd + xxpermdi.
-+  if (HasPOWER10()) {
-+    as_vmulld(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31);
-+    return;
-+  }
-+  // Aliasing safety: stash the lane-0 product in ScratchSimd128 (which
-+  // is non-allocatable, so cannot alias lhs/rhs) and only write dest at
-+  // the very end, after both lhs and rhs have been fully consumed.
-+  ScratchSimd128Scope scratch(*this);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register a = temps.Acquire();
-+  Register b = temps.Acquire();
-+
-+  if (HasPOWER9()) {
-+    as_mfvsrld(a, lhs);
-+    as_mfvsrld(b, rhs);
-+  } else {
-+    as_xxpermdi(scratch, lhs, lhs, 2);
-+    as_mfvsrd(a, scratch);
-+    as_xxpermdi(scratch, rhs, rhs, 2);
-+    as_mfvsrd(b, scratch);
-+  }
-+  as_mulld(a, a, b);
-+  as_mtvsrd(scratch, a);
-+
-+  as_mfvsrd(a, lhs);
-+  as_mfvsrd(b, rhs);
-+  as_mulld(a, a, b);
-+  as_mtvsrd(dest, a);
-+  as_xxpermdi(dest, dest, scratch, 0);
-+}
-+
-+void MacroAssembler::bitwiseAndNotSimd128(FloatRegister lhs, FloatRegister rhs,
-+                                          FloatRegister dest) {
-+  // andnot(lhs, rhs) = lhs & ~rhs = xxlandc(lhs, rhs)
-+  as_xxlandc(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::bitwiseSelectSimd128(FloatRegister onTrue,
-+                                          FloatRegister onFalse,
-+                                          FloatRegister maskDest) {
-+  // result = (onTrue & mask) | (onFalse & ~mask)
-+  // xxsel: XC=0→XA, XC=1→XB → XT = (XA & ~XC) | (XB & XC)
-+  // Need XA=onFalse, XB=onTrue, XC=mask.
-+  as_xxsel(maskDest, onFalse, onTrue, maskDest);
-+}
-+
-+void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest) {
-+  EmitVmxUnary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vrb) { a.as_vpopcntb(vrt, vrb); },
-+      src, dest);
-+}
-+
-+void MacroAssembler::bitmaskInt8x16(FloatRegister src, Register dest,
-+                                    FloatRegister temp) {
-+  if (HasPOWER10()) {
-+    // Single-instruction collapse on POWER10.
-+    as_vextractbm(dest, src);
-+    return;
-+  }
-+  // POWER8+ vbpermq-based bitmask: ctl[i] = (15-i)*8 produces the wasm-spec
-+  // bitmap (bit i = MSB of LE lane i) in dw0 low 16 bits.
-+  int8_t ctl[16] = {120, 112, 104, 96, 88, 80, 72, 64,
-+                    56,  48,  40,  32, 24, 16, 8,  0};
-+  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
-+  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
-+  as_mfvsrd(dest, temp);
-+}
-+
-+void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest,
-+                                    FloatRegister temp) {
-+  if (HasPOWER10()) {
-+    as_vextracthm(dest, src);
-+    return;
-+  }
-+  // Same recipe as bitmaskInt8x16 but ctl picks halfword MSBs:
-+  // BE bit (14-2i)*8 for lane i, plus 8 ignore-bytes (high bit set).
-+  int8_t ctl[16] = {112,  96,   80,   64,   48,   32,   16,   0,
-+                    -128, -128, -128, -128, -128, -128, -128, -128};
-+  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
-+  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
-+  as_mfvsrd(dest, temp);
-+}
-+
-+void MacroAssembler::bitmaskInt32x4(FloatRegister src, Register dest,
-+                                    FloatRegister temp) {
-+  if (HasPOWER10()) {
-+    as_vextractwm(dest, src);
-+    return;
-+  }
-+  // Same recipe as bitmaskInt8x16 but ctl picks word MSBs:
-+  // BE bit (12-4i)*8 for lane i, plus 12 ignore-bytes (high bit set).
-+  int8_t ctl[16] = {96,   64,   32,   0,    -128, -128, -128, -128,
-+                    -128, -128, -128, -128, -128, -128, -128, -128};
-+  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
-+  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
-+  as_mfvsrd(dest, temp);
-+}
-+
-+void MacroAssembler::bitmaskInt64x2(FloatRegister src, Register dest,
-+                                    FloatRegister temp) {
-+  if (HasPOWER10()) {
-+    as_vextractdm(dest, src);
-+    return;
-+  }
-+  // Same recipe as the other bitmask variants. ctl picks dword MSBs:
-+  // BE bit 64 for lane 0, BE bit 0 for lane 1, plus 14 ignore-bytes.
-+  int8_t ctl[16] = {64,   0,    -128, -128, -128, -128, -128, -128,
-+                    -128, -128, -128, -128, -128, -128, -128, -128};
-+  loadConstantSimd128(SimdConstant::CreateX16(ctl), temp);
-+  as_vbpermq(temp.encoding() & 31, src.encoding() & 31, temp.encoding() & 31);
-+  as_mfvsrd(dest, temp);
-+}
-+
-+void MacroAssembler::compareInt64x2(Assembler::Condition cond,
-+                                    FloatRegister rhs, FloatRegister lhsDest) {
-+  compareInt64x2(cond, lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::compareInt64x2(Assembler::Condition cond,
-+                                    FloatRegister lhs, FloatRegister rhs,
-+                                    FloatRegister dest) {
-+  EmitVmxCompare(*this, cond, lhs, rhs, dest, VMX_BINARY_WRAPPER(vcmpequd),
-+                 VMX_BINARY_WRAPPER(vcmpgtsd), VMX_BINARY_WRAPPER(vcmpgtud));
-+}
-+
-+void MacroAssembler::minFloat32x4(FloatRegister rhs, FloatRegister lhsDest) {
-+  minFloat32x4(lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvminsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest, FloatRegister temp1,
-+                                  FloatRegister temp2) {
-+  // Wasm min with NaN propagation.
-+  // Detect NaN in either operand (not via add which falsely flags inf+(-inf)).
-+  // Compute mask and add BEFORE min (min may clobber lhs via dest aliasing).
-+  as_xvcmpeqsp(temp1, lhs, lhs);
-+  as_xvcmpeqsp(temp2, rhs, rhs);
-+  as_xxland(temp1, temp1, temp2);
-+  as_xvaddsp(temp2, lhs, rhs);
-+  as_xvminsp(dest, lhs, rhs);
-+  as_xxsel(dest, temp2, dest, temp1);
-+}
-+
-+void MacroAssembler::minFloat64x2(FloatRegister rhs, FloatRegister lhsDest) {
-+  minFloat64x2(lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvmindp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest, FloatRegister temp1,
-+                                  FloatRegister temp2) {
-+  // NaN mask and add must be computed BEFORE min (which may clobber lhs via
-+  // dest).
-+  as_xvcmpeqdp(temp1, lhs, lhs);
-+  as_xvcmpeqdp(temp2, rhs, rhs);
-+  as_xxland(temp1, temp1, temp2);  // temp1 = ~0 when both non-NaN
-+  as_xvadddp(temp2, lhs, rhs);     // temp2 = add (NaN source)
-+  as_xvmindp(dest, lhs, rhs);      // dest = min (may clobber lhs)
-+  as_xxsel(dest, temp2, dest, temp1);
-+}
-+
-+void MacroAssembler::maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest) {
-+  maxFloat32x4(lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvmaxsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest, FloatRegister temp1,
-+                                  FloatRegister temp2) {
-+  // Wasm max with NaN propagation, using temp registers.
-+  as_xvcmpeqsp(temp1, lhs, lhs);
-+  as_xvcmpeqsp(temp2, rhs, rhs);
-+  as_xxland(temp1, temp1, temp2);
-+  as_xvaddsp(temp2, lhs, rhs);
-+  as_xvmaxsp(dest, lhs, rhs);
-+  as_xxsel(dest, temp2, dest, temp1);
-+}
-+
-+void MacroAssembler::maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest) {
-+  maxFloat64x2(lhsDest, rhs, lhsDest);
-+}
-+
-+void MacroAssembler::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvmaxdp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest, FloatRegister temp1,
-+                                  FloatRegister temp2) {
-+  as_xvcmpeqdp(temp1, lhs, lhs);
-+  as_xvcmpeqdp(temp2, rhs, rhs);
-+  as_xxland(temp1, temp1, temp2);
-+  as_xvadddp(temp2, lhs, rhs);
-+  as_xvmaxdp(dest, lhs, rhs);
-+  as_xxsel(dest, temp2, dest, temp1);
-+}
-+
-+void MacroAssembler::unsignedTruncSatFloat32x4ToInt32x4(FloatRegister src,
-+                                                        FloatRegister dest) {
-+  as_xvcvspuxws(dest, src);
-+}
-+
-+void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
-+                                        Register64 dest) {
-+  MOZ_ASSERT(lane < 2);
-+  if (lane == 1) {
-+    // Lane 1 = BE dword 0 = register bits[0:63].
-+    as_mfvsrd(dest.reg, src);
-+  } else {
-+    // Lane 0 = BE dword 1.
-+    if (HasPOWER9()) {
-+      as_mfvsrld(dest.reg, src);
-+    } else {
-+      ScratchSimd128Scope scratch(*this);
-+      as_xxpermdi(scratch, src, src, 2);
-+      as_mfvsrd(dest.reg, scratch);
-+    }
-+  }
-+}
-+
-+void MacroAssembler::replaceLaneInt64x2(unsigned lane, Register64 rhs,
-+                                        FloatRegister lhsDest) {
-+  MOZ_ASSERT(lane < 2);
-+  if (HasPOWER10()) {
-+    // 1 insn, no scratch VSR. UIM byte offset: lane 0 → 8, lane 1 → 0.
-+    as_vinsd(lhsDest, rhs.reg, (1 - lane) * 8);
-+    return;
-+  }
-+  ScratchSimd128Scope scratch(*this);
-+  as_mtvsrd(scratch, rhs.reg);
-+  if (lane == 0) {
-+    // Replace dw1 (LE low = lane 0). Keep dw0 (lane 1).
-+    // dm=0b00: [lhsDest.dw0, scratch.dw0]
-+    as_xxpermdi(lhsDest, lhsDest, scratch, 0);
-+  } else {
-+    // Replace dw0 (LE high = lane 1). Keep dw1 (lane 0).
-+    // dm=0b01: [scratch.dw0, lhsDest.dw1]
-+    as_xxpermdi(lhsDest, scratch, lhsDest, 1);
-+  }
-+}
-+
-+// SIMD 3-operand arithmetic (x86_shared-style signatures).
-+
-+void MacroAssembler::addFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvaddsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::addFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvadddp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::addInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduhm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::addInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddubm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::divFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvdivsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::extractLaneInt16x8(uint32_t lane, FloatRegister src,
-+                                        Register dest) {
-+  MOZ_ASSERT(lane < 8);
-+  if (HasPOWER9()) {
-+    as_vextractuh(ScratchSimd128Reg, src, 14 - 2 * lane);
-+    as_mfvsrd(dest, ScratchSimd128Reg);
-+    as_extsh(dest, dest);
-+    return;
-+  }
-+  ExtractLaneToGPR(*this, lane, src, dest, 2, 16);
-+  as_extsh(dest, dest);
-+}
-+
-+void MacroAssembler::extractLaneInt32x4(uint32_t lane, FloatRegister src,
-+                                        Register dest) {
-+  MOZ_ASSERT(lane < 4);
-+  ExtractLaneToGPR(*this, lane, src, dest, 4, 32);
-+  // ExtractLaneToGPR leaves the adjacent lane in the high 32 bits for the
-+  // unshifted lanes (0 and 2); canonicalize to a sign-extended i32, as the
-+  // i8x16/i16x8 extracts do with extsb/extsh. A consumer that reads the full
-+  // 64-bit register -- e.g. the POWER8 i32.ctz emulation, whose 64-bit neg/and.
-+  // with a 32-bit cntlzw otherwise mis-handles a zero low word over nonzero
-+  // high garbage and returns -1 -- requires this.
-+  as_extsw(dest, dest);
-+}
-+
-+void MacroAssembler::extractLaneInt8x16(uint32_t lane, FloatRegister src,
-+                                        Register dest) {
-+  MOZ_ASSERT(lane < 16);
-+  if (HasPOWER9()) {
-+    as_vextractub(ScratchSimd128Reg, src, 15 - lane);
-+    as_mfvsrd(dest, ScratchSimd128Reg);
-+    as_extsb(dest, dest);
-+    return;
-+  }
-+  ExtractLaneToGPR(*this, lane, src, dest, 1, 8);
-+  as_extsb(dest, dest);
-+}
-+
-+void MacroAssembler::maxInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsh), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::maxInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsw), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::maxInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmaxsb), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::minInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsb), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::mulInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vmuluwm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::narrowInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                   FloatRegister dest) {
-+  // On LE, VMX pack swaps operand order vs Wasm convention.
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkshss), rhs, lhs, dest);
-+}
-+
-+void MacroAssembler::splatX2(Register64 src, FloatRegister dest) {
-+  if (HasPOWER9()) {
-+    as_mtvsrdd(dest, src.reg, src.reg);
-+  } else {
-+    as_mtvsrd(dest, src.reg);
-+    as_xxpermdi(dest, dest, dest, 0);
-+  }
-+}
-+
-+void MacroAssembler::subInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuwm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::swizzleInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                    FloatRegister dest) {
-+  // Wasm i8x16.swizzle: result[i] = (rhs[i] < 16) ? lhs[rhs[i]] : 0.
-+  //
-+  // Strategy: build ctrl in ScratchSimd128 (which can't alias inputs
-+  // because v0 is non-allocatable). Use vsububs(splat(15), rhs) to
-+  // produce ctrl = max(0, 15 - rhs); the saturation clamps out-of-range
-+  // indices to 0, and those positions get masked off below.
-+  //
-+  // The mask is computed via vcmpgtub(rhs, splat(15)) + xxlnor — 0xFF
-+  // where rhs <= 15. Reformulating "rhs < 16" as "!(rhs > 15)" lets us
-+  // use vspltisb with a 5-bit signed immediate (P7+, 1 insn, no GPR
-+  // scratch) for both splat-of-15 sites, replacing the previous
-+  // movePtr(0x0F0F0F0F)/movePtr(0x10101010) + splatX4 dance.
-+  //
-+  // Aliasing: dest may equal lhs (wasm baseline calls swizzleInt8x16(
-+  // rsd, rs, rsd); Ion's useRegisterAtStart permits the same). When
-+  // dest != rhs, ctrl can be built in scratch and the mask computed
-+  // after the permute (rhs is still alive). When dest == rhs, the
-+  // permute would clobber rhs before we could compute the mask, so the
-+  // mask goes to the red zone first.
-+  ScratchSimd128Scope scratch(*this);
-+  uint8_t s = scratch.encoding() & 31;
-+  uint8_t l = lhs.encoding() & 31;
-+  uint8_t r = rhs.encoding() & 31;
-+  uint8_t d = dest.encoding() & 31;
-+
-+  if (dest != rhs) {
-+    as_vspltisb(s, 15);
-+    as_vsububs(s, s, r);   // scratch = ctrl
-+    as_vperm(d, l, l, s);  // dest = vperm(lhs, lhs, ctrl)
-+    as_vspltisb(s, 15);
-+    as_vcmpgtub(s, r, s);             // scratch = 0xFF where rhs > 15
-+    as_xxlandc(dest, dest, scratch);  // dest &= ~scratch (= bytes-to-keep)
-+    return;
-+  }
-+
-+  // dest == rhs: vperm clobbers rhs, so build the bytes-to-zero mask first
-+  // and stash it. The xxlandc at the end consumes the un-inverted form.
-+  as_vspltisb(s, 15);
-+  as_vcmpgtub(s, r, s);  // scratch = 0xFF where rhs > 15
-+  RedZoneStashSimd128(*this, scratch, 0);
-+  as_vspltisb(s, 15);
-+  as_vsububs(s, s, r);   // scratch = ctrl
-+  as_vperm(d, l, l, s);  // dest = vperm(lhs, lhs, ctrl)
-+  RedZoneRestoreSimd128(*this, 0, scratch);
-+  as_xxlandc(dest, dest, scratch);  // dest &= ~scratch (= bytes-to-keep)
-+}
-+// SIMD 3-operand arithmetic (continued).
-+
-+void MacroAssembler::addInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vadduwm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::addInt64x2(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddudm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::addSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                   FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddshs), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::addSatInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                   FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vaddsbs), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::divFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvdivdp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::minInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsh), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::minInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vminsw), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::mulFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvmulsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::mulFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvmuldp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::mulInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  ZeroSimd128(*this, scratch);
-+  EmitVmxTernary(
-+      *this,
-+      [](Assembler& a, uint8_t vrt, uint8_t vra, uint8_t vrb, uint8_t vrc) {
-+        a.as_vmladduhm(vrt, vra, vrb, vrc);
-+      },
-+      lhs, rhs, scratch, dest);
-+}
-+
-+void MacroAssembler::narrowInt32x4(FloatRegister lhs, FloatRegister rhs,
-+                                   FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vpkswss), rhs, lhs, dest);
-+}
-+
-+void MacroAssembler::subFloat32x4(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvsubsp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::subFloat64x2(FloatRegister lhs, FloatRegister rhs,
-+                                  FloatRegister dest) {
-+  as_xvsubdp(dest, lhs, rhs);
-+}
-+
-+void MacroAssembler::subInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubuhm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::subInt64x2(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubudm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::subInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsububm), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::subSatInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                   FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubshs), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::subSatInt8x16(FloatRegister lhs, FloatRegister rhs,
-+                                   FloatRegister dest) {
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsubsbs), lhs, rhs, dest);
-+}
-+
-+void MacroAssembler::widenDotInt16x8(FloatRegister lhs, FloatRegister rhs,
-+                                     FloatRegister dest) {
-+  // i32x4.dot_i16x8_s: result[k] = lhs[2k]*rhs[2k] + lhs[2k+1]*rhs[2k+1].
-+  // vmsumshm computes exactly that for each i32 lane plus an addend (VRC).
-+  // With VRC = 0, the addend disappears and we get the wasm spec result in
-+  // a single instruction. xxlxor zeros the scratch in 1 insn, so total is
-+  // 2 insns vs the old vmulesh/vmulosh/vadduwm trio.
-+  ScratchSimd128Scope scratch(*this);
-+  as_xxlxor(scratch, scratch, scratch);
-+  as_vmsumshm(dest.encoding() & 31, lhs.encoding() & 31, rhs.encoding() & 31,
-+              scratch.encoding() & 31);
-+}
-+
-+// SIMD variable-shift and FMA helpers.
-+// Pattern: splat the GPR shift count across all lanes of a scratch VSR,
-+// then issue a vector-shift on lhs and the splat. vsl{b,h} / vsr{b,h} /
-+// vsra{b,h} use the low 3-or-4 bits of each lane's shift count, exactly
-+// matching wasm modulo-N shift semantics.
-+
-+void MacroAssembler::leftShiftInt8x16(FloatRegister lhs, Register rhs,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX16(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslb), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::rightShiftInt8x16(FloatRegister lhs, Register rhs,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX16(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrab), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::unsignedRightShiftInt8x16(FloatRegister lhs, Register rhs,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX16(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrb), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::leftShiftInt16x8(FloatRegister lhs, Register rhs,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX8(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslh), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::rightShiftInt16x8(FloatRegister lhs, Register rhs,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX8(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrah), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::unsignedRightShiftInt16x8(FloatRegister lhs, Register rhs,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX8(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrh), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::leftShiftInt32x4(FloatRegister lhs, Register rhs,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX4(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vslw), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::leftShiftInt64x2(FloatRegister lhs, Register rhs,
-+                                      FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX4(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsld), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::rightShiftInt32x4(FloatRegister lhs, Register rhs,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX4(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsraw), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::rightShiftInt64x2(FloatRegister lhs, Register rhs,
-+                                       FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX4(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrad), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::unsignedRightShiftInt32x4(FloatRegister lhs, Register rhs,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX4(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrw), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::unsignedRightShiftInt64x2(FloatRegister lhs, Register rhs,
-+                                               FloatRegister dest) {
-+  ScratchSimd128Scope scratch(*this);
-+  splatX4(rhs, scratch);
-+  EmitVmxBinary(*this, VMX_BINARY_WRAPPER(vsrd), lhs, scratch, dest);
-+}
-+
-+void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
-+                                  FloatRegister srcDest) {
-+  as_xvmaddasp(srcDest, src1, src2);
-+}
-+
-+void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
-+                                  FloatRegister srcDest) {
-+  as_xvmaddadp(srcDest, src1, src2);
-+}
-+
-+//}}} check_macroassembler_style
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_MacroAssembler_ppc64_inl_h */
-diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64.cpp b/js/src/jit/ppc64/MacroAssembler-ppc64.cpp
-new file mode 100644
-index 000000000000..702fb3cd4cba
---- /dev/null
-+++ b/js/src/jit/ppc64/MacroAssembler-ppc64.cpp
-@@ -0,0 +1,3467 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/MacroAssembler-ppc64.h"
-+
-+#include "jit/Bailouts.h"
-+#include "jit/BaselineFrame.h"
-+#include "jit/FlushICache.h"
-+#include "jit/JitFrames.h"
-+#include "jit/JitRuntime.h"
-+#include "jit/MacroAssembler.h"
-+#include "jit/MoveEmitter.h"
-+#include "jit/ppc64/SharedICRegisters-ppc64.h"
-+#include "vm/JitActivation.h"
-+#include "vm/JSContext.h"
-+#include "wasm/WasmStubs.h"
-+
-+#include "jit/MacroAssembler-inl.h"
-+
-+namespace js {
-+namespace jit {
-+
-+MacroAssembler& MacroAssemblerPPC64::asMasm() {
-+  return *static_cast<MacroAssembler*>(this);
-+}
-+
-+const MacroAssembler& MacroAssemblerPPC64::asMasm() const {
-+  return *static_cast<const MacroAssembler*>(this);
-+}
-+
-+// ===============================================================
-+// Out-of-line fake exit frame
-+
-+bool MacroAssemblerPPC64Compat::buildOOLFakeExitFrame(void* fakeReturnAddr) {
-+  asMasm().Push(FrameDescriptor(FrameType::IonJS));
-+  asMasm().Push(ImmPtr(fakeReturnAddr));
-+  asMasm().Push(FramePointer);
-+  return true;
-+}
-+
-+// ===============================================================
-+// Load int32 or double from memory
-+
-+void MacroAssemblerPPC64Compat::loadInt32OrDouble(const Address& src,
-+                                                  FloatRegister dest) {
-+  // Loads the boxed 64-bit value at |src| into |dest|. If its tag tests as
-+  // int32 the payload is converted to a double; otherwise |dest| keeps the
-+  // raw bits that were loaded.
-+  UseScratchRegisterScope scratchScope(*this);
-+  Register bits = scratchScope.Acquire();
-+  Label notInt32;
-+
-+  // Park the raw bits in the FPR right away so the one GPR scratch held here
-+  // can be recycled for the tag; that leaves the second scratch GPR free for
-+  // branchTestInt32 to materialise its ImmTag constant.
-+  loadPtr(Address(src.base, src.offset), bits);
-+  as_mtvsrd(dest, bits);
-+  x_srdi(bits, bits, JSVAL_TAG_SHIFT);
-+  asMasm().branchTestInt32(Assembler::NotEqual, bits, &notInt32);
-+
-+  // Int32 case: fetch the boxed bits back from the FPR, sign-extend the low
-+  // word, and convert the resulting integer to a double.
-+  as_mfvsrd(bits, dest);
-+  as_extsw(bits, bits);
-+  as_mtvsrd(dest, bits);
-+  as_fcfid(dest, dest);
-+
-+  bind(&notInt32);
-+}
-+
-+void MacroAssemblerPPC64Compat::loadInt32OrDouble(const BaseIndex& addr,
-+                                                  FloatRegister dest) {
-+  // BaseIndex flavour: compute the scaled address into the scratch GPR, load
-+  // the boxed value through it, then follow the same int32-or-raw-bits recipe
-+  // as the Address overload.
-+  UseScratchRegisterScope scratchScope(*this);
-+  Register bits = scratchScope.Acquire();
-+  Label notInt32;
-+
-+  computeScaledAddress(addr, bits);
-+  loadPtr(Address(bits, addr.offset), bits);
-+
-+  // Keep a copy of the raw bits in the FPR, then reduce the GPR to the tag.
-+  as_mtvsrd(dest, bits);
-+  x_srdi(bits, bits, JSVAL_TAG_SHIFT);
-+  asMasm().branchTestInt32(Assembler::NotEqual, bits, &notInt32);
-+
-+  // Int32 case: recover the bits, sign-extend the low word, convert.
-+  as_mfvsrd(bits, dest);
-+  as_extsw(bits, bits);
-+  as_mtvsrd(dest, bits);
-+  as_fcfid(dest, dest);
-+
-+  bind(&notInt32);
-+}
-+
-+// ===============================================================
-+// Conversion functions
-+
-+// Converts the unsigned 32-bit integer in |src| to a double in |dest|.
-+void MacroAssemblerPPC64Compat::convertUInt32ToDouble(Register src,
-+                                                      FloatRegister dest) {
-+  // mtvsrwz: VSR[dest].dw0 = zero_ext_64(src[32:63]); P8+ (ISA 2.07).
-+  // Replaces rldicl + mtvsrd (2 insns + scratch) with 1 insn.
-+  as_mtvsrwz(dest, src);
-+  // Convert the (zero-extended) integer now sitting in the FPR in place.
-+  as_fcfid(dest, dest);
-+}
-+
-+// Converts the unsigned 32-bit integer in |src| to a float32 in |dest|.
-+void MacroAssemblerPPC64Compat::convertUInt32ToFloat32(Register src,
-+                                                       FloatRegister dest) {
-+  // mtvsrwz + fcfids; same recipe as convertUInt32ToDouble.
-+  as_mtvsrwz(dest, src);
-+  as_fcfids(dest, dest);
-+}
-+
-+// Helper for the negative-zero check after a successful round-trip.
-+// Precondition: `dest` holds the integer round-trip result; if it equals
-+// zero, then `src` was either +0.0 or -0.0 (those are the only doubles
-+// that round-trip to int 0). Distinguish them by inspecting src's sign
-+// bit: -0.0 has its MSB set, so an mfvsrd-then-signed-cmp-against-zero
-+// branches to `fail` only for -0.0. Non-zero `dest` values (including
-+// every negative integer) skip the check entirely.
-+//
-+// Acquires one scratch GPR for the raw bits of `src`; `src` and `dest` are
-+// only read.
-+static void EmitNegativeZeroCheck(MacroAssemblerPPC64Compat& masm,
-+                                  FloatRegister src, Register dest,
-+                                  Label* fail) {
-+  Label notZero;
-+  // Fast exit: a non-zero integer result cannot have come from -0.0.
-+  masm.as_cmpdi(dest, 0);
-+  masm.ma_b(Assembler::NotEqual, &notZero);
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+  // Raw bits of src, compared as a signed integer: negative means the sign
-+  // bit is set.
-+  masm.as_mfvsrd(scratch, src);
-+  masm.as_cmpdi(scratch, 0);
-+  masm.ma_b(Assembler::LessThan, fail);
-+  masm.bind(&notZero);
-+}
-+
-+// Converts the double in |src| to an int32 in |dest|, branching to |fail|
-+// when the conversion is not exact. With |negativeZeroCheck| set, -0.0 also
-+// branches to |fail|. Clobbers ScratchDoubleReg.
-+void MacroAssemblerPPC64Compat::convertDoubleToInt32(FloatRegister src,
-+                                                     Register dest, Label* fail,
-+                                                     bool negativeZeroCheck) {
-+  // Truncate to int32 (round toward zero), sign-extend, and verify
-+  // exactness via round-trip compare. fctiwz writes the int32 to BE
-+  // bits 32..63 of the FPR; mfvsrd extracts and extsw sign-extends.
-+  // The compare also catches NaN (unordered) and Inf (saturated to
-+  // INT32_{MIN,MAX}, won't round-trip equal).
-+  as_fctiwz(ScratchDoubleReg, src);
-+  as_mfvsrd(dest, ScratchDoubleReg);
-+  as_extsw(dest, dest);
-+  // Round-trip: integer back to double, then compare with the input.
-+  as_mtvsrd(ScratchDoubleReg, dest);
-+  as_fcfid(ScratchDoubleReg, ScratchDoubleReg);
-+  as_fcmpu(ScratchDoubleReg, src);
-+  ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
-+
-+  if (negativeZeroCheck) {
-+    EmitNegativeZeroCheck(*this, src, dest, fail);
-+  }
-+}
-+
-+// Converts the double in |src| to a 64-bit integer in |dest|, branching to
-+// |fail| when the conversion is not exact. With |negativeZeroCheck| set,
-+// -0.0 also branches to |fail|. Clobbers ScratchDoubleReg.
-+void MacroAssemblerPPC64Compat::convertDoubleToPtr(FloatRegister src,
-+                                                   Register dest, Label* fail,
-+                                                   bool negativeZeroCheck) {
-+  // Same pattern as convertDoubleToInt32 but to int64 (no sign-extend
-+  // needed since fctidz already produces a 64-bit result).
-+  as_fctidz(ScratchDoubleReg, src);
-+  as_mfvsrd(dest, ScratchDoubleReg);
-+  // Round-trip: integer back to double, then compare with the input.
-+  as_mtvsrd(ScratchDoubleReg, dest);
-+  as_fcfid(ScratchDoubleReg, ScratchDoubleReg);
-+  as_fcmpu(ScratchDoubleReg, src);
-+  ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
-+
-+  if (negativeZeroCheck) {
-+    EmitNegativeZeroCheck(*this, src, dest, fail);
-+  }
-+}
-+
-+// Converts the float32 in |src| to an int32 in |dest|, branching to |fail|
-+// when the conversion is not exact. With |negativeZeroCheck| set, -0.0 also
-+// branches to |fail|. Clobbers ScratchDoubleReg.
-+void MacroAssemblerPPC64Compat::convertFloat32ToInt32(FloatRegister src,
-+                                                      Register dest,
-+                                                      Label* fail,
-+                                                      bool negativeZeroCheck) {
-+  // Same as convertDoubleToInt32 but the round-trip uses fcfids so the
-+  // comparison happens at single precision (matches src's actual width).
-+  as_fctiwz(ScratchDoubleReg, src);
-+  as_mfvsrd(dest, ScratchDoubleReg);
-+  as_extsw(dest, dest);
-+  as_mtvsrd(ScratchDoubleReg, dest);
-+  as_fcfids(ScratchDoubleReg, ScratchDoubleReg);
-+  as_fcmpu(ScratchDoubleReg, src);
-+  ma_b(Assembler::DoubleNotEqualOrUnordered, fail);
-+
-+  if (negativeZeroCheck) {
-+    EmitNegativeZeroCheck(*this, src, dest, fail);
-+  }
-+}
-+
-+// Emits a fixed-size call sequence to |target| whose final two instructions
-+// are either mtctr + bctrl (|enabled|) or two nops (disabled). Returns the
-+// offset of the start of the address-load stanza.
-+CodeOffset MacroAssemblerPPC64Compat::toggledCall(JitCode* target,
-+                                                  bool enabled) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  // stanza(8) + mtctr/bctrl(2) = 10 instructions.
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  BufferOffset boLoad =
-+      emitLoad64Stanza(scratch, (uint64_t)uintptr_t(target->raw()));
-+  // The offset is taken from the stanza itself, i.e. after enterNoPool().
-+  CodeOffset offset(boLoad.getOffset());
-+  addPendingJump(boLoad, ImmPtr(target->raw()), RelocationKind::JITCODE);
-+  if (enabled) {
-+    xs_mtctr(scratch);
-+    as_bctr(LinkBit::LinkB);
-+  } else {
-+    // Same footprint as the enabled form, so the size check below holds
-+    // either way.
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+  }
-+  m_buffer.leaveNoPool();
-+  // The emitted length must agree with what ToggledCallSize reports.
-+  MOZ_ASSERT_IF(!oom(), nextOffset().getOffset() - offset.offset() ==
-+                            ToggledCallSize(nullptr));
-+  return offset;
-+}
-+
-+// ===============================================================
-+// Exception handling
-+
-+// Emits the exception-handling tail: reserves a stack-allocated
-+// ResumeFromException, calls HandleException on it, then dispatches on the
-+// resume kind that call stored in the structure.
-+//
-+// |profilerExitTail| is jumped to on forced returns when the profiler is
-+// enabled; |bailoutTail| is jumped to for the Bailout kind;
-+// |returnValueCheckOffset| receives the code offset right after the
-+// HandleException call.
-+void MacroAssemblerPPC64Compat::handleFailureWithHandlerTail(
-+    Label* profilerExitTail, Label* bailoutTail,
-+    uint32_t* returnValueCheckOffset) {
-+  // Round sizeof(ResumeFromException) up to ABIStackAlignment. The
-+  // canonical (sz + align - 1) & ~(align - 1) form is exact: when sz
-+  // is already a multiple of `align` the rounding is a no-op. The
-+  // previous (sz + align) & ~(align - 1) over-allocated by `align`
-+  // bytes whenever sz was already aligned.
-+  int size = (sizeof(ResumeFromException) + ABIStackAlignment - 1) &
-+             ~(ABIStackAlignment - 1);
-+  asMasm().subPtr(Imm32(size), StackPointer);
-+  // Use r3 (first argument register).
-+  mov(StackPointer, r3);
-+
-+  using Fn = void (*)(ResumeFromException* rfe);
-+  asMasm().setupUnalignedABICall(r4);
-+  asMasm().passABIArg(r3);
-+  asMasm().callWithABI<Fn, HandleException>(
-+      ABIType::General, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
-+
-+  *returnValueCheckOffset = asMasm().currentOffset();
-+
-+  Label entryFrame;
-+  Label catch_;
-+  Label finally;
-+  Label returnBaseline;
-+  Label returnIon;
-+  Label bailout;
-+  Label wasmInterpEntry;
-+  Label wasmCatch;
-+
-+  // Dispatch on the resume kind; the structure still sits at the stack
-+  // pointer.
-+  load32(Address(StackPointer, ResumeFromException::offsetOfKind()), r3);
-+  asMasm().branch32(Assembler::Equal, r3,
-+                    Imm32(ExceptionResumeKind::EntryFrame), &entryFrame);
-+  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Catch),
-+                    &catch_);
-+  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Finally),
-+                    &finally);
-+  asMasm().branch32(Assembler::Equal, r3,
-+                    Imm32(ExceptionResumeKind::ForcedReturnBaseline),
-+                    &returnBaseline);
-+  asMasm().branch32(Assembler::Equal, r3,
-+                    Imm32(ExceptionResumeKind::ForcedReturnIon), &returnIon);
-+  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::Bailout),
-+                    &bailout);
-+  asMasm().branch32(Assembler::Equal, r3,
-+                    Imm32(ExceptionResumeKind::WasmInterpEntry),
-+                    &wasmInterpEntry);
-+  asMasm().branch32(Assembler::Equal, r3, Imm32(ExceptionResumeKind::WasmCatch),
-+                    &wasmCatch);
-+
-+  breakpoint();  // Invalid kind.
-+
-+  // No exception handler. Return error from entry frame.
-+  bind(&entryFrame);
-+  asMasm().moveValue(MagicValue(JS_ION_ERROR), JSReturnOperand);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
-+          FramePointer);
-+  // StackPointer is reloaded last in each case: the other fields are read
-+  // relative to the old StackPointer.
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+  ret();
-+
-+  // Catch handler.
-+  bind(&catch_);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfTarget()), r3);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
-+          FramePointer);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+  jump(r3);
-+
-+  // Finally block.
-+  bind(&finally);
-+  ValueOperand exception = ValueOperand(r4);
-+  loadValue(Address(StackPointer, ResumeFromException::offsetOfException()),
-+            exception);
-+
-+  ValueOperand exceptionStack = ValueOperand(r5);
-+  loadValue(
-+      Address(StackPointer, ResumeFromException::offsetOfExceptionStack()),
-+      exceptionStack);
-+
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfTarget()), r3);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
-+          FramePointer);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+
-+  // Pushed onto the restored stack: exception, exception stack, then the
-+  // boolean true, before jumping to the target.
-+  pushValue(exception);
-+  pushValue(exceptionStack);
-+  pushValue(BooleanValue(true));
-+  jump(r3);
-+
-+  // Forced return from baseline.
-+  Label profilingInstrumentation;
-+  bind(&returnBaseline);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
-+          FramePointer);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+  loadValue(Address(FramePointer, BaselineFrame::reverseOffsetOfReturnValue()),
-+            JSReturnOperand);
-+  jump(&profilingInstrumentation);
-+
-+  // Forced return from Ion.
-+  bind(&returnIon);
-+  loadValue(Address(StackPointer, ResumeFromException::offsetOfException()),
-+            JSReturnOperand);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
-+          FramePointer);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+
-+  // Shared by both forced-return kinds: divert to the profiler exit tail
-+  // when the Gecko profiler's enabled flag is non-zero.
-+  bind(&profilingInstrumentation);
-+  {
-+    Label skipProfilingInstrumentation;
-+    AbsoluteAddress addressOfEnabled(
-+        asMasm().runtime()->geckoProfiler().addressOfEnabled());
-+    asMasm().branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
-+                      &skipProfilingInstrumentation);
-+    jump(profilerExitTail);
-+    bind(&skipProfilingInstrumentation);
-+  }
-+
-+  xs_mr(StackPointer, FramePointer);
-+  // Pop FP from stack, then return (pop LR + blr).
-+  loadPtr(Address(StackPointer, 0), FramePointer);
-+  asMasm().addPtr(Imm32(sizeof(void*)), StackPointer);
-+  ret();
-+
-+  // Bailout.
-+  bind(&bailout);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfBailoutInfo()),
-+          r5);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+  xs_li(ReturnReg, 1);
-+  jump(bailoutTail);
-+
-+  // Wasm interp entry.
-+  bind(&wasmInterpEntry);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfFramePointer()),
-+          FramePointer);
-+  loadPtr(Address(StackPointer, ResumeFromException::offsetOfStackPointer()),
-+          StackPointer);
-+  movePtr(ImmWord(wasm::InterpFailInstanceReg), InstanceReg);
-+  ret();
-+
-+  // Wasm catch.
-+  bind(&wasmCatch);
-+  wasm::GenerateJumpToCatchHandler(asMasm(), StackPointer, r4, r5, r6);
-+}
-+
-+// Clamps the double in |input| to the range [0, 255] and writes the integer
-+// result to |output|. Uses the scratch double register; the POWER9 path
-+// also takes one scratch GPR.
-+void MacroAssembler::clampDoubleToUint8(FloatRegister input, Register output) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+
-+  if (HasPOWER9()) {
-+    // P9 xsmaxjdp uses Java/JS semantics (ISA v3.0B): any NaN
-+    // is treated as "less than any number that is not a NaN", so
-+    // xsmaxjdp(input, 0) collapses {NaN, -Inf, ≤ 0} to 0 in one insn —
-+    // the "≤ 0 or NaN → 0" branch dance disappears.
-+    //
-+    // After the max, fctid (round-to-nearest-even per FPSCR default,
-+    // matches ECMA Uint8ClampedArray's round-half-to-even) saturates
-+    // out-of-int64 values to INT64_MAX. Remaining upper clamp
-+    // (output > 255 → 255) is one cmpdi + isel.
-+    //
-+    // NOTE(review): confirm against the Power ISA that xsmaxjdp really
-+    // returns the non-NaN operand when |input| is NaN (rather than
-+    // propagating the NaN), and what fctid produces for a NaN input. If the
-+    // NaN propagates, nothing below forces |output| to 0 for NaN.
-+    zeroDouble(fpscratch);
-+    as_xsmaxjdp(fpscratch, input, fpscratch);
-+    as_fctid(fpscratch, fpscratch);
-+    as_mfvsrd(output, fpscratch);
-+    UseScratchRegisterScope temps(asMasm());
-+    Register max255 = temps.Acquire();
-+    xs_li(max255, 255);
-+    as_cmpdi(output, 255);
-+    as_isel(output, max255, output, GreaterThan);
-+    return;
-+  }
-+
-+  // POWER8 fallback: xsmaxjdp is unavailable, so filter NaN explicitly
-+  // before fctid. Per Power ISA, fctid maps NaN to INT64_MAX, which
-+  // would clamp to 255 instead of the spec-required 0.
-+  // NOTE(review): the stated fctid-on-NaN result is worth re-checking
-+  // against the ISA; this path does not depend on it because NaN never
-+  // reaches fctid here.
-+  Label positive, below255, done;
-+  zeroDouble(fpscratch);
-+  // Anything that is not strictly greater than 0 (including NaN, for which
-+  // the ordered compare is false) produces 0.
-+  branchDouble(DoubleGreaterThan, input, fpscratch, &positive);
-+  {
-+    move32(Imm32(0), output);
-+    jump(&done);
-+  }
-+
-+  bind(&positive);
-+
-+  // Anything not strictly below 255.0 produces 255.
-+  loadConstantDouble(255.0, fpscratch);
-+  branchDouble(DoubleLessThan, input, fpscratch, &below255);
-+  {
-+    move32(Imm32(255), output);
-+    jump(&done);
-+  }
-+
-+  bind(&below255);
-+
-+  // 0 < input < 255: convert to integer and move it to the GPR.
-+  as_fctid(fpscratch, input);
-+  as_mfvsrd(output, fpscratch);
-+  bind(&done);
-+}
-+
-+void MacroAssembler::subFromStackPtr(Imm32 imm32) {
-+  // Subtracts |imm32| from the stack pointer; a zero amount emits nothing.
-+  if (imm32.value == 0) {
-+    return;
-+  }
-+  asMasm().subPtr(imm32, StackPointer);
-+}
-+
-+//{{{ check_macroassembler_style
-+
-+// Sign-extends the 32-bit value in |r| to 64 bits, in place.
-+void MacroAssembler::widenInt32(Register r) {
-+  move32To64SignExtend(r, Register64(r));
-+}
-+
-+// Stack operations.
-+// Pushes |reg| and records one more pointer-sized word as pushed.
-+void MacroAssembler::Push(Register reg) {
-+  push(reg);
-+  adjustFrame(int32_t(sizeof(intptr_t)));
-+}
-+// Pushes the 32-bit immediate and records one more pointer-sized word as
-+// pushed.
-+void MacroAssembler::Push(const Imm32 imm) {
-+  push(imm);
-+  adjustFrame(int32_t(sizeof(intptr_t)));
-+}
-+
-+// Pushes the word-sized immediate and records one more pointer-sized word as
-+// pushed.
-+void MacroAssembler::Push(const ImmWord imm) {
-+  push(imm);
-+  adjustFrame(int32_t(sizeof(intptr_t)));
-+}
-+
-+// Pushes a raw pointer immediate by forwarding its bits to the ImmWord
-+// overload (which also does the frame accounting).
-+void MacroAssembler::Push(const ImmPtr imm) {
-+  Push(ImmWord(uintptr_t(imm.value)));
-+}
-+
-+// Pushes a GC-pointer immediate and records one more pointer-sized word as
-+// pushed.
-+void MacroAssembler::Push(const ImmGCPtr ptr) {
-+  push(ptr);
-+  adjustFrame(int32_t(sizeof(intptr_t)));
-+}
-+
-+// Makes room for one double on the stack, stores |reg| there via boxDouble,
-+// and records sizeof(double) bytes as pushed.
-+void MacroAssembler::PushBoxed(FloatRegister reg) {
-+  subFromStackPtr(Imm32(sizeof(double)));
-+  boxDouble(reg, Address(getStackPointer(), 0));
-+  adjustFrame(sizeof(double));
-+}
-+
-+// Pops into |reg| and records one pointer-sized word fewer as pushed.
-+void MacroAssembler::Pop(Register reg) {
-+  pop(reg);
-+  adjustFrame(-int32_t(sizeof(intptr_t)));
-+}
-+void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
-+  // Spills every register of |set| into a single freshly reserved stack
-+  // area. GPRs occupy the highest slots, FP/SIMD registers follow below
-+  // them; both groups are walked with the backward iterators while the
-+  // store offset counts down to zero.
-+  const int32_t frameBytes =
-+      set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
-+  int32_t slot = frameBytes;
-+
-+  reserveStack(frameBytes);
-+
-+  GeneralRegisterBackwardIterator gpr(set.gprs());
-+  while (gpr.more()) {
-+    slot -= sizeof(intptr_t);
-+    storePtr(*gpr, Address(StackPointer, slot));
-+    ++gpr;
-+  }
-+
-+  // Natural per-kind slot — 8 bytes for Single/Double via stfd, 16 bytes
-+  // for Simd128 via stxvx. RegisterDump::FPUArray is sized 32 × 8 = 256
-+  // bytes (sizeof(RegisterContent) is 8 — no v128 in the union), so
-+  // f_K's stfd slot lands at the right offset. Bailout AllRegs excludes
-+  // Simd128 (Ion has no SIMD live), so the FP region in bailout frames
-+  // is strictly Float-only.
-+  FloatRegisterBackwardIterator fpr(set.fpus().reduceSetForPush());
-+  while (fpr.more()) {
-+    FloatRegister reg = *fpr;
-+    slot -= reg.size();
-+    const Address where(StackPointer, slot);
-+    if (reg.isSimd128()) {
-+      storeUnalignedSimd128(reg, where);
-+    } else {
-+      storeDouble(reg.asDouble(), where);
-+    }
-+    ++fpr;
-+  }
-+
-+  // Every reserved byte must have been assigned to a register.
-+  MOZ_ASSERT(slot == 0);
-+}
-+void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
-+                                         LiveRegisterSet ignore) {
-+  // Reloads the registers of |set| from the layout PushRegsInMask produces,
-+  // skipping any register also present in |ignore| (its slot is still
-+  // stepped over), then releases the whole area.
-+  const int32_t frameBytes =
-+      set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
-+  int32_t slot = frameBytes;
-+
-+  GeneralRegisterBackwardIterator gpr(set.gprs());
-+  while (gpr.more()) {
-+    slot -= sizeof(intptr_t);
-+    if (!ignore.has(*gpr)) {
-+      loadPtr(Address(StackPointer, slot), *gpr);
-+    }
-+    ++gpr;
-+  }
-+
-+  // Natural per-kind slot. See PushRegsInMask comment.
-+  FloatRegisterBackwardIterator fpr(set.fpus().reduceSetForPush());
-+  while (fpr.more()) {
-+    FloatRegister reg = *fpr;
-+    slot -= reg.size();
-+    if (!ignore.has(reg)) {
-+      const Address where(StackPointer, slot);
-+      if (reg.isSimd128()) {
-+        loadUnalignedSimd128(where, reg);
-+      } else {
-+        loadDouble(where, reg.asDouble());
-+      }
-+    }
-+    ++fpr;
-+  }
-+
-+  // Every reserved byte must have been accounted for before freeing.
-+  MOZ_ASSERT(slot == 0);
-+  freeStack(frameBytes);
-+}
-+
-+// Call operations.
-+// Calls the code whose address is in |reg| via CTR. Returns the offset just
-+// past the emitted branch-and-link.
-+CodeOffset MacroAssembler::call(Register reg) {
-+  // ELFv2 ABI: r12 must hold the target address at function entry
-+  // so the callee can compute its TOC pointer from r12.
-+  if (reg != CallReg) {
-+    movePtr(reg, CallReg);
-+  }
-+  xs_mtctr(CallReg);
-+  as_bctr(LinkB);
-+  return CodeOffset(currentOffset());
-+}
-+CodeOffset MacroAssembler::call(Label* label) {
-+  // Emits a call to |label|. Every form below is a 10-instruction sequence
-+  // written inside a no-pool window; the returned offset is the position
-+  // right after that sequence.
-+  if (!label->bound()) {
-+    // Unbound label: CallTag stanza = trap + chain + 8 nops (10 instructions
-+    // total). The chain word holds the label's previous use, or
-+    // INVALID_OFFSET when this is the first use.
-+    m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+    BufferOffset tagOffset = xs_trap_tagged(CallTag);
-+    writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);
-+    for (int i = 0; i < 8; i++) {
-+      writeInst(NopInst);
-+    }
-+    m_buffer.leaveNoPool();
-+    if (!oom()) {
-+      label->use(tagOffset.getOffset());
-+    }
-+    return CodeOffset(currentOffset());
-+  }
-+
-+  // Bound label. Open the no-pool window BEFORE computing the displacement.
-+  // enterNoPool() can itself trigger a pending pool flush, advancing
-+  // currentOffset(). A pre-flush displacement emitted at the post-flush
-+  // position would overshoot the target by poolSize bytes.
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  const int32_t toLabel = label->offset() - currentOffset();
-+  // The call instruction goes at inst[9] in the 10-word stanza, so its
-+  // displacement is measured from nine words further on.
-+  const int32_t blDisplacement = toLabel - 9 * (int32_t)sizeof(uint32_t);
-+
-+  if (JOffImm26::IsInRange(blDisplacement)) {
-+    // Short: 9 nops + bl = 10 instructions.
-+    for (int i = 0; i < 9; i++) {
-+      writeInst(NopInst);
-+    }
-+    as_b(JOffImm26(blDisplacement), RelativeBranch, LinkB);
-+    m_buffer.leaveNoPool();
-+    return CodeOffset(currentOffset());
-+  }
-+
-+  // Long call to bound label: stanza(8) + mtctr + bctrl = 10 instructions.
-+  BufferOffset stanza =
-+      emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
-+  xs_mtctr(SecondScratchReg);
-+  as_bctr(LinkB);
-+  m_buffer.leaveNoPool();
-+  addLongJump(stanza, BufferOffset(label->offset()));
-+  return CodeOffset(currentOffset());
-+}
-+// Loads the call target from memory at |addr| into CallReg and calls it.
-+CodeOffset MacroAssembler::call(const Address& addr) {
-+  loadPtr(addr, CallReg);
-+  return call(CallReg);
-+}
-+
-+// Calls the absolute address |target|: loads it into CallReg with a
-+// load stanza, registers a HARDCODED pending jump for the stanza, and
-+// branches-and-links through CTR.
-+void MacroAssembler::call(ImmPtr target) {
-+  uint64_t addr = uintptr_t(target.value);
-+  // stanza(8) + mtctr + bctrl = 10 instructions.
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  BufferOffset bo = emitLoad64Stanza(CallReg, addr);
-+  addPendingJump(bo, target, RelocationKind::HARDCODED);
-+  xs_mtctr(CallReg);
-+  as_bctr(LinkB);
-+  m_buffer.leaveNoPool();
-+}
-+
-+// Moves the symbolic address |target| into CallReg and calls through it.
-+CodeOffset MacroAssembler::call(wasm::SymbolicAddress target) {
-+  movePtr(target, CallReg);
-+  return call(CallReg);
-+}
-+
-+// Performs an ABI call to the function pointer stored at |fun|.
-+void MacroAssembler::callWithABINoProfiler(const Address& fun, ABIType result) {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  // The callee address is loaded before callWithABIPre moves the stack
-+  // pointer and positions the arguments.
-+  // NOTE(review): this relies on callWithABIPre (including its move
-+  // emitter) not clobbering |scratch| — confirm.
-+  loadPtr(fun, scratch);
-+
-+  uint32_t stackAdjust;
-+  callWithABIPre(&stackAdjust);
-+  call(scratch);
-+  callWithABIPost(stackAdjust, result);
-+}
-+
-+// Prepares the stack for an ABI call: reserves space for outgoing stack
-+// arguments plus an LR save slot (padded for alignment), saves LR, and
-+// moves the arguments into place. |*stackAdjust| receives the number of
-+// bytes reserved, to be handed to callWithABIPost.
-+void MacroAssembler::callWithABIPre(uint32_t* stackAdjust, bool callFromWasm) {
-+  MOZ_ASSERT(inCall_);
-+  uint32_t stackForCall = abiArgs_.stackBytesConsumedSoFar();
-+
-+  // Reserve place for LR save.
-+  stackForCall += sizeof(intptr_t);
-+
-+  if (dynamicAlignment_) {
-+    // The stack pointer was already aligned by setupUnalignedABICall, so
-+    // only the reserved size itself needs padding.
-+    stackForCall += ComputeByteAlignment(stackForCall, ABIStackAlignment);
-+  } else {
-+    // Static alignment: pad relative to everything pushed so far, plus the
-+    // wasm frame when called from wasm.
-+    uint32_t alignmentAtPrologue = callFromWasm ? sizeof(wasm::Frame) : 0;
-+    stackForCall += ComputeByteAlignment(
-+        stackForCall + framePushed() + alignmentAtPrologue, ABIStackAlignment);
-+  }
-+
-+  *stackAdjust = stackForCall;
-+  reserveStack(stackForCall);
-+
-+  // Save LR. Restore it in callWithABIPost.
-+  {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    xs_mflr(scratch);
-+    // LR goes in the topmost word of the reserved area.
-+    storePtr(scratch, Address(StackPointer, stackForCall - sizeof(intptr_t)));
-+  }
-+
-+  // Position all arguments.
-+  {
-+    enoughMemory_ &= moveResolver_.resolve();
-+    if (!enoughMemory_) {
-+      return;
-+    }
-+
-+    MoveEmitter emitter(*this);
-+    emitter.emit(moveResolver_);
-+    emitter.finish();
-+  }
-+
-+  assertStackAlignment(ABIStackAlignment);
-+}
-+
-+// Undoes callWithABIPre after the call: restores LR from its save slot and
-+// releases the |stackAdjust| bytes reserved for the call. |result| is not
-+// used here.
-+void MacroAssembler::callWithABIPost(uint32_t stackAdjust, ABIType result) {
-+  {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    // Same slot callWithABIPre stored LR into.
-+    loadPtr(Address(StackPointer, stackAdjust - sizeof(intptr_t)), scratch);
-+    xs_mtlr(scratch);
-+  }
-+
-+  if (dynamicAlignment_) {
-+    // Restore SP from stack (as stored in setupUnalignedABICall).
-+    loadPtr(Address(StackPointer, stackAdjust), StackPointer);
-+    // SP was reloaded rather than incremented, so only the bookkeeping is
-+    // adjusted here.
-+    adjustFrame(-stackAdjust);
-+  } else {
-+    freeStack(stackAdjust);
-+  }
-+
-+#ifdef DEBUG
-+  MOZ_ASSERT(inCall_);
-+  inCall_ = false;
-+#endif
-+}
-+
-+// Value operations.
-+void MacroAssembler::moveValue(const ValueOperand& src,
-+                               const ValueOperand& dest) {
-+  // Register-to-register copy of a boxed Value; emits nothing when both
-+  // operands name the same register.
-+  if (src.valueReg() == dest.valueReg()) {
-+    return;
-+  }
-+  movePtr(src.valueReg(), dest.valueReg());
-+}
-+void MacroAssembler::moveValue(const Value& src, const ValueOperand& dest) {
-+  // Materialises the constant |src| in |dest|. GC things go through a
-+  // patchable move and get a data relocation recorded; everything else is a
-+  // plain immediate move of the raw bits.
-+  const Register out = dest.valueReg();
-+  if (src.isGCThing()) {
-+    CodeOffset patchAt = movWithPatch(ImmWord(src.asRawBits()), out);
-+    writeDataRelocation(patchAt, src);
-+  } else {
-+    movePtr(ImmWord(src.asRawBits()), out);
-+  }
-+}
-+
-+// Branch operations.
-+void MacroAssembler::branchTestValue(Condition cond, const ValueOperand& lhs,
-+                                     const Value& rhs, Label* label) {
-+  // Branches to |label| when the boxed value in |lhs| is (Equal) or is not
-+  // (NotEqual) bit-identical to the constant |rhs|. |rhs| must not be NaN.
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  MOZ_ASSERT(!rhs.isNaN());
-+
-+  // Both kinds of constant need the same scratch register and the same final
-+  // compare; only the way the constant is materialised differs.
-+  UseScratchRegisterScope scratchScope(asMasm());
-+  Register constant = scratchScope.Acquire();
-+  MOZ_ASSERT(lhs.valueReg() != constant);
-+
-+  if (rhs.isGCThing()) {
-+    moveValue(rhs, ValueOperand(constant));
-+  } else {
-+    movePtr(ImmWord(rhs.asRawBits()), constant);
-+  }
-+  branchPtr(cond, lhs.valueReg(), constant, label);
-+}
-+// Branches to |label| when |val|, with its sign bit cleared, is (Equal) or
-+// is not (NotEqual) bit-identical to the canonical NaN with its sign bit
-+// cleared. |temp| is overwritten; one scratch GPR is also used.
-+void MacroAssembler::branchTestNaNValue(Condition cond, const ValueOperand& val,
-+                                        Register temp, Label* label) {
-+  MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  MOZ_ASSERT(val.valueReg() != scratch);
-+
-+  // Strip the IEEE sign bit (LSB-numbering bit 63 = PPC-numbering bit 0)
-+  // with rldicl SH=0, MB=1: rotate by zero (no-op) then keep bits 1..63 of
-+  // PPC-numbering, clearing bit 0. Rotating by 1 instead would also shift
-+  // the quiet-NaN bit out of position and cause 1.5 (0x3FF8...) and NaN
-+  // (0x7FF8...) to collide after masking — bug 1943704 PPC64 regression.
-+  as_rldicl(temp, val.valueReg(), 0, 1);
-+
-+  // Load canonical NaN (with sign bit 0) and strip its sign bit too.
-+  static_assert(JS::detail::CanonicalizedNaNSignBit == 0);
-+  moveValue(DoubleValue(JS::GenericNaN()), ValueOperand(scratch));
-+  as_rldicl(scratch, scratch, 0, 1);
-+
-+  branchPtr(cond, temp, scratch, label);
-+}
-+
-+// Branches to |label| depending on whether the chunk containing |ptr| has a
-+// non-null word at gc::ChunkStoreBufferOffset. |temp| is overwritten with
-+// the chunk base address.
-+void MacroAssembler::branchPtrInNurseryChunk(Condition cond, Register ptr,
-+                                             Register temp, Label* label) {
-+  MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
-+  MOZ_ASSERT(ptr != temp);
-+  MOZ_ASSERT(temp != InvalidReg);
-+
-+  // Mask off the in-chunk offset to get the chunk base.
-+  andPtr(Imm32(int32_t(~gc::ChunkMask)), ptr, temp);
-+  // The comparison is against zero, so the condition is inverted: Equal
-+  // (asked for by the caller) corresponds to a non-zero word.
-+  branchPtr(InvertCondition(cond), Address(temp, gc::ChunkStoreBufferOffset),
-+            ImmWord(0), label);
-+}
-+// Thin forwarder to the shared branchValueIsNurseryCellImpl.
-+void MacroAssembler::branchValueIsNurseryCell(Condition cond,
-+                                              ValueOperand value, Register temp,
-+                                              Label* label) {
-+  branchValueIsNurseryCellImpl(cond, value, temp, label);
-+}
-+
-+// Patching / near address operations.
-+CodeOffset MacroAssembler::nopPatchableToCall() {
-+  // Reserves room for a call stanza by emitting 10 nops in one no-pool
-+  // window: 8 for the 64-bit load, one for mtctr, one for bctrl.
-+  // The returned offset is the one AFTER the stanza (= the return address).
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  for (int i = 0; i < 10; i++) {
-+    writeInst(NopInst);
-+  }
-+  m_buffer.leaveNoPool();
-+  return CodeOffset(currentOffset());
-+}
-+// Emits a patchable 64-bit load stanza into |dest| (placeholder value 0) and
-+// returns the offset of the stanza's first instruction, for later use with
-+// patchNearAddressMove.
-+CodeOffset MacroAssembler::moveNearAddressWithPatch(Register dest) {
-+  // Emit the stanza inside a no-pool window, like the other stanza emitters
-+  // in this file, so a constant-pool flush can neither split it nor land
-+  // between the point where the offset is taken and the first instruction.
-+  // The offset comes from the stanza itself rather than from a
-+  // currentOffset() sampled before emission, which a flush would make stale.
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  BufferOffset stanza = emitLoad64Stanza(dest, 0);
-+  m_buffer.leaveNoPool();
-+  return CodeOffset(stanza.getOffset());
-+}
-+// static
-+void MacroAssembler::patchNearAddressMove(CodeLocationLabel loc,
-+                                          CodeLocationLabel target) {
-+  // Rewrites the load stanza found at |loc| so that it loads the address of
-+  // |target|.
-+  auto* stanza = reinterpret_cast<Instruction*>(loc.raw());
-+  const auto address = reinterpret_cast<uint64_t>(target.raw());
-+  UpdateLoad64Value(stanza, address);
-+}
-+
-+// Return address operations (link register architectures).
-+//
-+// Note: these MUST decrement SP by exactly 8 bytes. wasm::Frame is 16 bytes
-+// (callerFP_ + returnAddress_) and GenerateCallablePrologue pairs this with
-+// push(FramePointer) to match that layout exactly — a 16-byte decrement here
-+// would insert 8 bytes of padding and break FP-chain unwinding. The 8-byte
-+// intermediate misalignment between this save and the following push(FP) is
-+// never observed by a C call (no intervening transition), and any caller that
-+// does make a C call after pushReturnAddress routes through
-+// setupUnalignedABICall which re-aligns.
-+// Copies LR into a scratch GPR and pushes it (one 8-byte slot).
-+void MacroAssembler::pushReturnAddress() {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  xs_mflr(scratch);
-+  push(scratch);
-+}
-+// Pops one slot into a scratch GPR and moves it into LR.
-+void MacroAssembler::popReturnAddress() {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  pop(scratch);
-+  xs_mtlr(scratch);
-+}
-+
-+// ABI setup.
-+// Starts an ABI call from a point where the stack alignment is unknown:
-+// aligns the stack pointer down to ABIStackAlignment and stores the original
-+// stack pointer at the new top of stack. |scratch| is overwritten with the
-+// original stack pointer. Not for use while compiling wasm.
-+void MacroAssembler::setupUnalignedABICall(Register scratch) {
-+  MOZ_ASSERT(!IsCompilingWasm(), "wasm should only use aligned ABI calls");
-+  setupNativeABICall();
-+  dynamicAlignment_ = true;
-+
-+  movePtr(StackPointer, scratch);
-+
-+  // Force sp to be aligned.
-+  // One word is reserved first so there is always a slot for the saved
-+  // stack pointer.
-+  subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
-+  andPtr(Imm32(~(ABIStackAlignment - 1)), StackPointer);
-+  storePtr(scratch, Address(StackPointer, 0));
-+}
-+
-+// ===============================================================
-+// Arithmetic helpers.
-+
-+void MacroAssembler::flexibleDivMod32(Register lhs, Register rhs,
-+                                      Register divOutput, Register remOutput,
-+                                      bool isUnsigned, const LiveRegisterSet&) {
-+  // Computes the 32-bit quotient and remainder of lhs / rhs into |divOutput|
-+  // and |remOutput|, leaving both inputs untouched. Both results are
-+  // sign-extended to 64 bits, except on the INT32_MIN / -1 path, which
-+  // writes its results with move32 and skips the extension. The live
-+  // register set is unused.
-+  MOZ_ASSERT(lhs != divOutput && lhs != remOutput, "lhs is preserved");
-+  MOZ_ASSERT(rhs != divOutput && rhs != remOutput, "rhs is preserved");
-+
-+  Label finished;
-+  if (isUnsigned) {
-+    as_divwu(divOutput, lhs, rhs);
-+  } else {
-+    // PPC64 divw(INT32_MIN, -1) is undefined, so that one case is answered
-+    // directly: quotient=INT32_MIN, remainder=0.
-+    Label ordinaryDivide;
-+    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT32_MIN), &ordinaryDivide);
-+    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &ordinaryDivide);
-+    move32(Imm32(INT32_MIN), divOutput);
-+    move32(Imm32(0), remOutput);
-+    jump(&finished);
-+    bind(&ordinaryDivide);
-+    as_divw(divOutput, lhs, rhs);
-+  }
-+  as_extsw(divOutput, divOutput);
-+
-+  if (!HasPOWER9()) {
-+    // No modulus instruction before POWER9: rem = lhs - (lhs/rhs)*rhs.
-+    as_mullw(remOutput, divOutput, rhs);
-+    as_subf(remOutput, remOutput, lhs);
-+  } else if (isUnsigned) {
-+    as_moduw(remOutput, lhs, rhs);
-+  } else {
-+    as_modsw(remOutput, lhs, rhs);
-+  }
-+  as_extsw(remOutput, remOutput);
-+  bind(&finished);
-+}
-+
-+void MacroAssembler::shiftIndex32AndAdd(Register indexTemp32, int shift,
-+                                        Register pointer) {
-+  // Adds (indexTemp32 << shift) to |pointer|. Shifts that cannot be
-+  // expressed as a BaseIndex scale are applied to |indexTemp32| itself
-+  // (which is then clobbered) before the add.
-+  if (!IsShiftInScaleRange(shift)) {
-+    lshift32(Imm32(shift), indexTemp32);
-+    addPtr(indexTemp32, pointer);
-+    return;
-+  }
-+  const BaseIndex scaled(pointer, indexTemp32, ShiftToScale(shift));
-+  computeEffectiveAddress(scaled, pointer);
-+}
-+
-+// Converts the 64-bit integer in |src| to a double in |dest|: move the
-+// bits into the FPR, then convert in place.
-+void MacroAssembler::convertInt64ToDouble(Register64 src, FloatRegister dest) {
-+  as_mtvsrd(dest, src.reg);
-+  as_fcfid(dest, dest);
-+}
-+
-+// Rounds the double in |src| to an integral double in |dest| using |mode|.
-+// The NearestTiesToEven case clobbers ScratchDoubleReg and one scratch GPR;
-+// the other modes are a single instruction each.
-+void MacroAssembler::nearbyIntDouble(RoundingMode mode, FloatRegister src,
-+                                     FloatRegister dest) {
-+  switch (mode) {
-+    case RoundingMode::NearestTiesToEven: {
-+      // PPC64's frin rounds ties away from zero, NOT to even (ISA v3.1).
-+      // Use fctid+fcfid which uses FPSCR RN (default = round-to-nearest-even).
-+      // Guard: if |src| >= 2^52, value is already integral (or NaN/Inf) —
-+      // just copy src. This preserves NaN, Inf, and -0.
-+      // Check via integer exponent extraction to avoid FP temp conflicts.
-+      Label done;
-+      UseScratchRegisterScope temps(*this);
-+      Register scratch = temps.Acquire();
-+      // Work from a private copy of the input so that writing |dest| cannot
-+      // disturb it when src == dest.
-+      moveDouble(src, ScratchDoubleReg);
-+      if (src != dest) {
-+        moveDouble(src, dest);
-+      }
-+      if (HasPOWER9()) {
-+        // xsxexpdp lays the 11-bit biased exponent in XT.dw0 with the
-+        // rest zeroed, so mfvsrd reads it directly — drops the
-+        // srdi+andi. masking pair.
-+        // NOTE(review): ScratchDoubleReg is read again below by
-+        // fctid/fcpsgn, so this relies on expScratch being a different
-+        // physical register — confirm. Also confirm the destination
-+        // operand kind of xsxexpdp against the Power ISA.
-+        ScratchSimd128Scope expScratch(*this);
-+        as_xsxexpdp(expScratch, ScratchDoubleReg);
-+        as_mfvsrd(scratch, expScratch);
-+      } else {
-+        as_mfvsrd(scratch, ScratchDoubleReg);
-+        x_srdi(scratch, scratch, 52);
-+        as_andi_rc(scratch, scratch, 0x7FF);
-+      }
-+      // Biased exponent >= 1075 (= 1023+52) means |val| >= 2^52.
-+      // Also catches Inf (exp=2047) and NaN (exp=2047).
-+      ma_cmp(scratch, Imm32(1075), Assembler::GreaterThanOrEqual);
-+      ma_b(Assembler::GreaterThanOrEqual, &done);
-+      as_fctid(dest, ScratchDoubleReg);
-+      as_fcfid(dest, dest);
-+      // Re-apply the input's sign so that results that round to zero keep
-+      // the sign of the input.
-+      as_fcpsgn(dest, ScratchDoubleReg, dest);
-+      bind(&done);
-+      break;
-+    }
-+    case RoundingMode::TowardsZero:
-+      as_friz(dest, src);
-+      break;
-+    case RoundingMode::Up:
-+      as_frip(dest, src);
-+      break;
-+    case RoundingMode::Down:
-+      as_frim(dest, src);
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected rounding mode");
-+  }
-+}
-+
-+// Float32 flavour of nearbyIntDouble: rounds |src| to an integral value in
-+// |dest| using |mode|, then narrows the result back to single precision.
-+void MacroAssembler::nearbyIntFloat32(RoundingMode mode, FloatRegister src,
-+                                      FloatRegister dest) {
-+  // PPC FP rounding instructions operate on double-precision.
-+  // For single-precision, we round as double then round back to single.
-+  // The frsp instruction handles the double->single conversion.
-+  nearbyIntDouble(mode, src, dest);
-+  as_frsp(dest, dest);
-+}
-+
-+// ===============================================================
-+// Far jump support.
-+
-+// Emits an indirect jump through CTR whose target address is a placeholder
-+// (0) to be filled in later. Returns the offset of the start of the load
-+// stanza, which is what the patchFarJump overloads take.
-+CodeOffset MacroAssembler::farJumpWithPatch() {
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // stanza(8) + mtctr + bctr = 10 instructions.
-+  CodeOffset loadOffset(currentOffset());
-+  // Keep the whole sequence contiguous: no constant pool may be dumped in
-+  // the middle of it.
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  emitLoad64Stanza(scratch, 0);
-+  xs_mtctr(scratch);
-+  as_bctr();
-+  m_buffer.leaveNoPool();
-+
-+  return loadOffset;
-+}
-+
-+// ===============================================================
-+// Forwards to the underlying Assembler's flush().
-+void MacroAssembler::flush() { Assembler::flush(); }
-+
-+// Wasm support.
-+
-+// Emits a trap instruction and returns the offset at which it was emitted.
-+FaultingCodeOffset MacroAssembler::wasmTrapInstruction() {
-+  // Flush any pending pool first so the recorded offset is the offset of the
-+  // trap instruction itself.
-+  m_buffer.flushPool();  // see comment in wasmLoadImpl
-+  FaultingCodeOffset fco = FaultingCodeOffset(currentOffset());
-+  xs_trap();
-+  return fco;
-+}
-+
-+// PPC64 SlowCallMarker: a NOP-like instruction that won't appear in normal
-+// code generation. The canonical no-op `ori r0, r0, 0` (0x60000000) is
-+// PPC_nop itself and so cannot serve as a marker; use the distinguishable
-+// encoding `ori r12, r12, 0` = 0x618C0000 instead.
-+static const int32_t SlowCallMarker = 0x618C0000;
-+
-+// Emits the slow-call marker instruction at the current position.
-+// NOTE(review): the emitted word only equals SlowCallMarker if CallReg is
-+// r12 — confirm against the register definitions.
-+void MacroAssembler::wasmMarkCallAsSlow() {
-+  // Emit: ori r12, r12, 0
-+  as_ori(CallReg, CallReg, 0);
-+}
-+
-+// Loads the 32-bit instruction word at the return address |ra_| and jumps to
-+// |notSlow| unless it is the slow-call marker. |temp2| is clobbered; |temp1|
-+// is not used by this implementation.
-+void MacroAssembler::wasmCheckSlowCallsite(Register ra_, Label* notSlow,
-+                                           Register temp1, Register temp2) {
-+  MOZ_ASSERT(ra_ != temp2);
-+  load32(Address(ra_, 0), temp2);
-+  branch32(Assembler::NotEqual, temp2, Imm32(SlowCallMarker), notSlow);
-+}
-+
-+// Emits a call through |reg| followed immediately by the slow-call marker,
-+// so the marker sits at the call's return address. Returns the offset
-+// produced by call().
-+CodeOffset MacroAssembler::wasmMarkedSlowCall(const wasm::CallSiteDesc& desc,
-+                                              const Register reg) {
-+  CodeOffset offset = call(desc, reg);
-+  wasmMarkCallAsSlow();
-+  return offset;
-+}
-+
-+// ===============================================================
-+// Additional stack operations.
-+
-+// Pushes |f| and records the extra sizeof(double) bytes in the tracked frame
-+// size.
-+void MacroAssembler::Push(FloatRegister f) {
-+  push(f);
-+  adjustFrame(int32_t(sizeof(double)));
-+}
-+// Pops into |f| and removes sizeof(double) bytes from the tracked frame size.
-+void MacroAssembler::Pop(FloatRegister f) {
-+  pop(f);
-+  adjustFrame(-int32_t(sizeof(double)));
-+}
-+// Pops a boxed Value into |val| and removes sizeof(Value) bytes from the
-+// tracked frame size.
-+void MacroAssembler::Pop(const ValueOperand& val) {
-+  popValue(val);
-+  adjustFrame(-int32_t(sizeof(Value)));
-+}
-+
-+// static
-+// Returns the number of stack bytes needed to save every register in |set|:
-+// one pointer-sized slot per GPR plus whatever the FPU set reports as its
-+// push size.
-+size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
-+  const size_t gprBytes = set.gprs().size() * sizeof(intptr_t);
-+  const size_t fpuBytes = set.fpus().getPushSizeInBytes();
-+  return gprBytes + fpuBytes;
-+}
-+
-+// Stores every register in |set| to memory ending at |dest|: |dest| names the
-+// high end of the save area and the registers are written at decreasing
-+// offsets, GPRs first, then FPU registers. |scratch| is not used by this
-+// implementation.
-+void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
-+                                     Register scratch) {
-+  FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
-+  // Debug-only bookkeeping used to check that exactly the expected number of
-+  // bytes is written.
-+  mozilla::DebugOnly<unsigned> numFpu = fpuSet.size();
-+  mozilla::DebugOnly<int32_t> diffF = fpuSet.getPushSizeInBytes();
-+  mozilla::DebugOnly<int32_t> diffG = set.gprs().size() * sizeof(intptr_t);
-+
-+  // There must be room below dest.offset for the whole save area.
-+  MOZ_ASSERT(dest.offset >= diffG + diffF);
-+
-+  for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
-+    diffG -= sizeof(intptr_t);
-+    dest.offset -= sizeof(intptr_t);
-+    storePtr(*iter, dest);
-+  }
-+  MOZ_ASSERT(diffG == 0);
-+
-+  // Natural per-kind slot. See PushRegsInMask comment.
-+  for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
-+    FloatRegister reg = *iter;
-+    diffF -= reg.size();
-+    numFpu -= 1;
-+    dest.offset -= reg.size();
-+    // SIMD registers take a full vector store; everything else is saved as a
-+    // double.
-+    if (reg.isSimd128()) {
-+      storeUnalignedSimd128(reg, dest);
-+    } else {
-+      storeDouble(reg.asDouble(), dest);
-+    }
-+  }
-+  MOZ_ASSERT(diffF == 0);
-+}
-+
-+// Resets the stack pointer so that exactly |framePushed| bytes remain pushed
-+// below the frame pointer, and updates the tracked frame size to match.
-+// |framePushed| must not exceed the currently tracked size.
-+void MacroAssembler::freeStackTo(uint32_t framePushed) {
-+  MOZ_ASSERT(framePushed <= framePushed_);
-+  // SP = FP - framePushed
-+  movePtr(FramePointer, StackPointer);
-+  if (framePushed) {
-+    subPtr(Imm32(framePushed), StackPointer);
-+  }
-+  framePushed_ = framePushed;
-+}
-+
-+// ===============================================================
-+// Additional call / patch operations.
-+
-+// Emits a call to the JIT code object |c|: its raw entry address is loaded
-+// into a scratch register with the load64 stanza, the stanza is recorded as
-+// a JITCODE relocation, and the call goes through that register.
-+void MacroAssembler::call(JitCode* c) {
-+  UseScratchRegisterScope scratchScope(asMasm());
-+  Register target = scratchScope.Acquire();
-+
-+  const uint64_t entry = uintptr_t(c->raw());
-+  BufferOffset stanza = emitLoad64Stanza(target, entry);
-+  addPendingJump(stanza, ImmPtr(c->raw()), RelocationKind::JITCODE);
-+
-+  callJitNoProfiler(target);
-+}
-+
-+// Reserves a CallTag-sized stanza of ten nops that patchCall later turns
-+// into a real call. The returned offset is the one AFTER the stanza, i.e.
-+// the return address once the final slot has become a `bl`.
-+CodeOffset MacroAssembler::callWithPatch() {
-+  m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+  for (int slot = 0; slot < 10; slot++) {
-+    writeInst(NopInst);
-+  }
-+  m_buffer.leaveNoPool();
-+  return CodeOffset(currentOffset());
-+}
-+
-+// Fills in the stanza reserved by callWithPatch so that it calls
-+// |calleeOffset|. |callerOffset| is the offset AFTER the 10-instruction
-+// stanza (the return address). A near target becomes nine nops plus a `bl`
-+// in the last slot; a far target becomes a load64 sequence followed by
-+// mtctr / bctrl, registered as a long jump.
-+void MacroAssembler::patchCall(uint32_t callerOffset, uint32_t calleeOffset) {
-+  // Step back over the stanza to find its first instruction.
-+  uint32_t stanzaOffset = callerOffset - 10 * sizeof(uint32_t);
-+  Instruction* stanza =
-+      (Instruction*)(m_buffer.getInst(BufferOffset(stanzaOffset)));
-+
-+  // The branch lives in slot 9 (stanzaOffset + 36); its displacement is
-+  // relative to that slot.
-+  intptr_t branchSlot = (intptr_t)stanzaOffset + 9 * (intptr_t)sizeof(uint32_t);
-+  intptr_t displacement = (intptr_t)calleeOffset - branchSlot;
-+
-+  if (!JOffImm26::IsInRange(displacement)) {
-+    // Out of `bl` range: go through CTR instead.
-+    addLongJump(BufferOffset(stanzaOffset), BufferOffset(calleeOffset));
-+    WriteLoad64Instructions(stanza, SecondScratchReg,
-+                            LabelBase::INVALID_OFFSET);
-+    stanza[8].makeOp_mtctr(SecondScratchReg);
-+    stanza[9].makeOp_bctr(LinkB);
-+    return;
-+  }
-+
-+  for (int slot = 0; slot < 9; slot++) {
-+    stanza[slot].makeNop();
-+  }
-+  stanza[9].setData(PPC_b | JOffImm26(displacement).encode() | LinkB);
-+}
-+
-+// Re-targets the far jump emitted at |farJump| to |targetOffset|: the load
-+// stanza is rewritten with a placeholder value and registered as a long
-+// jump to the target.
-+void MacroAssembler::patchFarJump(CodeOffset farJump, uint32_t targetOffset) {
-+  Instruction* stanza =
-+      (Instruction*)m_buffer.getInst(BufferOffset(farJump.offset()));
-+
-+  // Recover the destination register from the existing stanza. Both shapes
-+  // keep rD in bits [21..25] of their first register-touching slot:
-+  // P8 = mflr rD in slot 2, P9+ = addpcis rD in slot 0. The major opcode of
-+  // slot 0 tells them apart (31 = mfspr, 19 = addpcis).
-+  uint32_t firstWord = stanza[0].encode();
-+  uint32_t regCode;
-+  if (((firstWord >> 26) & 0x3f) == 19) {
-+    regCode = (firstWord >> 21) & 0x1f;
-+  } else {
-+    regCode = (stanza[2].encode() >> 21) & 0x1f;
-+  }
-+
-+  WriteLoad64Instructions(stanza, Register::FromCode(regCode),
-+                          LabelBase::INVALID_OFFSET);
-+  addLongJump(BufferOffset(farJump.offset()), BufferOffset(targetOffset));
-+}
-+
-+// static
-+// Patches already-emitted code: rewrites the 64-bit value loaded by the
-+// stanza at |farJump| to the address |target|, then flushes the instruction
-+// cache for the eight instructions of the load sequence.
-+void MacroAssembler::patchFarJump(uint8_t* farJump, uint8_t* target) {
-+  UpdateLoad64Value((Instruction*)farJump, (uint64_t)(uintptr_t)target);
-+  FlushICache(farJump, 8 * sizeof(Instruction));
-+}
-+
-+// static
-+// Turns the 10-instruction stanza ending at |callsite| into an indirect call
-+// to |target|: a load64 of the address into SecondScratchReg followed by
-+// mtctr / bctrl, then flushes the instruction cache for the stanza.
-+void MacroAssembler::patchNopToCall(uint8_t* callsite, uint8_t* target) {
-+  // callsite points AFTER the 10-instruction stanza. Subtract to find start.
-+  Instruction* inst = (Instruction*)callsite - 10;
-+  WriteLoad64Instructions(inst, SecondScratchReg, (uint64_t)(uintptr_t)target);
-+  inst[8].makeOp_mtctr(SecondScratchReg);
-+  inst[9].makeOp_bctr(LinkB);
-+  FlushICache(inst, 10 * sizeof(Instruction));
-+}
-+
-+// static
-+// Overwrites the 10-instruction call stanza that ends at |callsite| with
-+// nops and flushes the instruction cache for it.
-+void MacroAssembler::patchCallToNop(uint8_t* callsite) {
-+  // |callsite| is the address just past the stanza; walk back to its start.
-+  Instruction* stanzaEnd = (Instruction*)callsite;
-+  Instruction* stanzaBegin = stanzaEnd - 10;
-+  for (Instruction* cur = stanzaBegin; cur != stanzaEnd; cur++) {
-+    cur->makeNop();
-+  }
-+  FlushICache(stanzaBegin, 10 * sizeof(Instruction));
-+}
-+
-+// Patches the load sequence at |offset| (as emitted by move32WithPatch) so
-+// that it loads |n|. The 32-bit value is sign-extended to 64 bits before
-+// being written.
-+void MacroAssembler::patchMove32(CodeOffset offset, Imm32 n) {
-+  // Patch an 8-instruction load64 sequence with a 32-bit value.
-+  Instruction* inst =
-+      (Instruction*)m_buffer.getInst(BufferOffset(offset.offset()));
-+  UpdateLoad64Value(inst, uint64_t(int64_t(n.value)));
-+}
-+
-+// Pushes the address of the instruction that follows this sequence, as if a
-+// call had just returned there. |scratch| is clobbered. Returns the code
-+// offset corresponding to the pushed address.
-+uint32_t MacroAssembler::pushFakeReturnAddress(Register scratch) {
-+  CodeLabel cl;
-+
-+  // Use mov(CodeLabel*, Register) which always emits a full 8-instruction
-+  // load64 sequence (via NOPs + WriteLoad64Instructions). This is critical
-+  // because movePtr(ImmWord(0)) would optimize to a single li instruction,
-+  // but processCodeLabels->Bind->UpdateLoad64Value expects the full
-+  // 8-instruction literal pool sequence at the patchAt offset.
-+  mov(&cl, scratch);
-+
-+  Push(scratch);
-+
-+  // The label is bound right after the push, so the value loaded above
-+  // resolves to this point.
-+  bind(&cl);
-+  uint32_t retAddr = currentOffset();
-+
-+  addCodeLabel(cl);
-+  return retAddr;
-+}
-+
-+// Performs an ABI call to the function whose address is in |fun|. The
-+// address is first copied to a scratch register because callWithABIPre may
-+// clobber |fun|; the call then goes through that copy.
-+void MacroAssembler::callWithABINoProfiler(Register fun, ABIType result) {
-+  UseScratchRegisterScope scratchScope(asMasm());
-+  Register callee = scratchScope.Acquire();
-+  movePtr(fun, callee);
-+
-+  uint32_t stackAdjust;
-+  callWithABIPre(&stackAdjust);
-+  call(callee);
-+  callWithABIPost(stackAdjust, result);
-+}
-+
-+// ===============================================================
-+// Additional arithmetic helpers.
-+
-+// Computes the 32-bit remainder dest = lhs % rhs (signed unless
-+// |isUnsigned|) and sign-extends the result into the full register. The
-+// signed INT32_MIN % -1 case, for which the hardware result is undefined,
-+// is special-cased to produce 0. Division by zero is not handled here.
-+void MacroAssembler::flexibleRemainder32(Register lhs, Register rhs,
-+                                         Register dest, bool isUnsigned,
-+                                         const LiveRegisterSet&) {
-+  // rem = lhs - (lhs/rhs)*rhs
-+  // PPC64 divw(INT32_MIN, -1) is undefined; result is 0.
-+  Label done;
-+  if (!isUnsigned) {
-+    // Compare as 32-bit quantities: the operands are int32 values, and a
-+    // full-width pointer compare would only match when the upper 32 bits of
-+    // the registers happen to hold the sign extension.
-+    Label notMinOverflow;
-+    branch32(Assembler::NotEqual, lhs, Imm32(INT32_MIN), &notMinOverflow);
-+    branch32(Assembler::NotEqual, rhs, Imm32(-1), &notMinOverflow);
-+    move32(Imm32(0), dest);
-+    jump(&done);
-+    bind(&notMinOverflow);
-+  }
-+  if (HasPOWER9()) {
-+    // POWER9 has native modulo instructions.
-+    if (isUnsigned) {
-+      as_moduw(dest, lhs, rhs);
-+    } else {
-+      as_modsw(dest, lhs, rhs);
-+    }
-+  } else {
-+    // Older cores: divide, multiply back, subtract.
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+    if (isUnsigned) {
-+      as_divwu(scratch, lhs, rhs);
-+    } else {
-+      as_divw(scratch, lhs, rhs);
-+    }
-+    as_mullw(scratch, scratch, rhs);
-+    as_subf(dest, scratch, lhs);
-+  }
-+  as_extsw(dest, dest);
-+  bind(&done);
-+}
-+
-+// Computes the pointer-width quotient dest = lhs / rhs, signed unless
-+// |isUnsigned|. The hardware result of divd(INT64_MIN, -1) is undefined, so
-+// the signed path returns INT64_MIN for that pair, matching what
-+// ARM64/LoongArch64 sdiv produces.
-+void MacroAssembler::flexibleQuotientPtr(Register lhs, Register rhs,
-+                                         Register dest, bool isUnsigned,
-+                                         const LiveRegisterSet&) {
-+  Label finished;
-+  if (isUnsigned) {
-+    as_divdu(dest, lhs, rhs);
-+  } else {
-+    // Anything other than INT64_MIN / -1 goes straight to the divide.
-+    Label regularDivide;
-+    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), &regularDivide);
-+    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &regularDivide);
-+    movePtr(ImmWord(INT64_MIN), dest);
-+    jump(&finished);
-+    bind(&regularDivide);
-+    as_divd(dest, lhs, rhs);
-+  }
-+  bind(&finished);
-+}
-+
-+// Computes the pointer-width remainder dest = lhs % rhs, signed unless
-+// |isUnsigned|. The signed INT64_MIN % -1 pair (undefined in hardware)
-+// yields 0. POWER9 uses the native modulo instructions; older cores use
-+// rem = lhs - (lhs / rhs) * rhs.
-+void MacroAssembler::flexibleRemainderPtr(Register lhs, Register rhs,
-+                                          Register dest, bool isUnsigned,
-+                                          const LiveRegisterSet&) {
-+  Label finished;
-+  if (!isUnsigned) {
-+    Label noOverflow;
-+    branchPtr(Assembler::NotEqual, lhs, ImmWord(INT64_MIN), &noOverflow);
-+    branchPtr(Assembler::NotEqual, rhs, ImmWord(-1), &noOverflow);
-+    movePtr(ImmWord(0), dest);
-+    jump(&finished);
-+    bind(&noOverflow);
-+  }
-+
-+  if (!HasPOWER9()) {
-+    UseScratchRegisterScope scratchScope(asMasm());
-+    Register quotient = scratchScope.Acquire();
-+    if (isUnsigned) {
-+      as_divdu(quotient, lhs, rhs);
-+    } else {
-+      as_divd(quotient, lhs, rhs);
-+    }
-+    // quotient * rhs, then lhs minus that product.
-+    as_mulld(quotient, quotient, rhs);
-+    as_subf(dest, quotient, lhs);
-+  } else if (isUnsigned) {
-+    as_modud(dest, lhs, rhs);
-+  } else {
-+    as_modsd(dest, lhs, rhs);
-+  }
-+  bind(&finished);
-+}
-+
-+// ===============================================================
-+// Rounding helpers.
-+
-+// Computes floor(src) as an int32 in |dest|. Jumps to |fail| when the
-+// rounded value does not fit in an int32, or when the result is zero but the
-+// input has its sign bit or exponent MSB set (e.g. -0 or NaN).
-+void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest,
-+                                        Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // Round toward negative infinity, then convert to int64.
-+  as_frim(fpscratch, src);
-+  as_fctidz(fpscratch, fpscratch);
-+  as_mfvsrd(dest, fpscratch);
-+
-+  // Check if result fits in int32.
-+  // (The 64-bit result must equal the sign extension of its low word.)
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  ma_b(NotEqual, fail);
-+
-+  // Check for -0 and NaN when result is zero.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    // If top 2 bits of src are set, it's negative or NaN.
-+    as_mfvsrd(dest, src);
-+    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
-+    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
-+    // On the success path dest is therefore left holding 0 again.
-+    as_rldicl_rc(dest, dest, 2, 62);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Float32 flavour of floorDoubleToInt32: computes floor(src) as an int32 in
-+// |dest|, jumping to |fail| when the value does not fit in an int32 or when
-+// a zero result comes from an input that is not a small non-negative value.
-+void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest,
-+                                         Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // PPC FP rounding works on doubles. Single-precision FPRs are
-+  // already in double-width registers, so frim works fine.
-+  as_frim(fpscratch, src);
-+  as_fctidz(fpscratch, fpscratch);
-+  as_mfvsrd(dest, fpscratch);
-+
-+  // Check if result fits in int32.
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  ma_b(NotEqual, fail);
-+
-+  // Check for -0 and NaN when result is zero.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    // src is held in the FPR as a 64-bit double (lfs widens float32 to
-+    // double on load), so the same top-2-bits check used for doubles
-+    // applies: bit 63 = sign, bit 62 = exponent MSB. Nonzero means -0,
-+    // ±Inf, NaN, or a large magnitude — none of which is +0.
-+    as_mfvsrd(dest, src);
-+    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
-+    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
-+    // On the success path dest is therefore left holding 0 again.
-+    as_rldicl_rc(dest, dest, 2, 62);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Computes ceil(src) as an int32 in |dest|. Jumps to |fail| when the rounded
-+// value does not fit in an int32, or when the result is zero and the input
-+// is anything other than +0 (the only input whose bit pattern is all zero).
-+void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest,
-+                                       Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // Round toward positive infinity, then convert to int64.
-+  as_frip(fpscratch, src);
-+  as_fctidz(fpscratch, fpscratch);
-+  as_mfvsrd(dest, fpscratch);
-+
-+  // Check if result fits in int32.
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  ma_b(NotEqual, fail);
-+
-+  // Check for (-1, -0] and NaN when result is zero.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    // If binary value is not zero, input was not 0 (could be -0 or NaN).
-+    // On success dest holds the raw bits of +0, i.e. 0.
-+    as_mfvsrd(dest, src);
-+    as_cmpdi(dest, 0);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Float32 flavour of ceilDoubleToInt32: computes ceil(src) as an int32 in
-+// |dest|, jumping to |fail| when the value does not fit in an int32 or when
-+// the result is zero and the input's bit pattern is not all zero.
-+void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest,
-+                                        Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // Round toward positive infinity, then convert to int64.
-+  as_frip(fpscratch, src);
-+  as_fctidz(fpscratch, fpscratch);
-+  as_mfvsrd(dest, fpscratch);
-+
-+  // Check if result fits in int32.
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  ma_b(NotEqual, fail);
-+
-+  // Check for (-1, -0] and NaN when result is zero.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    // Only +0 has an all-zero bit pattern; anything else fails.
-+    as_mfvsrd(dest, src);
-+    as_cmpdi(dest, 0);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Truncates |src| toward zero to an int32 in |dest|. Jumps to |fail| when
-+// the value does not fit in an int32, or when the result is zero but the
-+// input has its sign bit or exponent MSB set (e.g. -0 or a small negative).
-+void MacroAssembler::truncDoubleToInt32(FloatRegister src, Register dest,
-+                                        Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // Convert to int64, rounding toward zero.
-+  as_fctidz(fpscratch, src);
-+  as_mfvsrd(dest, fpscratch);
-+
-+  // Check if result fits in int32.
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  ma_b(NotEqual, fail);
-+
-+  // Check for -0 and NaN when result is zero.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    as_mfvsrd(dest, src);
-+    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
-+    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
-+    // On the success path dest is therefore left holding 0 again.
-+    as_rldicl_rc(dest, dest, 2, 62);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Float32 flavour of truncDoubleToInt32: truncates |src| toward zero to an
-+// int32 in |dest|, jumping to |fail| when the value does not fit in an int32
-+// or when a zero result comes from an input with its sign bit or exponent
-+// MSB set.
-+void MacroAssembler::truncFloat32ToInt32(FloatRegister src, Register dest,
-+                                         Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  // Convert to int64, rounding toward zero.
-+  as_fctidz(fpscratch, src);
-+  as_mfvsrd(dest, fpscratch);
-+
-+  // Check if result fits in int32.
-+  as_extsw(scratch, dest);
-+  as_cmpd(dest, scratch);
-+  ma_b(NotEqual, fail);
-+
-+  // Check for -0 and NaN when result is zero.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    as_mfvsrd(dest, src);
-+    // rldicl. = x_srdi + record form: dest = top 2 bits, CR0[eq]=(dest==0).
-+    // Folds the explicit cmpdi src,0 that would otherwise drive the branch.
-+    // On the success path dest is therefore left holding 0 again.
-+    as_rldicl_rc(dest, dest, 2, 62);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Rounds |src| to the nearest int32 in |dest|, with ties going toward
-+// +Infinity (computed as floor(src + biggest double below 0.5)). Jumps to
-+// |fail| when the result would be -0 or does not fit in an int32. |temp| is
-+// clobbered.
-+void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest,
-+                                        FloatRegister temp, Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  Label performRound;
-+
-+  // Branch for non-negative inputs (this includes -0, which compares equal
-+  // to 0 and is rejected by the zero check at the end).
-+  zeroDouble(fpscratch);
-+  branchDouble(DoubleGreaterThanOrEqual, src, fpscratch, &performRound);
-+
-+  // Input is negative (or unordered). Anything in [-0.5, 0) would round to
-+  // -0, which an int32 cannot represent.
-+  loadConstantDouble(-0.5, fpscratch);
-+  branchDouble(DoubleGreaterThanOrEqual, src, fpscratch, fail);
-+
-+  // Remaining inputs fall through into the rounding sequence.
-+  bind(&performRound);
-+  {
-+    loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
-+    as_fadd(fpscratch, src, temp);
-+    as_frim(fpscratch, fpscratch);
-+    as_fctidz(fpscratch, fpscratch);
-+    as_mfvsrd(dest, fpscratch);
-+
-+    // Check if result fits in int32.
-+    as_extsw(scratch, dest);
-+    as_cmpd(dest, scratch);
-+    ma_b(NotEqual, fail);
-+  }
-+
-+  // Check for -0 when result is zero. Negative inputs that round to zero
-+  // were rejected above, so only the sign bit (and exponent MSB) of src need
-+  // testing; small positive inputs such as 0.25 must still succeed with 0.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    as_mfvsrd(dest, src);
-+    // rldicl. leaves the top 2 bits of src in dest and sets CR0, so dest is
-+    // back to 0 on the success path.
-+    as_rldicl_rc(dest, dest, 2, 62);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// Float32 flavour of roundDoubleToInt32: rounds |src| to the nearest int32
-+// in |dest| with ties going toward +Infinity (floor(src + biggest float
-+// below 0.5)). Jumps to |fail| when the result would be -0 or does not fit
-+// in an int32. |temp| is clobbered.
-+void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest,
-+                                         FloatRegister temp, Label* fail) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+
-+  Label performRound;
-+
-+  // Branch for non-negative inputs (this includes -0, which compares equal
-+  // to 0 and is rejected by the zero check at the end).
-+  loadConstantFloat32(0.0f, fpscratch);
-+  branchFloat(DoubleGreaterThanOrEqual, src, fpscratch, &performRound);
-+
-+  // Input is negative (or unordered). Anything in [-0.5, 0) would round to
-+  // -0, which an int32 cannot represent.
-+  loadConstantFloat32(-0.5f, fpscratch);
-+  branchFloat(DoubleGreaterThanOrEqual, src, fpscratch, fail);
-+
-+  // Remaining inputs fall through into the rounding sequence.
-+  bind(&performRound);
-+  {
-+    // The constant must be the biggest *float* below 0.5. Narrowing the
-+    // biggest double below 0.5 to float rounds up to exactly 0.5f, which
-+    // would make inputs just under 0.5 round to 1 instead of 0.
-+    loadConstantFloat32(GetBiggestNumberLessThan(0.5f), temp);
-+    as_fadds(fpscratch, src, temp);
-+    as_frim(fpscratch, fpscratch);
-+    as_fctidz(fpscratch, fpscratch);
-+    as_mfvsrd(dest, fpscratch);
-+
-+    // Check if result fits in int32.
-+    as_extsw(scratch, dest);
-+    as_cmpd(dest, scratch);
-+    ma_b(NotEqual, fail);
-+  }
-+
-+  // Check for -0 when result is zero. Negative inputs that round to zero
-+  // were rejected above, so only the sign bit (and exponent MSB) of src need
-+  // testing; small positive inputs such as 0.25f must still succeed with 0.
-+  Label notZero;
-+  as_cmpdi(dest, 0);
-+  ma_b(NotEqual, &notZero);
-+  {
-+    as_mfvsrd(dest, src);
-+    // rldicl. leaves the top 2 bits of src in dest and sets CR0, so dest is
-+    // back to 0 on the success path.
-+    as_rldicl_rc(dest, dest, 2, 62);
-+    ma_b(NotEqual, fail);
-+  }
-+  bind(&notZero);
-+}
-+
-+// ===============================================================
-+// FP conversion / copy-sign.
-+
-+// Converts the pointer-width signed integer in |src| to a double in |dest|
-+// by reusing the 64-bit integer conversion.
-+void MacroAssembler::convertIntPtrToDouble(Register src, FloatRegister dest) {
-+  convertInt64ToDouble(Register64(src), dest);
-+}
-+
-+// Writes to |output| the magnitude of |lhs| combined with the sign of |rhs|.
-+void MacroAssembler::copySignDouble(FloatRegister lhs, FloatRegister rhs,
-+                                    FloatRegister output) {
-+  // fcpsgn frt, fra, frb: copies sign of fra to magnitude of frb.
-+  // lhs = magnitude source, rhs = sign source.
-+  as_fcpsgn(output, rhs, lhs);
-+}
-+
-+// Float32 flavour of copySignDouble: |output| gets the magnitude of |lhs|
-+// with the sign of |rhs|, using the same fcpsgn instruction.
-+void MacroAssembler::copySignFloat32(FloatRegister lhs, FloatRegister rhs,
-+                                     FloatRegister output) {
-+  as_fcpsgn(output, rhs, lhs);
-+}
-+
-+// ===============================================================
-+// GC / nursery helpers.
-+
-+// Loads into |buffer| the store-buffer pointer of the GC chunk containing
-+// |ptr|.
-+void MacroAssembler::loadStoreBuffer(Register ptr, Register buffer) {
-+  // Mask off the in-chunk offset to get the chunk base, then read the
-+  // store-buffer field at its fixed offset.
-+  andPtr(Imm32(int32_t(~gc::ChunkMask)), ptr, buffer);
-+  loadPtr(Address(buffer, gc::ChunkStoreBufferOffset), buffer);
-+}
-+
-+// Address overload: forwards to the shared template implementation.
-+void MacroAssembler::branchValueIsNurseryCell(Condition cond,
-+                                              const Address& address,
-+                                              Register temp, Label* label) {
-+  branchValueIsNurseryCellImpl(cond, address, temp, label);
-+}
-+
-+// Branches to |label| depending on whether |value| is a GC thing whose chunk
-+// has a non-null store buffer. |cond| must be Equal (branch when it is) or
-+// NotEqual (branch when it is not). |temp| is clobbered.
-+template <typename T>
-+void MacroAssembler::branchValueIsNurseryCellImpl(Condition cond,
-+                                                  const T& value, Register temp,
-+                                                  Label* label) {
-+  MOZ_ASSERT(cond == Assembler::Equal || cond == Assembler::NotEqual);
-+  MOZ_ASSERT(temp != InvalidReg);
-+
-+  // A non-GC-thing answers the question immediately: skip everything for
-+  // Equal, take the branch for NotEqual.
-+  Label skip;
-+  Label* notGCThing = label;
-+  if (cond == Assembler::Equal) {
-+    notGCThing = &skip;
-+  }
-+  branchTestGCThing(Assembler::NotEqual, value, notGCThing);
-+
-+  // Read the chunk's store-buffer pointer and compare it against null.
-+  getGCThingValueChunk(value, temp);
-+  loadPtr(Address(temp, gc::ChunkStoreBufferOffset), temp);
-+  branchPtr(InvertCondition(cond), temp, ImmWord(0), label);
-+
-+  bind(&skip);
-+}
-+
-+// ===============================================================
-+// Template instantiations.
-+
-+// Stores |value|, whose static type is |valueType|, to |dest| as a boxed
-+// Value. Doubles are boxed from their FP register; constants are stored
-+// directly; everything else is tagged from its GPR.
-+template <typename T>
-+void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
-+                                       MIRType valueType, const T& dest) {
-+  MOZ_ASSERT(valueType < MIRType::Value);
-+
-+  if (valueType == MIRType::Double) {
-+    boxDouble(value.reg().typedReg().fpu(), dest);
-+    return;
-+  }
-+
-+  if (!value.constant()) {
-+    storeValue(ValueTypeFromMIRType(valueType), value.reg().typedReg().gpr(),
-+               dest);
-+    return;
-+  }
-+
-+  storeValue(value.value(), dest);
-+}
-+
-+// Explicit instantiations for the destination kinds used elsewhere.
-+template void MacroAssembler::storeUnboxedValue(const ConstantOrRegister& value,
-+                                                MIRType valueType,
-+                                                const Address& dest);
-+template void MacroAssembler::storeUnboxedValue(
-+    const ConstantOrRegister& value, MIRType valueType,
-+    const BaseObjectElementIndex& dest);
-+
-+// ===============================================================
-+// Misc stubs.
-+
-+// Stub: |msg| is ignored and nothing is emitted.
-+void MacroAssembler::comment(const char* msg) {}
-+
-+// Emits a speculation barrier, implemented here as a single isync.
-+void MacroAssembler::speculationBarrier() {
-+  // isync provides execution synchronization: discards prefetched
-+  // instructions and forces a refetch+reexecute past the barrier.
-+  // No instruction following isync may begin (architecturally) until
-+  // isync completes, blocking speculative bypass — exactly the
-+  // Spectre v1 guarantee needed after a C call returns a value that
-+  // may influence subsequent loads. Reachable from shared
-+  // CodeGenerator under JitOptions.spectreJitToCxxCalls.
-+  as_isync();
-+}
-+
-+// Spin-wait hint; this implementation just emits a nop.
-+void MacroAssembler::atomicPause() { nop(); }
-+
-+// Wasm entry point for pushing a fake exit frame; forwards unchanged to
-+// enterFakeExitFrame.
-+void MacroAssembler::enterFakeExitFrameForWasm(Register cxreg, Register scratch,
-+                                               ExitFrameType type) {
-+  enterFakeExitFrame(cxreg, scratch, type);
-+}
-+
-+// Compares |index| with |boundsCheckLimit| and branches to |label| when
-+// |cond| holds.
-+// NOTE(review): whether ma_cmp compares only the low 32 bits here is not
-+// visible in this file — confirm.
-+void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
-+                                       Register boundsCheckLimit,
-+                                       Label* label) {
-+  ma_cmp(index, boundsCheckLimit, cond);
-+  ma_b(cond, label);
-+}
-+
-+// Memory-operand overload: loads the 32-bit limit from |boundsCheckLimit|
-+// into a scratch register, compares |index| with it and branches to |label|
-+// when |cond| holds.
-+void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
-+                                       Address boundsCheckLimit, Label* label) {
-+  UseScratchRegisterScope scratchScope(asMasm());
-+  Register limit = scratchScope.Acquire();
-+
-+  load32(boundsCheckLimit, limit);
-+  ma_cmp(index, limit, cond);
-+  ma_b(cond, label);
-+}
-+
-+// 64-bit bounds check: compares the registers backing |index| and
-+// |boundsCheckLimit| and branches to |label| when |cond| holds.
-+void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
-+                                       Register64 boundsCheckLimit,
-+                                       Label* label) {
-+  ma_cmp(index.reg, boundsCheckLimit.reg, cond);
-+  ma_b(cond, label);
-+}
-+
-+// Memory-operand overload: loads the pointer-width limit from
-+// |boundsCheckLimit| into a scratch register, compares |index| with it and
-+// branches to |label| when |cond| holds.
-+void MacroAssembler::wasmBoundsCheck64(Condition cond, Register64 index,
-+                                       Address boundsCheckLimit, Label* label) {
-+  UseScratchRegisterScope scratchScope(asMasm());
-+  Register limit = scratchScope.Acquire();
-+
-+  loadPtr(boundsCheckLimit, limit);
-+  ma_cmp(index.reg, limit, cond);
-+  ma_b(cond, label);
-+}
-+
-+// Emits a load of a placeholder value (0) into |dest| using the load64
-+// stanza and returns the offset of its first instruction, which is the
-+// offset patchMove32 expects.
-+CodeOffset MacroAssembler::move32WithPatch(Register dest) {
-+  CodeOffset offset(currentOffset());
-+  emitLoad64Stanza(dest, 0);
-+  return offset;
-+}
-+
-+// Emits a read-modify-write of the 32-bit word at |address| that adds a
-+// patchable immediate, then branches to |label| if the new value is
-+// negative. The returned offset is the one immediately after the addi that
-+// carries the immediate.
-+CodeOffset MacroAssembler::sub32FromMemAndBranchIfNegativeWithPatch(
-+    Address address, Label* label) {
-+  UseScratchRegisterScope scratchScope(asMasm());
-+  Register counter = scratchScope.Acquire();
-+  MOZ_ASSERT(counter != address.base);
-+
-+  load32(address, counter);
-+
-+  // The addi starts out with a positive placeholder (128); patching later
-+  // replaces its immediate with the negative amount to subtract.
-+  as_addi(counter, counter, 128);
-+  CodeOffset afterAddi = CodeOffset(currentOffset());
-+
-+  store32(counter, address);
-+
-+  // Signed test of the updated value.
-+  as_cmpwi(counter, 0);
-+  ma_b(LessThan, label);
-+  return afterAddi;
-+}
-+
-+// convertUInt64ToDouble on this platform needs no GPR temp.
-+bool MacroAssembler::convertUInt64ToDoubleNeedsTemp() { return false; }
-+
-+// Calls the absolute address |imm| by forwarding to the ImmPtr overload.
-+void MacroAssembler::call(ImmWord imm) { call(ImmPtr((void*)imm.value)); }
-+
-+// Converts the unsigned 64-bit integer in |src| to a double in |dest|.
-+// |temp| must be the invalid register (see convertUInt64ToDoubleNeedsTemp).
-+void MacroAssembler::convertUInt64ToDouble(Register64 src, FloatRegister dest,
-+                                           Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  // POWER7+ has fcfidu (unsigned i64 → f64) as a single instruction; no
-+  // sign-split / branch / GPR scratch needed.
-+  as_mtvsrd(dest, src.reg);
-+  as_fcfidu(dest, dest);
-+}
-+
-+// Converts the signed 64-bit integer in |src| to a float32 in |dest|.
-+void MacroAssembler::convertInt64ToFloat32(Register64 src, FloatRegister dest) {
-+  // Move the raw bits into the FP register, then convert in place with the
-+  // single-precision form of fcfid.
-+  as_mtvsrd(dest, src.reg);
-+  as_fcfids(dest, dest);
-+}
-+
-+// Converts the unsigned 64-bit integer in |src| to a float32 in |dest|.
-+// |temp| must be the invalid register; no GPR temp is needed.
-+void MacroAssembler::convertUInt64ToFloat32(Register64 src, FloatRegister dest,
-+                                            Register temp) {
-+  MOZ_ASSERT(temp == Register::Invalid());
-+  // POWER7+ has fcfidus (unsigned i64 → f32) as a single instruction.
-+  as_mtvsrd(dest, src.reg);
-+  as_fcfidus(dest, dest);
-+}
-+
-+// Computes the 32-bit quotient dest = lhs / rhs (signed unless
-+// |isUnsigned|) and sign-extends the result into the full register. The
-+// signed INT32_MIN / -1 case, for which the hardware result is undefined,
-+// is special-cased to produce INT32_MIN. Division by zero is not handled
-+// here. |volatileLiveRegs| is not used by this implementation.
-+void MacroAssembler::flexibleQuotient32(
-+    Register lhs, Register rhs, Register dest, bool isUnsigned,
-+    const LiveRegisterSet& volatileLiveRegs) {
-+  // PPC64 divw(INT32_MIN, -1) is undefined; return INT32_MIN to match
-+  // ARM64/LoongArch64 hardware sdiv behavior.
-+  Label done;
-+  if (!isUnsigned) {
-+    // Compare as 32-bit quantities: the operands are int32 values, and a
-+    // full-width pointer compare would only match when the upper 32 bits of
-+    // the registers happen to hold the sign extension.
-+    Label notMinOverflow;
-+    branch32(Assembler::NotEqual, lhs, Imm32(INT32_MIN), &notMinOverflow);
-+    branch32(Assembler::NotEqual, rhs, Imm32(-1), &notMinOverflow);
-+    move32(Imm32(INT32_MIN), dest);
-+    jump(&done);
-+    bind(&notMinOverflow);
-+  }
-+  if (isUnsigned) {
-+    as_divwu(dest, lhs, rhs);
-+  } else {
-+    as_divw(dest, lhs, rhs);
-+  }
-+  as_extsw(dest, dest);
-+  bind(&done);
-+}
-+
-+// Out-of-line check for a float32 -> int32 wasm truncation; forwards to the
-+// shared int32 helper with MIRType::Float32.
-+void MacroAssembler::oolWasmTruncateCheckF32ToI32(
-+    FloatRegister input, Register output, TruncFlags flags,
-+    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
-+  outOfLineWasmTruncateToInt32Check(input, output, MIRType::Float32, flags,
-+                                    rejoin, trapSiteDesc);
-+}
-+
-+// Out-of-line check for a float32 -> int64 wasm truncation; forwards to the
-+// shared int64 helper with MIRType::Float32.
-+void MacroAssembler::oolWasmTruncateCheckF32ToI64(
-+    FloatRegister input, Register64 output, TruncFlags flags,
-+    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
-+  outOfLineWasmTruncateToInt64Check(input, output, MIRType::Float32, flags,
-+                                    rejoin, trapSiteDesc);
-+}
-+
-+// Out-of-line check for a double -> int32 wasm truncation; forwards to the
-+// shared int32 helper with MIRType::Double.
-+void MacroAssembler::oolWasmTruncateCheckF64ToI32(
-+    FloatRegister input, Register output, TruncFlags flags,
-+    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
-+  outOfLineWasmTruncateToInt32Check(input, output, MIRType::Double, flags,
-+                                    rejoin, trapSiteDesc);
-+}
-+
-+// Out-of-line check for a double -> int64 wasm truncation; forwards to the
-+// shared int64 helper with MIRType::Double.
-+void MacroAssembler::oolWasmTruncateCheckF64ToI64(
-+    FloatRegister input, Register64 output, TruncFlags flags,
-+    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
-+  outOfLineWasmTruncateToInt64Check(input, output, MIRType::Double, flags,
-+                                    rejoin, trapSiteDesc);
-+}
-+
-+// Out-of-line tail for a wasm float/double -> int32 truncation.
-+//
-+// Trapping form (TRUNC_SATURATING clear): NaN inputs trap with
-+// InvalidConversionToInteger, everything else traps with IntegerOverflow.
-+//
-+// Saturating form: writes the saturated value to |output| where needed and
-+// jumps back to |rejoin|, which must already be bound.
-+//   unsigned: input >= 0 -> UINT32_MAX, otherwise (negative or NaN) -> 0
-+//   signed:   NaN -> 0, input >= 0 -> |output| left as is,
-+//             otherwise -> INT32_MIN
-+void MacroAssemblerPPC64Compat::outOfLineWasmTruncateToInt32Check(
-+    FloatRegister input, Register output, MIRType fromType, TruncFlags flags,
-+    Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc) {
-+  bool isUnsigned = flags & TRUNC_UNSIGNED;
-+  bool isSaturating = flags & TRUNC_SATURATING;
-+  const bool fromDouble = fromType == MIRType::Double;
-+
-+  // Emits the double or float32 flavour of a compare-and-branch of |input|
-+  // against |rhs|.
-+  auto branchOnInput = [&](auto cond, FloatRegister rhs, Label* target) {
-+    if (fromDouble) {
-+      asMasm().branchDouble(cond, input, rhs, target);
-+    } else {
-+      asMasm().branchFloat(cond, input, rhs, target);
-+    }
-+  };
-+
-+  if (!isSaturating) {
-+    Label nanInput;
-+    branchOnInput(Assembler::DoubleUnordered, input, &nanInput);
-+
-+    asMasm().wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);
-+    asMasm().bind(&nanInput);
-+    asMasm().wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
-+    return;
-+  }
-+
-+  // Saturating form: compare against a zero of the matching width.
-+  ScratchDoubleScope zero(asMasm());
-+  if (fromDouble) {
-+    asMasm().loadConstantDouble(0.0, zero);
-+  } else {
-+    asMasm().loadConstantFloat32(0.0f, zero);
-+  }
-+
-+  if (isUnsigned) {
-+    // If input < 0 or NaN, output = 0; else output = UINT32_MAX.
-+    Label nonNegative;
-+    branchOnInput(Assembler::DoubleGreaterThanOrEqual, zero, &nonNegative);
-+    asMasm().move32(Imm32(0), output);
-+    asMasm().jump(rejoin);
-+    asMasm().bind(&nonNegative);
-+    asMasm().move32(Imm32(UINT32_MAX), output);
-+  } else {
-+    // Signed: NaN -> 0, negative overflow -> INT32_MIN,
-+    // positive overflow already saturated to INT32_MAX.
-+    Label ordered;
-+    branchOnInput(Assembler::DoubleOrdered, input, &ordered);
-+    asMasm().move32(Imm32(0), output);
-+    asMasm().jump(rejoin);
-+
-+    asMasm().bind(&ordered);
-+    branchOnInput(Assembler::DoubleGreaterThanOrEqual, zero, rejoin);
-+    asMasm().move32(Imm32(INT32_MIN), output);
-+  }
-+
-+  MOZ_ASSERT(rejoin->bound());
-+  asMasm().jump(rejoin);
-+}
-+
-+void MacroAssemblerPPC64Compat::outOfLineWasmTruncateToInt64Check(
-+    FloatRegister input, Register64 output_, MIRType fromType, TruncFlags flags,
-+    Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc) {
-+  bool isUnsigned = flags & TRUNC_UNSIGNED;
-+  bool isSaturating = flags & TRUNC_SATURATING;
-+
-+  if (isSaturating) {
-+    ScratchDoubleScope fpscratch(asMasm());
-+    Register output = output_.reg;
-+
-+    if (fromType == MIRType::Double) {
-+      asMasm().loadConstantDouble(0.0, fpscratch);
-+    } else {
-+      asMasm().loadConstantFloat32(0.0f, fpscratch);
-+    }
-+
-+    if (isUnsigned) {
-+      Label notNegOrNaN;
-+      if (fromType == MIRType::Double) {
-+        asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
-+                              fpscratch, &notNegOrNaN);
-+      } else {
-+        asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
-+                             fpscratch, &notNegOrNaN);
-+      }
-+      asMasm().movePtr(ImmWord(0), output);
-+      asMasm().jump(rejoin);
-+      asMasm().bind(&notNegOrNaN);
-+      asMasm().movePtr(ImmWord(UINT64_MAX), output);
-+    } else {
-+      Label notNaN;
-+      if (fromType == MIRType::Double) {
-+        asMasm().branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
-+      } else {
-+        asMasm().branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
-+      }
-+      asMasm().movePtr(ImmWord(0), output);
-+      asMasm().jump(rejoin);
-+
-+      asMasm().bind(&notNaN);
-+      if (fromType == MIRType::Double) {
-+        asMasm().branchDouble(Assembler::DoubleGreaterThanOrEqual, input,
-+                              fpscratch, rejoin);
-+      } else {
-+        asMasm().branchFloat(Assembler::DoubleGreaterThanOrEqual, input,
-+                             fpscratch, rejoin);
-+      }
-+      asMasm().movePtr(ImmWord(INT64_MIN), output);
-+    }
-+
-+    MOZ_ASSERT(rejoin->bound());
-+    asMasm().jump(rejoin);
-+    return;
-+  }
-+
-+  Label inputIsNaN;
-+  if (fromType == MIRType::Double) {
-+    asMasm().branchDouble(Assembler::DoubleUnordered, input, input,
-+                          &inputIsNaN);
-+  } else {
-+    asMasm().branchFloat(Assembler::DoubleUnordered, input, input, &inputIsNaN);
-+  }
-+
-+  asMasm().wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);
-+  asMasm().bind(&inputIsNaN);
-+  asMasm().wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
-+}
-+
-+void MacroAssembler::PopStackPtr() {
-+  loadPtr(Address(StackPointer, 0), StackPointer);
-+  adjustFrame(-int32_t(sizeof(intptr_t)));
-+}
-+
-+void MacroAssembler::patchSub32FromMemAndBranchIfNegative(CodeOffset offset,
-+                                                          Imm32 imm) {
-+  int32_t val = imm.value;
-+  MOZ_RELEASE_ASSERT(val >= 1 && val <= 127);
-+  // Patch the addi instruction that's right before patchPoint.
-+  // addi is 1 instruction before the CodeOffset (which is after the addi).
-+  Instruction* inst =
-+      (Instruction*)m_buffer.getInst(BufferOffset(offset.offset() - 4));
-+  // Rewrite the immediate field to -val.
-+  // PPC addi: opcode(6) | RT(5) | RA(5) | SI(16)
-+  uint32_t instWord = inst->encode();
-+  uint32_t base = instWord & 0xffff0000;
-+  inst->setData(base | (uint16_t)(-val & 0xffff));
-+}
-+
-+void MacroAssembler::wasmTruncateDoubleToInt32(FloatRegister input,
-+                                               Register output,
-+                                               bool isSaturating,
-+                                               Label* oolEntry) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  // Clear VXCVI (bit 23) before the conversion so we can detect overflow.
-+  as_mtfsb0(23);
-+  as_fctiwz(fpscratch, input);
-+  as_mfvsrd(output, fpscratch);
-+  as_extsw(output, output);
-+  // Move FPSCR field 5 (which contains VXCVI) to CR0.
-+  // If the conversion was invalid (NaN or out-of-range), VXCVI=1 → SO set.
-+  as_mcrfs(cr0, 5);
-+  ma_b(SOBit, oolEntry);
-+}
-+
-+void MacroAssembler::wasmTruncateDoubleToUInt32(FloatRegister input,
-+                                                Register output,
-+                                                bool isSaturating,
-+                                                Label* oolEntry) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  // Always check for NaN — the ool handler clamps for saturating mode.
-+  as_fcmpu(input, input);
-+  ma_b(DoubleUnordered, oolEntry);
-+  as_fctidz(fpscratch, input);
-+  as_mfvsrd(output, fpscratch);
-+  x_srdi(scratch, output, 32);
-+  as_extsw(output, output);
-+  as_cmpdi(scratch, 0);
-+  ma_b(NotEqual, oolEntry);
-+}
-+
-+void MacroAssembler::wasmTruncateFloat32ToInt32(FloatRegister input,
-+                                                Register output,
-+                                                bool isSaturating,
-+                                                Label* oolEntry) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  as_mtfsb0(23);
-+  as_fctiwz(fpscratch, input);
-+  as_mfvsrd(output, fpscratch);
-+  as_extsw(output, output);
-+  as_mcrfs(cr0, 5);
-+  ma_b(SOBit, oolEntry);
-+}
-+
-+void MacroAssembler::wasmTruncateFloat32ToUInt32(FloatRegister input,
-+                                                 Register output,
-+                                                 bool isSaturating,
-+                                                 Label* oolEntry) {
-+  ScratchDoubleScope fpscratch(asMasm());
-+  UseScratchRegisterScope temps(asMasm());
-+  Register scratch = temps.Acquire();
-+  as_fcmpu(input, input);
-+  ma_b(DoubleUnordered, oolEntry);
-+  as_fctidz(fpscratch, input);
-+  as_mfvsrd(output, fpscratch);
-+  x_srdi(scratch, output, 32);
-+  as_extsw(output, output);
-+  as_cmpdi(scratch, 0);
-+  ma_b(NotEqual, oolEntry);
-+}
-+
-+void MacroAssembler::wasmTruncateDoubleToInt64(
-+    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
-+    Label* oolRejoin, FloatRegister tempDouble) {
-+  MOZ_ASSERT(tempDouble.isInvalid());
-+  ScratchDoubleScope fpscratch(asMasm());
-+  as_mtfsb0(23);
-+  as_fctidz(fpscratch, input);
-+  as_mfvsrd(output.reg, fpscratch);
-+  as_mcrfs(cr0, 5);
-+  ma_b(SOBit, oolEntry);
-+  if (isSaturating) {
-+    bind(oolRejoin);
-+  }
-+}
-+
-+void MacroAssembler::wasmTruncateFloat32ToInt64(
-+    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
-+    Label* oolRejoin, FloatRegister tempFloat) {
-+  MOZ_ASSERT(tempFloat.isInvalid());
-+  ScratchDoubleScope fpscratch(asMasm());
-+  as_mtfsb0(23);
-+  as_fctidz(fpscratch, input);
-+  as_mfvsrd(output.reg, fpscratch);
-+  as_mcrfs(cr0, 5);
-+  ma_b(SOBit, oolEntry);
-+  if (isSaturating) {
-+    bind(oolRejoin);
-+  }
-+}
-+
-+void MacroAssembler::wasmTruncateDoubleToUInt64(
-+    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
-+    Label* oolRejoin, FloatRegister tempDouble) {
-+  MOZ_ASSERT(tempDouble.isInvalid());
-+  ScratchDoubleScope fpscratch(asMasm());
-+  as_mtfsb0(23);
-+  as_fctiduz(fpscratch, input);
-+  as_mfvsrd(output.reg, fpscratch);
-+  as_mcrfs(cr0, 5);
-+  ma_b(SOBit, oolEntry);
-+  if (isSaturating) {
-+    bind(oolRejoin);
-+  }
-+}
-+
-+void MacroAssembler::wasmTruncateFloat32ToUInt64(
-+    FloatRegister input, Register64 output, bool isSaturating, Label* oolEntry,
-+    Label* oolRejoin, FloatRegister tempFloat) {
-+  MOZ_ASSERT(tempFloat.isInvalid());
-+  ScratchDoubleScope fpscratch(asMasm());
-+  as_mtfsb0(23);
-+  as_fctiduz(fpscratch, input);
-+  as_mfvsrd(output.reg, fpscratch);
-+  as_mcrfs(cr0, 5);
-+  ma_b(SOBit, oolEntry);
-+  if (isSaturating) {
-+    bind(oolRejoin);
-+  }
-+}
-+
-+void MacroAssemblerPPC64Compat::profilerEnterFrame(Register framePtr,
-+                                                   Register scratch) {
-+  asMasm().loadJSContext(scratch);
-+  loadPtr(Address(scratch, offsetof(JSContext, profilingActivation_)), scratch);
-+  storePtr(framePtr,
-+           Address(scratch, JitActivation::offsetOfLastProfilingFrame()));
-+  storePtr(ImmPtr(nullptr),
-+           Address(scratch, JitActivation::offsetOfLastProfilingCallSite()));
-+}
-+
-+void MacroAssemblerPPC64Compat::profilerExitFrame() {
-+  jump(asMasm().runtime()->jitRuntime()->getProfilerExitFrameTail());
-+}
-+
-+void MacroAssemblerPPC64Compat::ma_mod_mask(Register src, Register dest,
-+                                            Register hold, Register remain,
-+                                            int32_t shift, Label* negZero) {
-+  // Compute x % ((1<<shift) - 1) by digit-summing in base b = 1<<shift.
-+  // Since b % (b-1) == 1, x % (b-1) == sum of base-b digits of x, mod (b-1).
-+  int32_t mask = (1 << shift) - 1;
-+  Label head, negative, sumSigned, done;
-+
-+  as_or_(remain, src, src);  // move src -> remain
-+  xs_li(dest, 0);
-+
-+  // Check sign (32-bit signed comparison)
-+  as_cmpwi(remain, 0);
-+  ma_b(Assembler::LessThan, &negative);
-+  xs_li(hold, 1);
-+  jump(&head);
-+
-+  bind(&negative);
-+  xs_li(hold, -1);
-+  as_neg(remain, remain);
-+  as_rldicl(remain, remain, 0, 32);
-+
-+  bind(&head);
-+  {
-+    UseScratchRegisterScope temps(asMasm());
-+    Register scratch = temps.Acquire();
-+
-+    // Extract bottom 'shift' bits: scratch = remain & mask
-+    move32(Imm32(mask), scratch);
-+    as_and_(scratch, remain, scratch);
-+
-+    // Add to accumulator
-+    as_add(dest, dest, scratch);
-+
-+    // Trial subtraction: scratch = dest - mask
-+    move32(Imm32(mask), scratch);
-+    as_subf(scratch, scratch, dest);  // scratch = dest - scratch
-+
-+    // If (dest - mask) > 0, keep the subtracted value
-+    as_cmpwi(scratch, 0);
-+    ma_b(Assembler::LessThan, &sumSigned);
-+    as_or_(dest, scratch, scratch);  // dest = scratch
-+    bind(&sumSigned);
-+
-+    // Shift out the bits we just processed
-+    x_srwi(remain, remain, shift);
-+
-+    // Continue if remain != 0
-+    as_cmpwi(remain, 0);
-+    ma_b(Assembler::NotEqual, &head);
-+  }
-+
-+  // If input was negative, negate result
-+  as_cmpwi(hold, 0);
-+  ma_b(Assembler::GreaterThanOrEqual, &done);
-+
-+  if (negZero != nullptr) {
-+    as_cmpwi(dest, 0);
-+    ma_b(Assembler::Equal, negZero);
-+  }
-+
-+  as_neg(dest, dest);
-+  as_extsw(dest, dest);
-+
-+  bind(&done);
-+}
-+
-+// ========================================================================
-+// Atomic operations.
-+
-+template <typename T>
-+static void CompareExchange(MacroAssembler& masm,
-+                            const wasm::MemoryAccessDesc* access,
-+                            Scalar::Type type, Synchronization sync,
-+                            const T& mem, Register oldval, Register newval,
-+                            Register valueTemp, Register offsetTemp,
-+                            Register maskTemp, Register output) {
-+  UseScratchRegisterScope temps(masm);
-+  bool signExtend = Scalar::isSignedIntType(type);
-+  unsigned nbytes = Scalar::byteSize(type);
-+
-+  switch (nbytes) {
-+    case 1:
-+    case 2:
-+      break;
-+    case 4:
-+      MOZ_ASSERT(valueTemp == InvalidReg);
-+      MOZ_ASSERT(offsetTemp == InvalidReg);
-+      MOZ_ASSERT(maskTemp == InvalidReg);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  Label again, end;
-+
-+  Register scratch = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, scratch);
-+
-+  if (nbytes == 4) {
-+    masm.memoryBarrierBefore(sync);
-+    masm.bind(&again);
-+
-+    if (access) {
-+      masm.flushBuffer();  // see comment in wasmLoadImpl
-+      masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                  FaultingCodeOffset(masm.currentOffset()));
-+    }
-+
-+    masm.as_lwarx(output, r0, scratch);
-+    // ma_cmp(..., is32bit=true) emits cmpw, which compares only bits
-+    // 32:63 (low 32) of both operands per ISA v3.0B. The upper
-+    // 32 bits of oldval are ignored, so no canonicalising extsw needed.
-+    masm.ma_cmp(output, oldval, Assembler::NotEqual, /* is32bit */ true);
-+    masm.ma_b(Assembler::NotEqual, &end);
-+    masm.as_stwcx(newval, r0, scratch);
-+    masm.ma_b(Assembler::NotEqual, &again);
-+
-+    masm.memoryBarrierAfter(sync);
-+    masm.bind(&end);
-+    // lwarx zero-extends; sign-extend for 32-bit canonical form.
-+    masm.as_extsw(output, output);
-+
-+    return;
-+  }
-+
-+  // Sub-word (1 or 2 byte) compare-exchange via native lbarx/lharx +
-+  // stbcx./sthcx. POWER7+ (well below our POWER8 baseline). Replaces the prior
-+  // round-down-to-word
-+  // + mask + RMW dance. lXarx zero-extends the loaded byte/half; stXcx. stores
-+  // only the low 8/16 bits of RS, so no pre-masking is needed on the store
-+  // side. offsetTemp / maskTemp are still allocated by the lowering but unused
-+  // here.
-+  (void)offsetTemp;
-+  (void)maskTemp;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&again);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  switch (nbytes) {
-+    case 1:
-+      masm.as_lbarx(output, r0, scratch);
-+      if (signExtend) {
-+        masm.as_extsb(valueTemp, oldval);
-+        masm.as_extsb(output, output);
-+      } else {
-+        masm.as_andi_rc(valueTemp, oldval, 0xff);
-+      }
-+      break;
-+    case 2:
-+      masm.as_lharx(output, r0, scratch);
-+      if (signExtend) {
-+        masm.as_extsh(valueTemp, oldval);
-+        masm.as_extsh(output, output);
-+      } else {
-+        masm.as_rlwinm(valueTemp, oldval, 0, 16, 31);
-+      }
-+      break;
-+  }
-+
-+  masm.ma_cmp(output, valueTemp, Assembler::NotEqual, /* is32bit */ true);
-+  masm.ma_b(Assembler::NotEqual, &end);
-+
-+  if (nbytes == 1) {
-+    masm.as_stbcx(newval, r0, scratch);
-+  } else {
-+    masm.as_sthcx(newval, r0, scratch);
-+  }
-+  masm.ma_b(Assembler::NotEqual, &again);
-+
-+  masm.memoryBarrierAfter(sync);
-+
-+  masm.bind(&end);
-+}
-+
-+template <typename T>
-+static void CompareExchange64(MacroAssembler& masm,
-+                              const wasm::MemoryAccessDesc* access,
-+                              Synchronization sync, const T& mem,
-+                              Register64 expect, Register64 replace,
-+                              Register64 output) {
-+  MOZ_ASSERT(expect != output && replace != output);
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, scratch);
-+
-+  Label tryAgain;
-+  Label exit;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&tryAgain);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  masm.as_ldarx(output.reg, r0, scratch);
-+
-+  masm.ma_cmp(output.reg, expect.reg, Assembler::NotEqual);
-+  masm.ma_b(Assembler::NotEqual, &exit);
-+  masm.as_stdcx(replace.reg, r0, scratch);
-+  masm.ma_b(Assembler::NotEqual, &tryAgain);
-+
-+  masm.memoryBarrierAfter(sync);
-+
-+  masm.bind(&exit);
-+}
-+
-+template <typename T>
-+static void AtomicExchange(MacroAssembler& masm,
-+                           const wasm::MemoryAccessDesc* access,
-+                           Scalar::Type type, Synchronization sync,
-+                           const T& mem, Register value, Register valueTemp,
-+                           Register offsetTemp, Register maskTemp,
-+                           Register output) {
-+  UseScratchRegisterScope temps(masm);
-+  bool signExtend = Scalar::isSignedIntType(type);
-+  unsigned nbytes = Scalar::byteSize(type);
-+
-+  switch (nbytes) {
-+    case 1:
-+    case 2:
-+      break;
-+    case 4:
-+      MOZ_ASSERT(valueTemp == InvalidReg);
-+      MOZ_ASSERT(offsetTemp == InvalidReg);
-+      MOZ_ASSERT(maskTemp == InvalidReg);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  Label again;
-+
-+  Register memTemp = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, memTemp);
-+
-+  if (nbytes == 4) {
-+    masm.memoryBarrierBefore(sync);
-+    masm.bind(&again);
-+
-+    if (access) {
-+      masm.flushBuffer();  // see comment in wasmLoadImpl
-+      masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                  FaultingCodeOffset(masm.currentOffset()));
-+    }
-+
-+    masm.as_lwarx(output, r0, memTemp);
-+    masm.as_stwcx(value, r0, memTemp);
-+    masm.ma_b(Assembler::NotEqual, &again);
-+
-+    masm.memoryBarrierAfter(sync);
-+    // lwarx zero-extends; sign-extend for 32-bit canonical form.
-+    masm.as_extsw(output, output);
-+
-+    return;
-+  }
-+
-+  // Sub-word exchange via native lbarx/lharx + stbcx./sthcx. (POWER7+).
-+  // valueTemp / offsetTemp / maskTemp are still allocated by the lowering but
-+  // unused here.
-+  (void)valueTemp;
-+  (void)offsetTemp;
-+  (void)maskTemp;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&again);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  if (nbytes == 1) {
-+    masm.as_lbarx(output, r0, memTemp);
-+    masm.as_stbcx(value, r0, memTemp);
-+  } else {
-+    masm.as_lharx(output, r0, memTemp);
-+    masm.as_sthcx(value, r0, memTemp);
-+  }
-+  masm.ma_b(Assembler::NotEqual, &again);
-+
-+  if (signExtend) {
-+    if (nbytes == 1) {
-+      masm.as_extsb(output, output);
-+    } else {
-+      masm.as_extsh(output, output);
-+    }
-+  }
-+  // Unsigned: lbarx/lharx already zero-extend; output is canonical.
-+
-+  masm.memoryBarrierAfter(sync);
-+}
-+
-+template <typename T>
-+static void AtomicExchange64(MacroAssembler& masm,
-+                             const wasm::MemoryAccessDesc* access,
-+                             Synchronization sync, const T& mem,
-+                             Register64 value, Register64 output) {
-+  MOZ_ASSERT(value != output);
-+  UseScratchRegisterScope temps(masm);
-+
-+  Register scratch = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, scratch);
-+
-+  Label tryAgain;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&tryAgain);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  masm.as_ldarx(output.reg, r0, scratch);
-+
-+  masm.as_stdcx(value.reg, r0, scratch);
-+  masm.ma_b(Assembler::NotEqual, &tryAgain);
-+
-+  masm.memoryBarrierAfter(sync);
-+}
-+
-+template <typename T>
-+static void AtomicFetchOp(MacroAssembler& masm,
-+                          const wasm::MemoryAccessDesc* access,
-+                          Scalar::Type type, Synchronization sync, AtomicOp op,
-+                          const T& mem, Register value, Register valueTemp,
-+                          Register offsetTemp, Register maskTemp,
-+                          Register output) {
-+  UseScratchRegisterScope temps(masm);
-+  bool signExtend = Scalar::isSignedIntType(type);
-+  unsigned nbytes = Scalar::byteSize(type);
-+
-+  switch (nbytes) {
-+    case 1:
-+    case 2:
-+      break;
-+    case 4:
-+      MOZ_ASSERT(valueTemp == InvalidReg);
-+      MOZ_ASSERT(offsetTemp == InvalidReg);
-+      MOZ_ASSERT(maskTemp == InvalidReg);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  Label again;
-+
-+  Register memTemp = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, memTemp);
-+
-+  Register scratch = temps.Acquire();
-+
-+  if (nbytes == 4) {
-+    masm.memoryBarrierBefore(sync);
-+    masm.bind(&again);
-+
-+    if (access) {
-+      masm.flushBuffer();  // see comment in wasmLoadImpl
-+      masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                  FaultingCodeOffset(masm.currentOffset()));
-+    }
-+
-+    masm.as_lwarx(output, r0, memTemp);
-+
-+    switch (op) {
-+      case AtomicOp::Add:
-+        masm.as_add(scratch, output, value);
-+        break;
-+      case AtomicOp::Sub:
-+        masm.as_subf(scratch, value, output);
-+        break;
-+      case AtomicOp::And:
-+        masm.as_and_(scratch, output, value);
-+        break;
-+      case AtomicOp::Or:
-+        masm.as_or_(scratch, output, value);
-+        break;
-+      case AtomicOp::Xor:
-+        masm.as_xor_(scratch, output, value);
-+        break;
-+      default:
-+        MOZ_CRASH();
-+    }
-+
-+    masm.as_stwcx(scratch, r0, memTemp);
-+    masm.ma_b(Assembler::NotEqual, &again);
-+
-+    masm.memoryBarrierAfter(sync);
-+    // lwarx zero-extends; sign-extend for 32-bit canonical form.
-+    masm.as_extsw(output, output);
-+
-+    return;
-+  }
-+
-+  // Sub-word fetch-and-op via native lbarx/lharx + stbcx./sthcx. (POWER7+).
-+  // `output` holds the pre-op loaded value (returned to caller); `valueTemp`
-+  // is the post-op value we condition-store. stXcx. only stores low 8/16 bits
-+  // of RS, so no pre-mask of valueTemp is needed.
-+  // offsetTemp / maskTemp are still allocated by the lowering but unused; the
-+  // local `scratch` is only used in the 4-byte branch above.
-+  (void)offsetTemp;
-+  (void)maskTemp;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&again);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  if (nbytes == 1) {
-+    masm.as_lbarx(output, r0, memTemp);
-+  } else {
-+    masm.as_lharx(output, r0, memTemp);
-+  }
-+
-+  switch (op) {
-+    case AtomicOp::Add:
-+      masm.as_add(valueTemp, output, value);
-+      break;
-+    case AtomicOp::Sub:
-+      masm.as_subf(valueTemp, value, output);
-+      break;
-+    case AtomicOp::And:
-+      masm.as_and_(valueTemp, output, value);
-+      break;
-+    case AtomicOp::Or:
-+      masm.as_or_(valueTemp, output, value);
-+      break;
-+    case AtomicOp::Xor:
-+      masm.as_xor_(valueTemp, output, value);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  if (nbytes == 1) {
-+    masm.as_stbcx(valueTemp, r0, memTemp);
-+  } else {
-+    masm.as_sthcx(valueTemp, r0, memTemp);
-+  }
-+  masm.ma_b(Assembler::NotEqual, &again);
-+
-+  if (signExtend) {
-+    if (nbytes == 1) {
-+      masm.as_extsb(output, output);
-+    } else {
-+      masm.as_extsh(output, output);
-+    }
-+  }
-+  // Unsigned: lbarx/lharx already zero-extend; output is canonical.
-+
-+  masm.memoryBarrierAfter(sync);
-+}
-+
-+template <typename T>
-+static void AtomicFetchOp64(MacroAssembler& masm,
-+                            const wasm::MemoryAccessDesc* access,
-+                            Synchronization sync, AtomicOp op, Register64 value,
-+                            const T& mem, Register64 temp, Register64 output) {
-+  MOZ_ASSERT(value != output);
-+  MOZ_ASSERT(value != temp);
-+  UseScratchRegisterScope temps(masm);
-+  Register scratch = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, scratch);
-+
-+  Label tryAgain;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&tryAgain);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  masm.as_ldarx(output.reg, r0, scratch);
-+
-+  switch (op) {
-+    case AtomicOp::Add:
-+      masm.as_add(temp.reg, output.reg, value.reg);
-+      break;
-+    case AtomicOp::Sub:
-+      masm.as_subf(temp.reg, value.reg, output.reg);
-+      break;
-+    case AtomicOp::And:
-+      masm.as_and_(temp.reg, output.reg, value.reg);
-+      break;
-+    case AtomicOp::Or:
-+      masm.as_or_(temp.reg, output.reg, value.reg);
-+      break;
-+    case AtomicOp::Xor:
-+      masm.as_xor_(temp.reg, output.reg, value.reg);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  masm.as_stdcx(temp.reg, r0, scratch);
-+  masm.ma_b(Assembler::NotEqual, &tryAgain);
-+
-+  masm.memoryBarrierAfter(sync);
-+}
-+
-+template <typename T>
-+static void AtomicEffectOp(MacroAssembler& masm,
-+                           const wasm::MemoryAccessDesc* access,
-+                           Scalar::Type type, Synchronization sync, AtomicOp op,
-+                           const T& mem, Register value, Register valueTemp,
-+                           Register offsetTemp, Register maskTemp) {
-+  UseScratchRegisterScope temps(masm);
-+  unsigned nbytes = Scalar::byteSize(type);
-+
-+  switch (nbytes) {
-+    case 1:
-+    case 2:
-+      break;
-+    case 4:
-+      MOZ_ASSERT(valueTemp == InvalidReg);
-+      MOZ_ASSERT(offsetTemp == InvalidReg);
-+      MOZ_ASSERT(maskTemp == InvalidReg);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  Label again;
-+
-+  Register scratch = temps.Acquire();
-+  masm.computeEffectiveAddress(mem, scratch);
-+
-+  Register scratch2 = temps.Acquire();
-+
-+  if (nbytes == 4) {
-+    masm.memoryBarrierBefore(sync);
-+    masm.bind(&again);
-+
-+    if (access) {
-+      masm.flushBuffer();  // see comment in wasmLoadImpl
-+      masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                  FaultingCodeOffset(masm.currentOffset()));
-+    }
-+
-+    masm.as_lwarx(scratch2, r0, scratch);
-+
-+    switch (op) {
-+      case AtomicOp::Add:
-+        masm.as_add(scratch2, scratch2, value);
-+        break;
-+      case AtomicOp::Sub:
-+        masm.as_subf(scratch2, value, scratch2);
-+        break;
-+      case AtomicOp::And:
-+        masm.as_and_(scratch2, scratch2, value);
-+        break;
-+      case AtomicOp::Or:
-+        masm.as_or_(scratch2, scratch2, value);
-+        break;
-+      case AtomicOp::Xor:
-+        masm.as_xor_(scratch2, scratch2, value);
-+        break;
-+      default:
-+        MOZ_CRASH();
-+    }
-+
-+    masm.as_stwcx(scratch2, r0, scratch);
-+    masm.ma_b(Assembler::NotEqual, &again);
-+
-+    masm.memoryBarrierAfter(sync);
-+
-+    return;
-+  }
-+
-+  // Sub-word effect-only op via native lbarx/lharx + stbcx./sthcx. (POWER7+).
-+  // No output to return; scratch2 holds the load+op+store value.
-+  // valueTemp / offsetTemp / maskTemp are still allocated by the lowering but
-+  // unused here.
-+  (void)valueTemp;
-+  (void)offsetTemp;
-+  (void)maskTemp;
-+
-+  masm.memoryBarrierBefore(sync);
-+
-+  masm.bind(&again);
-+
-+  if (access) {
-+    masm.flushBuffer();  // see comment in wasmLoadImpl
-+    masm.append(*access, wasm::TrapMachineInsn::Atomic,
-+                FaultingCodeOffset(masm.currentOffset()));
-+  }
-+
-+  if (nbytes == 1) {
-+    masm.as_lbarx(scratch2, r0, scratch);
-+  } else {
-+    masm.as_lharx(scratch2, r0, scratch);
-+  }
-+
-+  switch (op) {
-+    case AtomicOp::Add:
-+      masm.as_add(scratch2, scratch2, value);
-+      break;
-+    case AtomicOp::Sub:
-+      masm.as_subf(scratch2, value, scratch2);
-+      break;
-+    case AtomicOp::And:
-+      masm.as_and_(scratch2, scratch2, value);
-+      break;
-+    case AtomicOp::Or:
-+      masm.as_or_(scratch2, scratch2, value);
-+      break;
-+    case AtomicOp::Xor:
-+      masm.as_xor_(scratch2, scratch2, value);
-+      break;
-+    default:
-+      MOZ_CRASH();
-+  }
-+
-+  if (nbytes == 1) {
-+    masm.as_stbcx(scratch2, r0, scratch);
-+  } else {
-+    masm.as_sthcx(scratch2, r0, scratch);
-+  }
-+  masm.ma_b(Assembler::NotEqual, &again);
-+
-+  masm.memoryBarrierAfter(sync);
-+}
-+
-+// Public MacroAssembler methods.
-+
-+void MacroAssembler::compareExchange(Scalar::Type type, Synchronization sync,
-+                                     const Address& mem, Register oldval,
-+                                     Register newval, Register valueTemp,
-+                                     Register offsetTemp, Register maskTemp,
-+                                     Register output) {
-+  CompareExchange(*this, nullptr, type, sync, mem, oldval, newval, valueTemp,
-+                  offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::compareExchange(Scalar::Type type, Synchronization sync,
-+                                     const BaseIndex& mem, Register oldval,
-+                                     Register newval, Register valueTemp,
-+                                     Register offsetTemp, Register maskTemp,
-+                                     Register output) {
-+  CompareExchange(*this, nullptr, type, sync, mem, oldval, newval, valueTemp,
-+                  offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::compareExchange64(Synchronization sync, const Address& mem,
-+                                       Register64 expect, Register64 replace,
-+                                       Register64 output) {
-+  CompareExchange64(*this, nullptr, sync, mem, expect, replace, output);
-+}
-+
-+void MacroAssembler::compareExchange64(Synchronization sync,
-+                                       const BaseIndex& mem, Register64 expect,
-+                                       Register64 replace, Register64 output) {
-+  CompareExchange64(*this, nullptr, sync, mem, expect, replace, output);
-+}
-+
-+void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
-+                                         const Address& mem, Register oldval,
-+                                         Register newval, Register valueTemp,
-+                                         Register offsetTemp, Register maskTemp,
-+                                         Register output) {
-+  CompareExchange(*this, &access, access.type(), access.sync(), mem, oldval,
-+                  newval, valueTemp, offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
-+                                         const BaseIndex& mem, Register oldval,
-+                                         Register newval, Register valueTemp,
-+                                         Register offsetTemp, Register maskTemp,
-+                                         Register output) {
-+  CompareExchange(*this, &access, access.type(), access.sync(), mem, oldval,
-+                  newval, valueTemp, offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
-+                                           const Address& mem,
-+                                           Register64 expect,
-+                                           Register64 replace,
-+                                           Register64 output) {
-+  CompareExchange64(*this, &access, access.sync(), mem, expect, replace,
-+                    output);
-+}
-+
-+void MacroAssembler::wasmCompareExchange64(const wasm::MemoryAccessDesc& access,
-+                                           const BaseIndex& mem,
-+                                           Register64 expect,
-+                                           Register64 replace,
-+                                           Register64 output) {
-+  CompareExchange64(*this, &access, access.sync(), mem, expect, replace,
-+                    output);
-+}
-+
-+void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization sync,
-+                                    const Address& mem, Register value,
-+                                    Register valueTemp, Register offsetTemp,
-+                                    Register maskTemp, Register output) {
-+  AtomicExchange(*this, nullptr, type, sync, mem, value, valueTemp, offsetTemp,
-+                 maskTemp, output);
-+}
-+
-+void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization sync,
-+                                    const BaseIndex& mem, Register value,
-+                                    Register valueTemp, Register offsetTemp,
-+                                    Register maskTemp, Register output) {
-+  AtomicExchange(*this, nullptr, type, sync, mem, value, valueTemp, offsetTemp,
-+                 maskTemp, output);
-+}
-+
-+void MacroAssembler::atomicExchange64(Synchronization sync, const Address& mem,
-+                                      Register64 value, Register64 output) {
-+  AtomicExchange64(*this, nullptr, sync, mem, value, output);
-+}
-+
-+void MacroAssembler::atomicExchange64(Synchronization sync,
-+                                      const BaseIndex& mem, Register64 value,
-+                                      Register64 output) {
-+  AtomicExchange64(*this, nullptr, sync, mem, value, output);
-+}
-+
-+void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
-+                                        const Address& mem, Register value,
-+                                        Register valueTemp, Register offsetTemp,
-+                                        Register maskTemp, Register output) {
-+  AtomicExchange(*this, &access, access.type(), access.sync(), mem, value,
-+                 valueTemp, offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
-+                                        const BaseIndex& mem, Register value,
-+                                        Register valueTemp, Register offsetTemp,
-+                                        Register maskTemp, Register output) {
-+  AtomicExchange(*this, &access, access.type(), access.sync(), mem, value,
-+                 valueTemp, offsetTemp, maskTemp, output);
-+}
-+
-+template <typename T>
-+static void WasmAtomicExchange64(MacroAssembler& masm,
-+                                 const wasm::MemoryAccessDesc& access,
-+                                 const T& mem, Register64 value,
-+                                 Register64 output) {
-+  AtomicExchange64(masm, &access, access.sync(), mem, value, output);
-+}
-+
-+void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
-+                                          const Address& mem, Register64 src,
-+                                          Register64 output) {
-+  WasmAtomicExchange64(*this, access, mem, src, output);
-+}
-+
-+void MacroAssembler::wasmAtomicExchange64(const wasm::MemoryAccessDesc& access,
-+                                          const BaseIndex& mem, Register64 src,
-+                                          Register64 output) {
-+  WasmAtomicExchange64(*this, access, mem, src, output);
-+}
-+
-+void MacroAssembler::atomicFetchOp(Scalar::Type type, Synchronization sync,
-+                                   AtomicOp op, Register value,
-+                                   const Address& mem, Register valueTemp,
-+                                   Register offsetTemp, Register maskTemp,
-+                                   Register output) {
-+  AtomicFetchOp(*this, nullptr, type, sync, op, mem, value, valueTemp,
-+                offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::atomicFetchOp(Scalar::Type type, Synchronization sync,
-+                                   AtomicOp op, Register value,
-+                                   const BaseIndex& mem, Register valueTemp,
-+                                   Register offsetTemp, Register maskTemp,
-+                                   Register output) {
-+  AtomicFetchOp(*this, nullptr, type, sync, op, mem, value, valueTemp,
-+                offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::atomicFetchOp64(Synchronization sync, AtomicOp op,
-+                                     Register64 value, const Address& mem,
-+                                     Register64 temp, Register64 output) {
-+  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, output);
-+}
-+
-+void MacroAssembler::atomicFetchOp64(Synchronization sync, AtomicOp op,
-+                                     Register64 value, const BaseIndex& mem,
-+                                     Register64 temp, Register64 output) {
-+  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, output);
-+}
-+
-+void MacroAssembler::atomicEffectOp64(Synchronization sync, AtomicOp op,
-+                                      Register64 value, const Address& mem,
-+                                      Register64 temp) {
-+  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, temp);
-+}
-+
-+void MacroAssembler::atomicEffectOp64(Synchronization sync, AtomicOp op,
-+                                      Register64 value, const BaseIndex& mem,
-+                                      Register64 temp) {
-+  AtomicFetchOp64(*this, nullptr, sync, op, value, mem, temp, temp);
-+}
-+
-+void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
-+                                       AtomicOp op, Register value,
-+                                       const Address& mem, Register valueTemp,
-+                                       Register offsetTemp, Register maskTemp,
-+                                       Register output) {
-+  AtomicFetchOp(*this, &access, access.type(), access.sync(), op, mem, value,
-+                valueTemp, offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
-+                                       AtomicOp op, Register value,
-+                                       const BaseIndex& mem, Register valueTemp,
-+                                       Register offsetTemp, Register maskTemp,
-+                                       Register output) {
-+  AtomicFetchOp(*this, &access, access.type(), access.sync(), op, mem, value,
-+                valueTemp, offsetTemp, maskTemp, output);
-+}
-+
-+void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
-+                                         AtomicOp op, Register64 value,
-+                                         const Address& mem, Register64 temp,
-+                                         Register64 output) {
-+  AtomicFetchOp64(*this, &access, access.sync(), op, value, mem, temp, output);
-+}
-+
-+void MacroAssembler::wasmAtomicFetchOp64(const wasm::MemoryAccessDesc& access,
-+                                         AtomicOp op, Register64 value,
-+                                         const BaseIndex& mem, Register64 temp,
-+                                         Register64 output) {
-+  AtomicFetchOp64(*this, &access, access.sync(), op, value, mem, temp, output);
-+}
-+
-+void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
-+                                        AtomicOp op, Register value,
-+                                        const Address& mem, Register valueTemp,
-+                                        Register offsetTemp,
-+                                        Register maskTemp) {
-+  AtomicEffectOp(*this, &access, access.type(), access.sync(), op, mem, value,
-+                 valueTemp, offsetTemp, maskTemp);
-+}
-+
-+void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
-+                                        AtomicOp op, Register value,
-+                                        const BaseIndex& mem,
-+                                        Register valueTemp, Register offsetTemp,
-+                                        Register maskTemp) {
-+  AtomicEffectOp(*this, &access, access.type(), access.sync(), op, mem, value,
-+                 valueTemp, offsetTemp, maskTemp);
-+}
-+
-+// ========================================================================
-+// JS atomic operations.
-+
-+template <typename T>
-+static void CompareExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
-+                              Synchronization sync, const T& mem,
-+                              Register oldval, Register newval,
-+                              Register valueTemp, Register offsetTemp,
-+                              Register maskTemp, Register temp,
-+                              AnyRegister output) {
-+  if (arrayType == Scalar::Uint32) {
-+    masm.compareExchange(arrayType, sync, mem, oldval, newval, valueTemp,
-+                         offsetTemp, maskTemp, temp);
-+    masm.convertUInt32ToDouble(temp, output.fpu());
-+  } else {
-+    masm.compareExchange(arrayType, sync, mem, oldval, newval, valueTemp,
-+                         offsetTemp, maskTemp, output.gpr());
-+  }
-+}
-+
-+template <typename T>
-+static void AtomicExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
-+                             Synchronization sync, const T& mem, Register value,
-+                             Register valueTemp, Register offsetTemp,
-+                             Register maskTemp, Register temp,
-+                             AnyRegister output) {
-+  if (arrayType == Scalar::Uint32) {
-+    masm.atomicExchange(arrayType, sync, mem, value, valueTemp, offsetTemp,
-+                        maskTemp, temp);
-+    masm.convertUInt32ToDouble(temp, output.fpu());
-+  } else {
-+    masm.atomicExchange(arrayType, sync, mem, value, valueTemp, offsetTemp,
-+                        maskTemp, output.gpr());
-+  }
-+}
-+
-+template <typename T>
-+static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
-+                            Synchronization sync, AtomicOp op, Register value,
-+                            const T& mem, Register valueTemp,
-+                            Register offsetTemp, Register maskTemp,
-+                            Register temp, AnyRegister output) {
-+  if (arrayType == Scalar::Uint32) {
-+    masm.atomicFetchOp(arrayType, sync, op, value, mem, valueTemp, offsetTemp,
-+                       maskTemp, temp);
-+    masm.convertUInt32ToDouble(temp, output.fpu());
-+  } else {
-+    masm.atomicFetchOp(arrayType, sync, op, value, mem, valueTemp, offsetTemp,
-+                       maskTemp, output.gpr());
-+  }
-+}
-+
-+void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
-+                                       Synchronization sync, const Address& mem,
-+                                       Register oldval, Register newval,
-+                                       Register valueTemp, Register offsetTemp,
-+                                       Register maskTemp, Register temp,
-+                                       AnyRegister output) {
-+  CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, valueTemp,
-+                    offsetTemp, maskTemp, temp, output);
-+}
-+
-+void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
-+                                       Synchronization sync,
-+                                       const BaseIndex& mem, Register oldval,
-+                                       Register newval, Register valueTemp,
-+                                       Register offsetTemp, Register maskTemp,
-+                                       Register temp, AnyRegister output) {
-+  CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, valueTemp,
-+                    offsetTemp, maskTemp, temp, output);
-+}
-+
-+void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
-+                                      Synchronization sync, const Address& mem,
-+                                      Register value, Register valueTemp,
-+                                      Register offsetTemp, Register maskTemp,
-+                                      Register temp, AnyRegister output) {
-+  AtomicExchangeJS(*this, arrayType, sync, mem, value, valueTemp, offsetTemp,
-+                   maskTemp, temp, output);
-+}
-+
-+void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
-+                                      Synchronization sync,
-+                                      const BaseIndex& mem, Register value,
-+                                      Register valueTemp, Register offsetTemp,
-+                                      Register maskTemp, Register temp,
-+                                      AnyRegister output) {
-+  AtomicExchangeJS(*this, arrayType, sync, mem, value, valueTemp, offsetTemp,
-+                   maskTemp, temp, output);
-+}
-+
-+void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
-+                                     Synchronization sync, AtomicOp op,
-+                                     Register value, const Address& mem,
-+                                     Register valueTemp, Register offsetTemp,
-+                                     Register maskTemp, Register temp,
-+                                     AnyRegister output) {
-+  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, valueTemp, offsetTemp,
-+                  maskTemp, temp, output);
-+}
-+
-+void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
-+                                     Synchronization sync, AtomicOp op,
-+                                     Register value, const BaseIndex& mem,
-+                                     Register valueTemp, Register offsetTemp,
-+                                     Register maskTemp, Register temp,
-+                                     AnyRegister output) {
-+  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, valueTemp, offsetTemp,
-+                  maskTemp, temp, output);
-+}
-+
-+void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
-+                                      Synchronization sync, AtomicOp op,
-+                                      Register value, const BaseIndex& mem,
-+                                      Register valueTemp, Register offsetTemp,
-+                                      Register maskTemp) {
-+  AtomicEffectOp(*this, nullptr, arrayType, sync, op, mem, value, valueTemp,
-+                 offsetTemp, maskTemp);
-+}
-+
-+void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
-+                                      Synchronization sync, AtomicOp op,
-+                                      Register value, const Address& mem,
-+                                      Register valueTemp, Register offsetTemp,
-+                                      Register maskTemp) {
-+  AtomicEffectOp(*this, nullptr, arrayType, sync, op, mem, value, valueTemp,
-+                 offsetTemp, maskTemp);
-+}
-+
-+// ========================================================================
-+// Wasm address offset carry tests.
-+
-+void MacroAssemblerPPC64Compat::ma_add32TestCarry(Condition cond, Register rd,
-+                                                  Register rs, Imm32 imm,
-+                                                  Label* overflow) {
-+  MOZ_ASSERT(cond == Assembler::CarrySet || cond == Assembler::CarryClear);
-+  if (rd != rs) {
-+    asMasm().move32(rs, rd);
-+    asMasm().add32(imm, rd);
-+    as_cmplw(rd, rs);
-+  } else {
-+    // visitWasmAddOffset uses useRegisterAtStart, so the LIR allocator may
-+    // collapse rd onto rs. move32 + add32 would clobber rs before the
-+    // compare; save rs to a scratch first.
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    asMasm().move32(rs, scratch);
-+    asMasm().add32(imm, rd);
-+    as_cmplw(rd, scratch);
-+  }
-+  ma_b(cond == Assembler::CarrySet ? LessThan : GreaterThanOrEqual, overflow);
-+}
-+
-+void MacroAssemblerPPC64Compat::ma_addPtrTestCarry(Condition cond, Register rd,
-+                                                   Register rs, ImmWord imm,
-+                                                   Label* overflow) {
-+  MOZ_ASSERT(cond == Assembler::CarrySet || cond == Assembler::CarryClear);
-+  if (rd != rs) {
-+    asMasm().movePtr(rs, rd);
-+    asMasm().addPtr(ImmWord(imm.value), rd);
-+    as_cmpld(rd, rs);
-+  } else {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    asMasm().movePtr(rs, scratch);
-+    asMasm().addPtr(ImmWord(imm.value), rd);
-+    as_cmpld(rd, scratch);
-+  }
-+  ma_b(cond == Assembler::CarrySet ? LessThan : GreaterThanOrEqual, overflow);
-+}
-+
-+// ========================================================================
-+// Wasm load/store helpers.
-+
-+void MacroAssemblerPPC64Compat::wasmProbeLastByte(
-+    const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr) {
-+  if (HasPOWER9()) {
-+    return;
-+  }
-+  const unsigned size = Scalar::byteSize(access.type());
-+  if (size <= 1) {
-+    return;
-+  }
-+  UseScratchRegisterScope temps(asMasm());
-+  Register probeAddr = temps.Acquire();
-+  // size is at most 16 (Simd128), well within the int16_t range of as_addi.
-+  as_addi(probeAddr, ptr, int16_t(size - 1));
-+  // Record the probe as a wasm trap site so its SIGSEGV dispatches
-+  // through the wasm signal handler the same way the real access would.
-+  m_buffer.flushPool();
-+  append(access, wasm::TrapMachineInsn::Load8,
-+         FaultingCodeOffset(currentOffset()));
-+  // Probing 1-byte load; result discarded.
-+  as_lbzx(probeAddr, memoryBase, probeAddr);
-+}
-+
-+void MacroAssemblerPPC64Compat::wasmLoadImpl(
-+    const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
-+    Register ptrScratch, AnyRegister output) {
-+  access.assertOffsetInGuardPages();
-+  uint32_t offset = access.offset32();
-+  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
-+
-+  if (offset) {
-+    asMasm().addPtr(ImmWord(offset), ptrScratch);
-+    ptr = ptrScratch;
-+  }
-+
-+  wasmProbeLastByte(access, memoryBase, ptr);
-+
-+  asMasm().memoryBarrierBefore(access.sync());
-+  // Flush any pending constant pool entries before recording the trap site,
-+  // otherwise a pool body inserted between the recorded offset and the
-+  // emitted load shifts the load and leaves the pool guard branch at the
-+  // recorded offset (SummarizeTrapInstruction then rejects the trap site).
-+  m_buffer.flushPool();
-+  append(access, wasm::TrapMachineInsnForLoad(Scalar::byteSize(access.type())),
-+         FaultingCodeOffset(currentOffset()));
-+
-+  switch (access.type()) {
-+    case Scalar::Int8:
-+      as_lbzx(output.gpr(), memoryBase, ptr);
-+      as_extsb(output.gpr(), output.gpr());
-+      break;
-+    case Scalar::Uint8:
-+      as_lbzx(output.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Int16:
-+      as_lhax(output.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Uint16:
-+      as_lhzx(output.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Int32:
-+    case Scalar::Uint32:
-+      as_lwzx(output.gpr(), memoryBase, ptr);
-+      as_extsw(output.gpr(), output.gpr());
-+      break;
-+    case Scalar::Float64:
-+      if (access.isZeroExtendSimd128Load() || access.isSplatSimd128Load() ||
-+          access.isWidenSimd128Load()) {
-+        // lfdx is X-form scalar FP — encodes only 5-bit FRT, so a
-+        // Simd128 dest (encoding 32+) corrupts the opcode. Bridge
-+        // through ScratchDoubleReg (FPR f0, encoding 0).
-+        ScratchDoubleScope dscratch(asMasm());
-+        as_lfdx(dscratch, memoryBase, ptr);
-+        if (access.isZeroExtendSimd128Load()) {
-+          // Loaded value goes to BE dw1 (= LE dw0 = lane 0); BE dw0 = 0.
-+          as_xxlxor(ScratchSimd128Reg, ScratchSimd128Reg, ScratchSimd128Reg);
-+          as_xxpermdi(output.fpu(), ScratchSimd128Reg, dscratch, 0);
-+        } else if (access.isSplatSimd128Load()) {
-+          as_xxpermdi(output.fpu(), dscratch, dscratch, 0);
-+        } else {
-+          // widen: place loaded 64 bits in LE dw0 (= BE dw1) for widenLow.
-+          as_xxpermdi(output.fpu(), dscratch, dscratch, 2);
-+          switch (access.widenSimdOp()) {
-+            case wasm::SimdOp::V128Load8x8S:
-+              asMasm().widenLowInt8x16(output.fpu(), output.fpu());
-+              break;
-+            case wasm::SimdOp::V128Load8x8U:
-+              asMasm().unsignedWidenLowInt8x16(output.fpu(), output.fpu());
-+              break;
-+            case wasm::SimdOp::V128Load16x4S:
-+              asMasm().widenLowInt16x8(output.fpu(), output.fpu());
-+              break;
-+            case wasm::SimdOp::V128Load16x4U:
-+              asMasm().unsignedWidenLowInt16x8(output.fpu(), output.fpu());
-+              break;
-+            case wasm::SimdOp::V128Load32x2S:
-+              asMasm().widenLowInt32x4(output.fpu(), output.fpu());
-+              break;
-+            case wasm::SimdOp::V128Load32x2U:
-+              asMasm().unsignedWidenLowInt32x4(output.fpu(), output.fpu());
-+              break;
-+            default:
-+              MOZ_CRASH("Unexpected widen op");
-+          }
-+        }
-+      } else {
-+        as_lfdx(output.fpu(), memoryBase, ptr);
-+      }
-+      break;
-+    case Scalar::Float32:
-+      if (access.isZeroExtendSimd128Load()) {
-+        // v128.load32_zero: load 32 raw bits into lane 0, zero the rest.
-+        UseScratchRegisterScope temps(asMasm());
-+        Register tmp = temps.Acquire();
-+        as_lwzx(tmp, memoryBase, ptr);
-+        as_xxlxor(output.fpu(), output.fpu(), output.fpu());
-+        if (HasPOWER9()) {
-+          as_mtvsrws(ScratchSimd128Reg, tmp);
-+          as_xxinsertw(output.fpu(), ScratchSimd128Reg, 12);
-+        } else {
-+          // POWER8: mtvsrd puts value in BE dw0 low 32 bits.
-+          // xxpermdi(dest, zero, scratch, 0) = {zero[dw0], scratch[dw0]}
-+          // in BE, placing the value in LE word 0 with the rest zero.
-+          as_mtvsrd(ScratchSimd128Reg, tmp);
-+          as_xxpermdi(output.fpu(), output.fpu(), ScratchSimd128Reg, 0);
-+        }
-+      } else {
-+        as_lfsx(output.fpu(), memoryBase, ptr);
-+      }
-+      break;
-+    case Scalar::Simd128:
-+      if (HasPOWER9()) {
-+        as_lxvx(output.fpu(), memoryBase, ptr);
-+      } else {
-+        as_lxvd2x(output.fpu(), memoryBase, ptr);
-+        as_xxpermdi(output.fpu(), output.fpu(), output.fpu(), 2);
-+      }
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected array type");
-+  }
-+
-+  asMasm().memoryBarrierAfter(access.sync());
-+}
-+
-+void MacroAssemblerPPC64Compat::wasmStoreImpl(
-+    const wasm::MemoryAccessDesc& access, AnyRegister value,
-+    Register memoryBase, Register ptr, Register ptrScratch) {
-+  access.assertOffsetInGuardPages();
-+  uint32_t offset = access.offset32();
-+  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
-+
-+  if (offset) {
-+    asMasm().addPtr(ImmWord(offset), ptrScratch);
-+    ptr = ptrScratch;
-+  }
-+
-+  wasmProbeLastByte(access, memoryBase, ptr);
-+
-+  asMasm().memoryBarrierBefore(access.sync());
-+  // Record trap site at the faulting memory instruction. For P8 Simd128
-+  // store, the faulting instruction (stxvd2x) is after a byte-swap
-+  // (xxpermdi), so we defer the trap site recording.
-+  // Flush pool first; see comment in wasmLoadImpl.
-+  if (access.type() != Scalar::Simd128 || HasPOWER9()) {
-+    m_buffer.flushPool();
-+    append(access,
-+           wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
-+           FaultingCodeOffset(currentOffset()));
-+  }
-+
-+  switch (access.type()) {
-+    case Scalar::Int8:
-+    case Scalar::Uint8:
-+      as_stbx(value.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Int16:
-+    case Scalar::Uint16:
-+      as_sthx(value.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Int32:
-+    case Scalar::Uint32:
-+      as_stwx(value.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Int64:
-+      as_stdx(value.gpr(), memoryBase, ptr);
-+      break;
-+    case Scalar::Float64:
-+      as_stfdx(value.fpu(), memoryBase, ptr);
-+      break;
-+    case Scalar::Float32:
-+      as_stfsx(value.fpu(), memoryBase, ptr);
-+      break;
-+    case Scalar::Simd128:
-+      if (HasPOWER9()) {
-+        as_stxvx(value.fpu(), memoryBase, ptr);
-+      } else {
-+        as_xxpermdi(ScratchSimd128Reg, value.fpu(), value.fpu(), 2);
-+        m_buffer.flushPool();  // see comment in wasmLoadImpl
-+        append(access,
-+               wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
-+               FaultingCodeOffset(currentOffset()));
-+        as_stxvd2x(ScratchSimd128Reg, memoryBase, ptr);
-+      }
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected array type");
-+  }
-+
-+  asMasm().memoryBarrierAfter(access.sync());
-+}
-+
-+void MacroAssemblerPPC64Compat::wasmLoadI64Impl(
-+    const wasm::MemoryAccessDesc& access, Register memoryBase, Register ptr,
-+    Register ptrScratch, Register64 output) {
-+  uint32_t offset = access.offset32();
-+  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
-+
-+  if (offset) {
-+    asMasm().addPtr(ImmWord(offset), ptrScratch);
-+    ptr = ptrScratch;
-+  }
-+
-+  wasmProbeLastByte(access, memoryBase, ptr);
-+
-+  asMasm().memoryBarrierBefore(access.sync());
-+  m_buffer.flushPool();  // see comment in wasmLoadImpl
-+  append(access, wasm::TrapMachineInsnForLoad(Scalar::byteSize(access.type())),
-+         FaultingCodeOffset(currentOffset()));
-+
-+  switch (access.type()) {
-+    case Scalar::Int8:
-+      as_lbzx(output.reg, memoryBase, ptr);
-+      as_extsb(output.reg, output.reg);
-+      break;
-+    case Scalar::Uint8:
-+      as_lbzx(output.reg, memoryBase, ptr);
-+      break;
-+    case Scalar::Int16:
-+      as_lhax(output.reg, memoryBase, ptr);
-+      break;
-+    case Scalar::Uint16:
-+      as_lhzx(output.reg, memoryBase, ptr);
-+      break;
-+    case Scalar::Int32:
-+      as_lwzx(output.reg, memoryBase, ptr);
-+      as_extsw(output.reg, output.reg);
-+      break;
-+    case Scalar::Uint32:
-+      as_lwzx(output.reg, memoryBase, ptr);
-+      // Zero-extended by lwzx already
-+      break;
-+    case Scalar::Int64:
-+      as_ldx(output.reg, memoryBase, ptr);
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected array type");
-+  }
-+
-+  asMasm().memoryBarrierAfter(access.sync());
-+}
-+
-+void MacroAssemblerPPC64Compat::wasmStoreI64Impl(
-+    const wasm::MemoryAccessDesc& access, Register64 value, Register memoryBase,
-+    Register ptr, Register ptrScratch) {
-+  uint32_t offset = access.offset32();
-+  MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
-+
-+  if (offset) {
-+    asMasm().addPtr(ImmWord(offset), ptrScratch);
-+    ptr = ptrScratch;
-+  }
-+
-+  wasmProbeLastByte(access, memoryBase, ptr);
-+
-+  asMasm().memoryBarrierBefore(access.sync());
-+  m_buffer.flushPool();  // see comment in wasmLoadImpl
-+  append(access, wasm::TrapMachineInsnForStore(Scalar::byteSize(access.type())),
-+         FaultingCodeOffset(currentOffset()));
-+
-+  switch (access.type()) {
-+    case Scalar::Int8:
-+    case Scalar::Uint8:
-+      as_stbx(value.reg, memoryBase, ptr);
-+      break;
-+    case Scalar::Int16:
-+    case Scalar::Uint16:
-+      as_sthx(value.reg, memoryBase, ptr);
-+      break;
-+    case Scalar::Int32:
-+    case Scalar::Uint32:
-+      as_stwx(value.reg, memoryBase, ptr);
-+      break;
-+    case Scalar::Int64:
-+      as_stdx(value.reg, memoryBase, ptr);
-+      break;
-+    default:
-+      MOZ_CRASH("unexpected array type");
-+  }
-+
-+  asMasm().memoryBarrierAfter(access.sync());
-+}
-+
-+void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
-+                              Register memoryBase, Register ptr,
-+                              Register ptrScratch, AnyRegister output) {
-+  wasmLoadImpl(access, memoryBase, ptr, ptrScratch, output);
-+}
-+
-+void MacroAssembler::wasmLoadI64(const wasm::MemoryAccessDesc& access,
-+                                 Register memoryBase, Register ptr,
-+                                 Register ptrScratch, Register64 output) {
-+  wasmLoadI64Impl(access, memoryBase, ptr, ptrScratch, output);
-+}
-+
-+void MacroAssembler::wasmStore(const wasm::MemoryAccessDesc& access,
-+                               AnyRegister value, Register memoryBase,
-+                               Register ptr, Register ptrScratch) {
-+  wasmStoreImpl(access, value, memoryBase, ptr, ptrScratch);
-+}
-+
-+void MacroAssembler::wasmStoreI64(const wasm::MemoryAccessDesc& access,
-+                                  Register64 value, Register memoryBase,
-+                                  Register ptr, Register ptrScratch) {
-+  wasmStoreI64Impl(access, value, memoryBase, ptr, ptrScratch);
-+}
-+
-+//}}} check_macroassembler_style
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#ifdef ENABLE_WASM_SIMD
-+// static
-+bool MacroAssembler::MustMaskShiftCountSimd128(wasm::SimdOp op, int32_t* mask) {
-+  return false;
-+}
-+#endif
-diff --git a/js/src/jit/ppc64/MacroAssembler-ppc64.h b/js/src/jit/ppc64/MacroAssembler-ppc64.h
-new file mode 100644
-index 000000000000..bc2143b67465
---- /dev/null
-+++ b/js/src/jit/ppc64/MacroAssembler-ppc64.h
-@@ -0,0 +1,2031 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_MacroAssembler_ppc64_h
-+#define jit_ppc64_MacroAssembler_ppc64_h
-+
-+#include "jit/MoveResolver.h"
-+#include "jit/ppc64/Assembler-ppc64.h"
-+#include "wasm/WasmBuiltins.h"
-+
-+namespace js {
-+namespace jit {
-+
-+inline bool is_intN(int64_t x, unsigned n) {
-+  MOZ_ASSERT((0 < n) && (n < 64));
-+  int64_t limit = static_cast<int64_t>(1) << (n - 1);
-+  return (-limit <= x) && (x < limit);
-+}
-+
-+inline bool is_uintN(uint64_t x, unsigned n) {
-+  MOZ_ASSERT((0 < n) && (n < 64));
-+  return !(x >> n);
-+}
-+
-+// enterNoPool() guard sizes. Inhibiting the constant pool keeps these
-+// stanzas at a fixed instruction count, which patchers and long-branch
-+// resolvers rely on. Each constant names a distinct stanza shape; see
-+// the emitting call site for the exact layout.
-+//
-+// kNoPoolLoad64StanzaInsns (8): emitLoad64Stanza body — 8 NOPs that
-+//   WriteLoad64Instructions later overwrites in place. Two shapes share
-+//   the same 8-slot footprint with the .quad fixed at slots [6..7]:
-+//     - POWER9+ (HasPOWER9()): addpcis + ld + b + 3 NOPs (2 dynamic insns,
-+//       no LR clobber). Preferred path.
-+//     - POWER8 fallback: mflr/bcl/mflr/mtlr/ld/b LR-bouncing sequence
-+//       (6 dynamic insns, RAS-thrashing — kept only because P8 has no
-+//       addpcis).
-+//
-+// kNoPoolPatchableBranchInsns (10): patchable far call / jump /
-+//   unconditional branch. Three alternative shapes, all fitting the
-+//   same budget:
-+//     - load64 stanza (8) + mtctr + bctr[l]  = 10  (bound call/jump)
-+//     - 9 NOPs + bl                          = 10  (short bound call)
-+//     - xs_trap_tagged(TAG) + chain + 8 NOPs = 10  (fwd-ref stanza)
-+//
-+// kNoPoolCondLongBranchInsnsP8Max (14): conditional long branch, POWER8
-+//   Overflow worst case. POWER8 has no mcrxrx so overflow/carry test is
-+//   mfxer+rlwinm+mtcrf (3 insns) on top of the base shape. Budget =
-+//   3 (XER inspection) + 1 (bc) + 8 (load64 stanza) + 2 (mtctr+bctr) = 14.
-+static constexpr size_t kNoPoolLoad64StanzaInsns = 8;
-+static constexpr size_t kNoPoolPatchableBranchInsns = 10;
-+static constexpr size_t kNoPoolCondLongBranchInsnsP8Max = 14;
-+
-+enum LoadStoreSize {
-+  SizeByte = 8,
-+  SizeHalfWord = 16,
-+  SizeWord = 32,
-+  SizeDouble = 64
-+};
-+
-+enum LoadStoreExtension { ZeroExtend = 0, SignExtend = 1 };
-+
-+static const Register CallReg = r12;  // Fixed alias for r12; const so this header-defined per-TU object cannot be reassigned.
-+
-+struct ImmShiftedTag : public ImmWord {
-+  explicit ImmShiftedTag(JSValueShiftedTag shtag) : ImmWord((uintptr_t)shtag) {}
-+  explicit ImmShiftedTag(JSValueType type)
-+      : ImmWord(((uintptr_t)JSVAL_TYPE_TO_SHIFTED_TAG(type))) {}
-+};
-+
-+struct ImmTag : public Imm32 {
-+  explicit ImmTag(JSValueTag tag) : Imm32(tag) {}
-+};
-+
-+class ScratchTagScope {  // Holds one scratch register for a value tag; the register can be handed back and re-taken mid-scope.
-+  UseScratchRegisterScope temps_;
-+  Register scratch_;
-+  bool owned_;  // True while scratch_ is currently acquired from temps_.
-+  mozilla::DebugOnly<bool> released_;  // Debug-only release()/reacquire() state, checked by the assertions below.
-+
-+ public:
-+  ScratchTagScope(Assembler& masm, const ValueOperand&)  // The ValueOperand parameter is unnamed and not used here.
-+      : temps_(masm), owned_(true), released_(false) {
-+    scratch_ = temps_.Acquire();
-+  }
-+
-+  operator Register() {  // Implicit conversion to the held register; asserts it has not been released.
-+    MOZ_ASSERT(!released_);
-+    return scratch_;
-+  }
-+
-+  void release() {  // Hand the register back to temps_; must not already be released.
-+    MOZ_ASSERT(!released_);
-+    released_ = true;
-+    if (owned_) {
-+      temps_.Release(scratch_);
-+      owned_ = false;
-+    }
-+  }
-+
-+  void reacquire() {  // Take a register again after release(); scratch_ is re-read from Acquire() and may differ.
-+    MOZ_ASSERT(released_);
-+    released_ = false;
-+    if (!owned_) {
-+      scratch_ = temps_.Acquire();
-+      owned_ = true;
-+    }
-+  }
-+};
-+
-+class ScratchTagScopeRelease {  // Scoped helper: releases a ScratchTagScope on construction and reacquires it on destruction.
-+  ScratchTagScope* ts_;  // Not owned; dereferenced in both constructor and destructor.
-+
-+ public:
-+  explicit ScratchTagScopeRelease(ScratchTagScope* ts) : ts_(ts) {
-+    ts_->release();  // release() asserts the scope was not already released.
-+  }
-+  ~ScratchTagScopeRelease() { ts_->reacquire(); }
-+};
-+
-+class MacroAssemblerPPC64 : public Assembler {  // Assembler subclass that only adds the asMasm() accessors below.
-+ protected:
-+  MacroAssembler& asMasm();  // Declared here; the definition is not in this header chunk.
-+  const MacroAssembler& asMasm() const;
-+};
-+
-+class MacroAssemblerPPC64Compat : public MacroAssemblerPPC64 {
-+ public:
-+  using MacroAssemblerPPC64::MacroAssemblerPPC64;
-+
-+  MacroAssemblerPPC64Compat() {}
-+
-+  bool buildOOLFakeExitFrame(void* fakeReturnAddr);
-+
-+  // ===============================================================
-+  // Conversion functions
-+
-+  void convertBoolToInt32(Register src, Register dest) {
-+    as_rlwinm(dest, src, 0, 31, 31);
-+  }
-+  void convertInt32ToDouble(Register src, FloatRegister dest) {
-+    // mtvsrwa: VSR[dest].dw0 = sign_ext_64(src[32:63]); P8+ (ISA 2.07).
-+    // Replaces extsw + mtvsrd (2 insns + scratch GPR) with 1 insn.
-+    as_mtvsrwa(dest, src);
-+    as_fcfid(dest, dest);
-+  }
-+  void convertInt32ToDouble(const Address& src, FloatRegister dest) {
-+    // lfiwax (P7+): FPR.dw[0] = sign_ext_64(MEM[addr, 4]). X-form indexed
-+    // — no immediate offset, so when offset != 0 we add it into a scratch
-+    // first. Replaces lwz + extsw + mtvsrd with lfiwax (one insn) plus
-+    // optional address add.
-+    if (src.offset == 0) {
-+      as_lfiwax(dest, r0, src.base);
-+    } else {
-+      UseScratchRegisterScope temps(*this);
-+      Register scratch = temps.Acquire();
-+      if (is_intN(src.offset, 16)) {
-+        as_addi(scratch, src.base, src.offset);
-+        as_lfiwax(dest, r0, scratch);
-+      } else {
-+        // X-form indexed: lfiwax computes base + scratch directly, no add.
-+        movePtr(ImmWord(src.offset), scratch);
-+        as_lfiwax(dest, src.base, scratch);
-+      }
-+    }
-+    as_fcfid(dest, dest);
-+  }
-+  void convertInt32ToDouble(const BaseIndex& src, FloatRegister dest) {  // BaseIndex form: fold base+index into a scratch, then reuse the Address overload.
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);  // scratch = base + scaled index; src.offset is applied by the call below.
-+    convertInt32ToDouble(Address(scratch, src.offset), dest);  // NOTE(review): the Address overload acquires another scratch when offset != 0 — confirm two scratches are available.
-+  }
-+  void convertUInt32ToDouble(Register src, FloatRegister dest);
-+  void convertUInt32ToFloat32(Register src, FloatRegister dest);
-+  void convertDoubleToFloat32(FloatRegister src, FloatRegister dest) {
-+    as_frsp(dest, src);
-+  }
-+  // POWER9 FP16 conversions (1 insn each). Caller must have verified
-+  // HasPOWER9() — SupportsFloat{64,32}To16 gates that. PPC64 FPRs hold
-+  // doubles internally; an "FP32-in-FPR" is just the FP32 value stored
-+  // as exact FP64, so xscvdphp/xscvhpdp work for both FP32↔FP16 and
-+  // FP64↔FP16 (FP16 fits exactly in FP32 which fits exactly in FP64).
-+  void convertDoubleToFloat16(FloatRegister src, FloatRegister dest) {
-+    MOZ_ASSERT(HasPOWER9());
-+    as_xscvdphp(dest, src);
-+  }
-+  void convertFloat16ToDouble(FloatRegister src, FloatRegister dest) {
-+    MOZ_ASSERT(HasPOWER9());
-+    as_xscvhpdp(dest, src);
-+  }
-+  void convertFloat32ToFloat16(FloatRegister src, FloatRegister dest) {
-+    MOZ_ASSERT(HasPOWER9());
-+    as_xscvdphp(dest, src);
-+  }
-+  void convertFloat16ToFloat32(FloatRegister src, FloatRegister dest) {
-+    MOZ_ASSERT(HasPOWER9());
-+    as_xscvhpdp(dest, src);
-+  }
-+  void convertInt32ToFloat16(Register src, FloatRegister dest) {
-+    MOZ_ASSERT(HasPOWER9());
-+    convertInt32ToFloat32(src, dest);
-+    convertFloat32ToFloat16(dest, dest);
-+  }
-+  void convertDoubleToInt32(FloatRegister src, Register dest, Label* fail,
-+                            bool negativeZeroCheck = true);
-+  void convertDoubleToPtr(FloatRegister src, Register dest, Label* fail,
-+                          bool negativeZeroCheck = true);
-+  void convertFloat32ToInt32(FloatRegister src, Register dest, Label* fail,
-+                             bool negativeZeroCheck = true);
-+  void convertFloat32ToDouble(FloatRegister src, FloatRegister dest) {
-+    // PPC64 FPRs hold every FP32 value in its FP64-equivalent representation,
-+    // so f64.promote_f32 is conceptually a no-op except that wasm requires
-+    // sNaN inputs to be quieted. frsp (Round to Single-Precision) is the
-+    // identity for SP-representable inputs but applies IEEE NaN-quieting as
-+    // a side effect, replacing the prior fmr + fcmpu + branch + canonical-
-+    // NaN-load (5+ insns + scratch GPR) with a single instruction. Result
-+    // matches what x86 vcvtss2sd / ARM fcvt produce.
-+    as_frsp(dest, src);
-+  }
-+  void convertInt32ToFloat32(Register src, FloatRegister dest) {
-+    // mtvsrwa + fcfids; same recipe as convertInt32ToDouble(Register).
-+    as_mtvsrwa(dest, src);
-+    as_fcfids(dest, dest);
-+  }
-+  void convertInt32ToFloat32(const Address& src, FloatRegister dest) {
-+    // lfiwax + fcfids; same recipe as convertInt32ToDouble(Address).
-+    if (src.offset == 0) {
-+      as_lfiwax(dest, r0, src.base);
-+    } else {
-+      UseScratchRegisterScope temps(*this);
-+      Register scratch = temps.Acquire();
-+      if (is_intN(src.offset, 16)) {
-+        as_addi(scratch, src.base, src.offset);
-+        as_lfiwax(dest, r0, scratch);
-+      } else {
-+        movePtr(ImmWord(src.offset), scratch);
-+        as_lfiwax(dest, src.base, scratch);
-+      }
-+    }
-+    as_fcfids(dest, dest);
-+  }
-+
-+  // POWER9 FP16 load: lxsihzx writes the 2 memory bytes directly into
-+  // dw[0] low 16 bits with the rest zeroed — matching the layout that
-+  // xscvhpdp expects, in a single instruction.
-+  FaultingCodeOffset loadFloat16(const Address& addr, FloatRegister dest,
-+                                 Register temp) {
-+    MOZ_ASSERT(HasPOWER9());
-+    if (addr.offset == 0) {
-+      return FaultingCodeOffset(as_lxsihzx(dest, r0, addr.base).getOffset());
-+    }
-+    if (is_intN(addr.offset, 16)) {
-+      as_addi(temp, addr.base, addr.offset);
-+      return FaultingCodeOffset(as_lxsihzx(dest, r0, temp).getOffset());
-+    }
-+    movePtr(ImmWord(addr.offset), temp);
-+    return FaultingCodeOffset(as_lxsihzx(dest, addr.base, temp).getOffset());
-+  }
-+  FaultingCodeOffset loadFloat16(const BaseIndex& src, FloatRegister dest,
-+                                 Register temp) {
-+    MOZ_ASSERT(HasPOWER9());
-+    computeEffectiveAddress(src, temp);
-+    return FaultingCodeOffset(as_lxsihzx(dest, r0, temp).getOffset());
-+  }
-+
-+  // ===============================================================
-+  // Effective address computation
-+
-+  void computeScaledAddress(const BaseIndex& address, Register dest) {  // dest = base + (index shifted left by address.scale); address.offset is NOT applied.
-+    if (address.scale == TimesOne) {
-+      as_add(dest, address.base, address.index);  // Scale of one: plain add, no shift needed.
-+    } else if (dest != address.base && dest != address.index) {
-+      x_sldi(dest, address.index, address.scale);  // dest aliases neither input, so it can hold the shifted index.
-+      as_add(dest, address.base, dest);
-+    } else {
-+      UseScratchRegisterScope temps(*this);  // dest aliases base or index: shift into a scratch instead.
-+      Register scratch = temps.Acquire();
-+      x_sldi(scratch, address.index, address.scale);
-+      as_add(dest, address.base, scratch);
-+    }
-+  }
-+
-+  void computeEffectiveAddress(const Address& address, Register dest) {  // dest = address.base + address.offset, using the shortest form available.
-+    if (address.offset == 0) {
-+      if (dest != address.base) {  // Zero offset and same register: nothing to emit.
-+        xs_mr(dest, address.base);
-+      }
-+    } else if (is_intN(address.offset, 16)) {
-+      as_addi(dest, address.base, address.offset);  // Offset fits the 16-bit signed immediate.
-+    } else if (HasPOWER10() && is_intN(address.offset, 34)) {
-+      // Single-insn 34-bit-signed reg+imm add. Avoids the scratch GPR.
-+      as_paddi(dest, address.base, address.offset, /*R=*/false);
-+    } else {
-+      UseScratchRegisterScope temps(*this);  // Materialize the offset in a scratch, then add.
-+      Register scratch = temps.Acquire();
-+      MOZ_ASSERT(scratch != dest);  // dest is written last; the assertion requires it to differ from the scratch.
-+      movePtr(ImmWord(address.offset), scratch);
-+      as_add(dest, address.base, scratch);
-+    }
-+  }
-+  void computeEffectiveAddress(const BaseIndex& address, Register dest) {
-+    computeScaledAddress(address, dest);
-+    if (address.offset) {
-+      if (is_intN(address.offset, 16)) {
-+        as_addi(dest, dest, address.offset);
-+      } else if (HasPOWER10() && is_intN(address.offset, 34)) {
-+        as_paddi(dest, dest, address.offset, /*R=*/false);
-+      } else {
-+        UseScratchRegisterScope temps(*this);
-+        Register scratch = temps.Acquire();
-+        MOZ_ASSERT(scratch != dest);
-+        movePtr(ImmWord(address.offset), scratch);
-+        as_add(dest, dest, scratch);
-+      }
-+    }
-+  }
-+
-+  // ===============================================================
-+  // Move instructions
-+
-+  void mov(Register src, Register dest) { xs_mr(dest, src); }
-+  void mov(ImmWord imm, Register dest) { movePtr(imm, dest); }
-+  void mov(ImmPtr imm, Register dest) {
-+    mov(ImmWord(uintptr_t(imm.value)), dest);
-+  }
-+  // Emit an 8-instruction NOP stanza for a patchable 64-bit load.
-+  // Pool flushes are inhibited during emission to prevent pool data
-+  // from being inserted mid-stanza. Returns the offset of the first slot.
-+  BufferOffset emitLoad64Stanza(Register dest, uint64_t value) {
-+    m_buffer.enterNoPool(kNoPoolLoad64StanzaInsns);
-+    BufferOffset bo = writeInst(NopInst);  // Slot 0; bo is what callers record for later patching.
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);  // Slot 7: eight placeholders in total, matching kNoPoolLoad64StanzaInsns.
-+    m_buffer.leaveNoPool();
-+    // If any of the 8 writeInst calls hit OOM, only some of the stanza
-+    // was reserved in the buffer. WriteLoad64Instructions writes 32 bytes
-+    // unconditionally, so calling it here would overflow the Vector's
-+    // backing store and corrupt the next heap chunk's metadata, surfacing
-+    // later as a malloc-detected free-time crash.
-+    if (m_buffer.oom()) {
-+      return bo;  // The placeholders were never filled in; callers must not rely on the stanza after OOM.
-+    }
-+    WriteLoad64Instructions((Instruction*)editSrc(bo), dest, value);  // Overwrite the placeholders in place with the real load of |value| into |dest|.
-+    return bo;
-+  }
-+
-+  // Emit a patchable 64-bit load stanza targeting |dest| and record its
-+  // start offset on |label| so the real value can be patched in later.
-+  void mov(CodeLabel* label, Register dest) {
-+    BufferOffset bo = emitLoad64Stanza(dest, LabelBase::INVALID_OFFSET);
-+    // After OOM the stanza may be only partially emitted, so |bo| is not a
-+    // trustworthy offset. Skip the bind, mirroring the !oom() guard that
-+    // jump() and ma_b() place around label->use().
-+    if (!oom()) {
-+      label->patchAt()->bind(bo.getOffset());
-+    }
-+    label->setLinkMode(CodeLabel::MoveImmediate);
-+  }
-+  void mov(Register src, Address dest) { storePtr(src, dest); }
-+  void mov(Address src, Register dest) { loadPtr(src, dest); }
-+
-+  void move32(Imm32 imm, Register dest) {  // Load a 32-bit immediate into dest using 1-2 instructions, chosen by value range.
-+    if (is_intN(imm.value, 16)) {
-+      xs_li(dest, (int16_t)imm.value);  // Fits a 16-bit signed immediate: single li.
-+    } else if (is_uintN((uint32_t)imm.value, 16)) {
-+      xs_li(dest, 0);  // 0x8000..0xFFFF: clear, then OR in the low half.
-+      as_ori(dest, dest, (uint16_t)imm.value);
-+    } else {
-+      xs_lis(dest, (int16_t)((uint32_t)imm.value >> 16));  // Upper 16 bits first.
-+      if (imm.value & 0xffff) {  // Skip the ori when the low half is zero.
-+        as_ori(dest, dest, (uint16_t)imm.value);
-+      }
-+    }
-+  }
-+  void move32(Register src, Register dest) { as_extsw(dest, src); }
-+
-+  void movePtr(Register src, Register dest) {
-+    if (src != dest) {
-+      xs_mr(dest, src);
-+    }
-+  }
-+  void movePtr(ImmWord imm, Register dest) {  // Materialize a pointer-sized immediate in dest with the shortest sequence for its range.
-+    if (imm.value == 0) {
-+      xs_li(dest, 0);  // Same instruction the next branch would emit for 0; kept as an explicit fast case.
-+    } else if (is_intN((intptr_t)imm.value, 16)) {
-+      xs_li(dest, (int16_t)imm.value);
-+    } else if (is_uintN(imm.value, 16)) {
-+      xs_li(dest, 0);
-+      as_ori(dest, dest, (uint16_t)imm.value);
-+    } else if (is_intN((intptr_t)imm.value, 32)) {
-+      // 32-bit signed: lis + ori (2 instructions).
-+      xs_lis(dest, (int16_t)((uint32_t)imm.value >> 16));
-+      if (imm.value & 0xFFFF) {  // Skip the ori when the low half is zero.
-+        as_ori(dest, dest, (uint16_t)imm.value);
-+      }
-+    } else if (HasPOWER10() && is_intN((intptr_t)imm.value, 34)) {
-+      // POWER10 single-instruction 34-bit signed immediate. Replaces the
-+      // 5-insn fallback for values that need 33 or 34 signed bits:
-+      // 8 bytes vs 20 bytes, and the fallback's temp register is not needed.
-+      as_paddi(dest, r0, (int64_t)imm.value, /*R=*/false);
-+    } else {
-+      // Full 64-bit: GCC-style lis+ori+lis+ori+rldimi (5 instructions).
-+      // No LR clobber, no embedded data — pure instruction sequence.
-+      uint32_t lo32 = (uint32_t)(imm.value);
-+      uint32_t hi32 = (uint32_t)(imm.value >> 32);
-+      Register temp = (dest != SecondScratchReg) ? SecondScratchReg  // NOTE(review): temp is clobbered without UseScratchRegisterScope — confirm no caller keeps a live value in it.
-+                                                 : SavedScratchRegister;
-+      m_buffer.ensureSpace(5 * sizeof(uint32_t));  // Reserve room for all five instructions up front.
-+      xs_lis(dest, (int16_t)(lo32 >> 16));  // dest <- lo32
-+      as_ori(dest, dest, lo32 & 0xFFFF);
-+      xs_lis(temp, (int16_t)(hi32 >> 16));  // temp <- hi32
-+      as_ori(temp, temp, hi32 & 0xFFFF);
-+      as_rldimi(dest, temp, 32, 0);  // Merge hi32 (from temp) into the upper half of dest.
-+    }
-+  }
-+  void movePtr(ImmPtr imm, Register dest) {
-+    movePtr(ImmWord(uintptr_t(imm.value)), dest);
-+  }
-+
-+  // Load a 64-bit FPR constant from the inline constant pool.
-+  // POWER9: 2 instructions (addpcis + lfd) -- no alignment constraint.
-+  // POWER10: 1 prefixed instruction (plfd, 2 slots), or 3 slots in the
-+  //   (loadAddr & 63) == 60 alignment-leading-nop case. Reserve 3 to
-+  //   cover both cases conservatively.
-+  // POWER8: not used -- loadConstantDouble inlines the constant.
-+  BufferOffset loadFromPoolFloat64(FloatRegister dest, double value) {
-+    size_t slots = HasPOWER10() ? 3 : 2;
-+    uint32_t hint = (uint32_t(dest.encoding()) << 16) |
-+                    (uint32_t(PoolLoadFPR64) << 21) | 0xF0000000;
-+    uint32_t inst[3] = {hint, NopInst, NopInst};
-+    return m_buffer.allocEntry(slots, 2, (uint8_t*)inst, (uint8_t*)&value);
-+  }
-+  // Load a 32-bit FPR constant from the inline constant pool.
-+  // Same shape as loadFromPoolFloat64 (above). lfs/plfs auto-expand the
-+  // 32-bit single-precision value to double in the FPR, so no follow-up
-+  // xscvspdpn is needed.
-+  BufferOffset loadFromPoolFloat32(FloatRegister dest, float value) {
-+    size_t slots = HasPOWER10() ? 3 : 2;
-+    uint32_t hint = (uint32_t(dest.encoding()) << 16) |
-+                    (uint32_t(PoolLoadFPR32) << 21) | 0xF0000000;
-+    uint32_t inst[3] = {hint, NopInst, NopInst};
-+    return m_buffer.allocEntry(slots, 1, (uint8_t*)inst, (uint8_t*)&value);
-+  }
-+  // Load a 128-bit SIMD constant from the inline constant pool.
-+  // Per-arch slot reservation -- the patcher writes only the slots
-+  // each micro-arch actually needs:
-+  //   P8: 5 (bcl + mflr + addi + lxvd2x + xxpermdi)
-+  //   P9: 3 (addpcis + addi + lxvx) -- no LR touch, no RAS hazard
-+  //   P10: 3 (alignment-safe: prefix + suffix + 1 reserve for the
-+  //          (loadAddr & 63) == 60 leading-nop case)
-+  // Pool entry is 4 × 4-byte words = 16 bytes. P9 uses
-+  // SavedScratchRegister (r16) as the PC base; P10 emits a single
-+  // PC-relative plxv with no scratch and no LR touch. Only P8 still
-+  // clobbers LR (correctness-only fallback; live by design).
-+  BufferOffset loadFromPoolSimd128(FloatRegister dest,
-+                                   const SimdConstant& v) {
-+    size_t slots;  // Number of instruction slots reserved for the patcher.
-+    if (HasPOWER10()) {
-+      slots = 3;
-+    } else if (HasPOWER9()) {
-+      slots = 3;  // Same count as POWER10; the branches are kept separate to mirror the table above.
-+    } else {
-+      slots = 5;
-+    }
-+    // Simd128 encoding is 32-63; mask to 5 bits for hint.
-+    // PatchConstantPoolLoad sets TX bit unconditionally for Simd128.
-+    uint32_t hint = ((uint32_t(dest.encoding()) & 0x1F) << 16) |
-+                    (uint32_t(PoolLoadSimd128) << 21) | 0xF0000000;
-+    uint32_t inst[5] = {hint, NopInst, NopInst, NopInst, NopInst};  // Sized for the largest (P8) case; only the first |slots| words are passed on.
-+    return m_buffer.allocEntry(slots, 4, (uint8_t*)inst, (uint8_t*)v.bytes());  // 4 = pool entry size in 32-bit words.
-+  }
-+  void movePtr(wasm::SymbolicAddress imm, Register dest) {
-+    BufferOffset bo = emitLoad64Stanza(dest, (uint64_t)-1);
-+    append(wasm::SymbolicAccess(CodeOffset(bo.getOffset()), imm));
-+  }
-+  void movePtr(ImmGCPtr imm, Register dest) {
-+    BufferOffset bo = emitLoad64Stanza(dest,
-+                                       (uint64_t)uintptr_t(imm.value));
-+    Assembler::writeDataRelocation(bo, imm);
-+  }
-+
-+  void moveFloat32(FloatRegister src, FloatRegister dest) {
-+    if (src != dest) {
-+      as_fmr(dest, src);
-+    }
-+  }
-+  void moveDouble(FloatRegister src, FloatRegister dest) {
-+    if (src != dest) {
-+      as_fmr(dest, src);
-+    }
-+  }
-+
-+  // ===============================================================
-+  // Branch functions
-+
-+  void branch(JitCode* c) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    BufferOffset bo = emitLoad64Stanza(scratch, (uint64_t)uintptr_t(c->raw()));
-+    addPendingJump(bo, ImmPtr(c->raw()), RelocationKind::JITCODE);
-+    xs_mtctr(scratch);
-+    as_bctr();
-+  }
-+  void branch(const Register reg) {
-+    xs_mtctr(reg);
-+    as_bctr();
-+  }
-+
-+  void jump(Label* label) {  // Unconditional jump: short b if bound and in range, load+bctr if bound and far, patchable placeholder if unbound.
-+    if (label->bound()) {
-+      // Open the no-pool window BEFORE computing the displacement. The
-+      // enterNoPool() call itself can trigger a pool flush, which advances
-+      // currentOffset(). Computing the displacement against the pre-flush
-+      // offset and then emitting the b at the post-flush offset would land
-+      // the branch (poolSize) bytes past the intended target.
-+      m_buffer.enterNoPool(2);  // Two slots: the b and the nop that follows it.
-+      int32_t offset = label->offset() - currentOffset();
-+      if (JOffImm26::IsInRange(offset)) {
-+        as_b(offset);
-+        writeInst(NopInst);
-+        m_buffer.leaveNoPool();
-+        return;
-+      }
-+      m_buffer.leaveNoPool();  // Nothing was emitted inside the window; continue with the long form.
-+      // Long jump to bound label.
-+      BufferOffset bo =
-+          emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
-+      xs_mtctr(SecondScratchReg);  // NOTE(review): unlike ma_b's long path, mtctr/bctr are outside any no-pool window — confirm a pool flush after the stanza is acceptable.
-+      as_bctr();
-+      addLongJump(bo, BufferOffset(label->offset()));  // Registers the stanza at bo for patching with the label's offset.
-+      return;
-+    }
-+    // Unbound label: emit trap-tagged stanza (10 slots).
-+    m_buffer.enterNoPool(kNoPoolPatchableBranchInsns);
-+    BufferOffset bo = xs_trap_tagged(BTag);  // Slot 0: tagged trap marking this as an unresolved jump.
-+    writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);  // Slot 1: previous use of this label, or INVALID_OFFSET if this is the first.
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);  // Slots 2-9: eight NOPs of padding.
-+    m_buffer.leaveNoPool();
-+    if (!oom()) {  // After OOM bo may not be valid, so the label is left untouched.
-+      label->use(bo.getOffset());
-+    }
-+  }
-+  void jump(Register reg) {
-+    xs_mtctr(reg);
-+    as_bctr();
-+  }
-+  void jump(const Address& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    loadPtr(address, scratch);
-+    xs_mtctr(scratch);
-+    as_bctr();
-+  }
-+  void jump(JitCode* code) { branch(code); }
-+  void jump(ImmPtr ptr) {
-+    BufferOffset bo =
-+        emitLoad64Stanza(SecondScratchReg, (uint64_t)uintptr_t(ptr.value));
-+    addPendingJump(bo, ptr, RelocationKind::HARDCODED);
-+    xs_mtctr(SecondScratchReg);
-+    as_bctr();
-+  }
-+  void jump(TrampolinePtr code) { jump(ImmPtr(code.value)); }
-+
-+  // Conditional branch to label. Assumes a compare instruction has already
-+  // been emitted that sets CR0. CondT is Condition or DoubleCondition.
-+  template <typename CondT>
-+  void ma_b(CondT cond, Label* label) {
-+    if constexpr (std::is_same_v<CondT, Condition>) {
-+      if (cond == Always) {  // Unconditional: defer entirely to jump().
-+        jump(label);
-+        return;
-+      }
-+    }
-+    if (label->bound()) {
-+      // Open the no-pool window BEFORE computing the displacement. Same
-+      // hazard as jump(): enterNoPool may itself flush a pending pool,
-+      // advancing currentOffset(); the bc must emit with a displacement
-+      // computed against the post-flush offset. Budget covers max 6
-+      // instructions: POWER8 Overflow XER ops (3) + cror (1) + bc (1) +
-+      // nop (1) for the worst-case DoubleCondition+Overflow short path.
-+      m_buffer.enterNoPool(6);
-+      // For DoubleCondition, as_bc emits cror/crandc before the bc
-+      // instruction, advancing currentOffset() by 4. Account for this
-+      // in the offset calculation.
-+      int32_t crAdjust = 0;
-+      if constexpr (std::is_same_v<CondT, DoubleCondition>) {
-+        crAdjust = -(int32_t)sizeof(uint32_t);
-+      }
-+      int32_t offset = label->offset() - currentOffset() + crAdjust;  // NOTE(review): only the cror is compensated; confirm as_bc handles the up-to-3 XER insns emitted before bc for Overflow on POWER8.
-+      if (BOffImm16::IsInRange(offset)) {
-+        as_bc((int16_t)offset, cond);
-+        writeInst(NopInst);
-+        m_buffer.leaveNoPool();
-+        return;
-+      }
-+      m_buffer.leaveNoPool();  // Nothing was emitted inside the short window.
-+      // Long conditional branch for bound label.
-+      // XER ops(0-3) + cror(0-1) + bc(1) + stanza(8) + mtctr(1) + bctr(1).
-+      // P8 Overflow: mfxer+rlwinm+mtcrf+bc+stanza+mtctr+bctr = 14 max. NOTE(review): 14 assumes XER ops and cror never co-occur, unlike the short-path comment — confirm.
-+      m_buffer.enterNoPool(kNoPoolCondLongBranchInsnsP8Max);
-+      as_bc((int16_t)44, InvertCondition(cond));  // 44 bytes = 11 insns: the bc itself + stanza(8) + mtctr + bctr, i.e. skip the long jump when cond fails.
-+      BufferOffset boLoad =
-+          emitLoad64Stanza(SecondScratchReg, LabelBase::INVALID_OFFSET);
-+      xs_mtctr(SecondScratchReg);
-+      as_bctr();
-+      m_buffer.leaveNoPool();
-+      addLongJump(boLoad, BufferOffset(label->offset()));  // Registers the stanza for patching with the label's offset.
-+      return;
-+    }
-+    // Forward reference: emit BCTag stanza.
-+    // XER ops(0-3) + cror(0-1) + bc(1) + trap_tagged(1) + chain(1) + 8 NOPs.
-+    // P8 Overflow: mfxer+rlwinm+mtcrf+bc+trap+chain+8NOPs = 14 max.
-+    m_buffer.enterNoPool(kNoPoolCondLongBranchInsnsP8Max);
-+    as_bc((int16_t)44, InvertCondition(cond));  // 44 bytes = the bc itself + trap + chain + 8 NOPs: skips the placeholder when cond fails.
-+    BufferOffset bo = xs_trap_tagged(BCTag);  // Tagged trap marking an unresolved conditional branch.
-+    writeInst(label->used() ? label->offset() : LabelBase::INVALID_OFFSET);  // Chain word: previous use of this label, or INVALID_OFFSET if this is the first.
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);
-+    writeInst(NopInst);  // Eight NOPs of padding in total.
-+    m_buffer.leaveNoPool();
-+    if (!oom()) {  // After OOM bo may not be valid, so the label is left untouched.
-+      label->use(bo.getOffset());
-+    }
-+  }
-+
-+  // Set dest = 1 if CR0 satisfies cond, else dest = 0.
-+  // POWER10: setbc/setbcr (1 insn). P8/P9: isel-based path with the
-+  // r0-as-zero trick on the BranchOnClear half.
-+  void ma_cmp_set(Register dest, Condition cond) {
-+    uint32_t base = uint32_t(cond) & 0xff;  // Low byte of the condition encoding.
-+    uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;  // Same encoding with the branch option forced to BranchOnSet.
-+    if (HasPOWER10()) {
-+      if ((base & BranchOptionMask) == BranchOnSet) {
-+        as_setbc(dest, setbase, cr0);
-+      } else {
-+        as_setbcr(dest, setbase, cr0);  // BranchOnClear: the reversed form.
-+      }
-+      return;
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    xs_li(scratch, 1);  // scratch holds the constant 1 for the isel forms below.
-+    if ((base & BranchOptionMask) == BranchOnSet) {
-+      xs_li(dest, 0);
-+      as_isel(dest, scratch, dest, setbase, cr0);  // dest = bit set ? 1 : 0.
-+    } else {
-+      as_isel0(dest, r0, scratch, setbase, cr0);  // dest = bit set ? 0 (r0-as-zero) : 1.
-+    }
-+  }
-+
-+  // Set dest = 1 if CR0 (from a preceding FP compare) satisfies cond, else
-+  // dest = 0, with explicit handling of the unordered (NaN) outcome via the
-+  // SO bit. POWER10 uses setbc/setbcr for the base result; P8/P9 use isel.
-+  void ma_cmp_set_dbl(Register dest, DoubleCondition cond) {
-+    uint32_t base = uint32_t(cond) & 0xff;
-+    bool hasUnorderedFlag = uint32_t(cond) & DoubleConditionUnordered;
-+    uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    if (HasPOWER10()) {
-+      if ((base & BranchOptionMask) == BranchOnSet) {
-+        as_setbc(dest, setbase, cr0);
-+      } else {
-+        as_setbcr(dest, setbase, cr0);
-+      }
-+      // Only the unordered fixup below reads scratch (as the constant 1).
-+      // The BranchOnClear fixup uses r0-as-zero and dest alone, so loading
-+      // scratch for it would emit a dead instruction.
-+      if (hasUnorderedFlag) {
-+        xs_li(scratch, 1);
-+      }
-+    } else {
-+      xs_li(scratch, 1);
-+      if ((base & BranchOptionMask) == BranchOnSet) {
-+        xs_li(dest, 0);
-+        as_isel(dest, scratch, dest, setbase, cr0);
-+      } else {
-+        as_isel0(dest, r0, scratch, setbase, cr0);
-+      }
-+    }
-+    if (hasUnorderedFlag) {
-+      // Condition includes unordered (NaN): force dest=1 when SO is set.
-+      // isel dest, scratch(=1), dest, SO
-+      as_isel(dest, scratch, dest, uint16_t(SOBit), cr0);
-+    } else if ((base & BranchOptionMask) != BranchOnSet &&
-+               cond != DoubleOrdered) {
-+      // Ordered comparison that negates a CR bit (BranchOnClear): NaN
-+      // produces all-zero LT/GT/EQ bits which makes the negation return
-+      // true.  Fix by forcing dest=0 when SO is set.
-+      as_isel0(dest, r0, dest, uint16_t(SOBit), cr0);
-+    }
-+  }
-+
-+  // Conditional move: if CR0 satisfies cond, dest = src; otherwise dest is unchanged.
-+  void ma_cmp_move(Register dest, Register src, Condition cond) {
-+    uint32_t base = uint32_t(cond) & 0xff;  // Low byte of the condition encoding.
-+    uint32_t setbase = (base & ~BranchOptionMask) | BranchOnSet;  // Same encoding with the branch option forced to BranchOnSet.
-+    if ((base & BranchOptionMask) == BranchOnSet) {
-+      as_isel(dest, src, dest, setbase, cr0);  // bit set -> src, else keep dest.
-+    } else {
-+      as_isel(dest, dest, src, setbase, cr0);  // BranchOnClear: operands swapped so bit clear -> src.
-+    }
-+  }
-+
-+  // If cond == 0, move src to dst; otherwise dst is unchanged. The only
-+  // callers are wasm select, whose condition is a 32-bit value: compare
-+  // only its low 32 bits with cmpwi so high-bit garbage (e.g. under register
-+  // pressure) does not make a zero condition read as non-zero.
-+  void moveIfZero(Register dst, Register src, Register cond) {
-+    as_cmpwi(cond, 0);  // Overwrites CR0; any earlier compare result there is lost.
-+    as_isel(dst, src, dst, Equal, cr0);
-+  }
-+
-+  void ma_add32TestCarry(Condition cond, Register rd, Register rs, Imm32 imm,
-+                         Label* overflow);
-+  void ma_addPtrTestCarry(Condition cond, Register rd, Register rs, ImmWord imm,
-+                          Label* overflow);
-+
-+  // Issue the correct compare instruction for the given condition and
-+  // operand sizes. Returns the condition to use with ma_b or ma_cmp_set:
-+  // the input with the ConditionUnsigned and ConditionZero flags stripped.
-+  Condition ma_cmp(Register lhs, Register rhs, Condition cond,
-+                   bool is32bit = false) {
-+    Condition base =
-+        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
-+    bool isUnsigned = (cond & ConditionUnsigned) != 0;
-+    // ConditionZero-flagged conditions (Signed, NotSigned, Zero, NonZero)
-+    // test a single register against zero, not two registers against each
-+    // other. Compare against immediate 0; rhs is ignored on this path.
-+    if ((cond & ConditionZero) != 0) {
-+      if (is32bit) {
-+        as_cmpwi(lhs, 0);
-+      } else {
-+        as_cmpdi(lhs, 0);
-+      }
-+      return base;
-+    }
-+    if (is32bit) {
-+      if (isUnsigned) {
-+        as_cmplw(lhs, rhs);  // 32-bit unsigned (logical) compare.
-+      } else {
-+        as_cmpw(lhs, rhs);  // 32-bit signed compare.
-+      }
-+    } else {
-+      if (isUnsigned) {
-+        as_cmpld(lhs, rhs);  // 64-bit unsigned (logical) compare.
-+      } else {
-+        as_cmpd(lhs, rhs);  // 64-bit signed compare.
-+      }
-+    }
-+    return base;
-+  }
-+
-+  Condition ma_cmp(Register lhs, Imm32 rhs, Condition cond,  // Compare lhs with a 32-bit immediate; returns cond with the Unsigned/Zero flags stripped.
-+                   bool is32bit = false) {
-+    Condition base =
-+        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
-+    bool isUnsigned = (cond & ConditionUnsigned) != 0;
-+    if (isUnsigned) {
-+      if (is_uintN(rhs.value, 16)) {  // Negative values convert to large uint64_t and fall through to the register path.
-+        if (is32bit) {
-+          as_cmplwi(lhs, rhs.value);
-+        } else {
-+          as_cmpldi(lhs, rhs.value);
-+        }
-+        return base;
-+      }
-+    } else {
-+      if (is_intN(rhs.value, 16)) {
-+        if (is32bit) {
-+          as_cmpwi(lhs, rhs.value);
-+        } else {
-+          as_cmpdi(lhs, rhs.value);
-+        }
-+        return base;
-+      }
-+    }
-+    // Immediate doesn't fit — materialize into scratch and compare.
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(rhs, scratch);
-+    return ma_cmp(lhs, scratch, cond, is32bit);  // Passes the original cond (flags intact) to the register overload.
-+  }
-+
-+  Condition ma_cmp(Register lhs, ImmWord rhs, Condition cond) {
-+    Condition base =
-+        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
-+    bool isUnsigned = (cond & ConditionUnsigned) != 0;
-+    if (isUnsigned) {
-+      if (is_uintN(rhs.value, 16)) {
-+        as_cmpldi(lhs, rhs.value);
-+        return base;
-+      }
-+    } else {
-+      if (is_intN(rhs.value, 16)) {
-+        as_cmpdi(lhs, rhs.value);
-+        return base;
-+      }
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(rhs, scratch);
-+    return ma_cmp(lhs, scratch, cond);
-+  }
-+
-+  Condition ma_cmp(Register lhs, ImmPtr rhs, Condition cond) {
-+    return ma_cmp(lhs, ImmWord(uintptr_t(rhs.value)), cond);
-+  }
-+
-+  Condition ma_cmp(Register lhs, ImmGCPtr rhs, Condition cond) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(rhs, scratch);
-+    return ma_cmp(lhs, scratch, cond);
-+  }
-+
-+  Condition ma_cmp(Register lhs, ImmTag rhs, Condition cond) {  // Compare lhs with a value tag; returns cond with the Unsigned/Zero flags stripped.
-+    // Tag values on PUNBOX64 are 17-bit (0x1FFF0+), too large for 16-bit
-+    // signed or unsigned immediates, so the tag always goes through a scratch.
-+    Condition base =
-+        static_cast<Condition>(cond & ~(ConditionUnsigned | ConditionZero));
-+    bool isUnsigned = (cond & ConditionUnsigned) != 0;
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(Imm32(rhs.value), scratch);
-+    if (isUnsigned) {
-+      as_cmpld(lhs, scratch);  // Always a 64-bit compare; there is no is32bit option on this overload.
-+    } else {
-+      as_cmpd(lhs, scratch);
-+    }
-+    return base;
-+  }
-+
-+  // Compare a tag register against an ImmTag constant and branch, WITHOUT
-+  // acquiring a scratch register.  Uses xoris+cmplwi which MODIFIES tagReg.
-+  // Only safe when tagReg is a scratch register owned by the caller.
-+  void branchTestTag(Condition cond, Register tagReg, ImmTag tag, Label* label) {
-+    MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+    uint32_t t = tag.value;
-+    as_xoris(tagReg, tagReg, t >> 16);  // XOR with the tag's upper half; tagReg is overwritten.
-+    as_cmplwi(tagReg, t & 0xFFFF);  // Then compare what remains against the tag's low 16 bits.
-+    Condition c = (cond == Equal) ? Equal : NotEqual;  // Equals cond whenever the assertion above holds.
-+    ma_b(c, label);
-+  }
-+
-+  void ma_mod_mask(Register src, Register dest, Register hold, Register remain,
-+                   int32_t shift, Label* negZero = nullptr);
-+
-+  void nop() { writeInst(NopInst); }
-+  void breakpoint(uint32_t value = 0) { xs_trap(); }  // Emits a trap; |value| is accepted but not used.
-+
-+  inline void retn(Imm32 n);
-+
-+  // ===============================================================
-+  // Stack operations
-+
-+  void push(Imm32 imm) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    push(scratch);
-+  }
-+  void push(ImmWord imm) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(imm, scratch);
-+    push(scratch);
-+  }
-+  void push(ImmGCPtr imm) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(imm, scratch);
-+    push(scratch);
-+  }
-+  void push(const Address& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    loadPtr(address, scratch);
-+    push(scratch);
-+  }
-+  void push(Register reg) { as_stdu(reg, StackPointer, -8); }
-+  void push(FloatRegister reg) {
-+    // stfdu/stfsu fuses the SP decrement and the FP store: EA=SP-8,
-+    // MEM[EA]=reg, SP=EA. 1 insn instead of addi+stfd/stfs.
-+    if (reg.isSingle()) {
-+      as_stfsu(reg, StackPointer, -8);
-+    } else {
-+      as_stfdu(reg, StackPointer, -8);
-+    }
-+  }
-+  void pop(Register reg) {
-+    as_ld(reg, StackPointer, 0);
-+    as_addi(StackPointer, StackPointer, 8);
-+  }
-+  void pop(FloatRegister reg) {
-+    if (reg.isSingle()) {
-+      as_lfs(reg, StackPointer, 0);
-+    } else {
-+      as_lfd(reg, StackPointer, 0);
-+    }
-+    as_addi(StackPointer, StackPointer, 8);
-+  }
-+
-+  CodeOffset pushWithPatch(ImmWord imm) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    CodeOffset offset = movWithPatch(imm, scratch);
-+    push(scratch);
-+    return offset;
-+  }
-+  CodeOffset movWithPatch(ImmWord imm, Register dest) {
-+    BufferOffset bo = emitLoad64Stanza(dest, (uint64_t)imm.value);
-+    return CodeOffset(bo.getOffset());
-+  }
-+  CodeOffset movWithPatch(ImmPtr imm, Register dest) {
-+    return movWithPatch(ImmWord(uintptr_t(imm.value)), dest);
-+  }
-+
-+  // ===============================================================
-+  // Tag/unbox operations
-+
-+  void splitTag(Register src, Register dest) {
-+    x_srdi(dest, src, JSVAL_TAG_SHIFT);
-+  }
-+  void splitTag(const ValueOperand& operand, Register dest) {
-+    splitTag(operand.valueReg(), dest);
-+  }
-+  void splitTagForTest(const ValueOperand& value, ScratchTagScope& tag) {
-+    splitTag(value, tag);
-+  }
-+
-+  void unboxNonDouble(const ValueOperand& operand, Register dest,
-+                      JSValueType type) {
-+    unboxNonDouble(operand.valueReg(), dest, type);
-+  }
-+  template <typename T>
-+  void unboxNonDouble(T src, Register dest, JSValueType type) {
-+    MOZ_ASSERT(type != JSVAL_TYPE_DOUBLE);
-+    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
-+      load32(src, dest);
-+      return;
-+    }
-+    loadPtr(src, dest);
-+    unboxNonDouble(dest, dest, type);
-+  }
-+  void unboxNonDouble(Register src, Register dest, JSValueType type) {
-+    MOZ_ASSERT(type != JSVAL_TYPE_DOUBLE);
-+    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN) {
-+      as_extsw(dest, src);
-+      return;
-+    }
-+    // Extract the payload (lower 47 bits) by clearing the tag.
-+    // This avoids acquiring a scratch register, preventing pool exhaustion
-+    // when called from nested scratch scopes (e.g., ScratchTagScope →
-+    // branchTestStringTruthy → unboxString → here).
-+    // rldicl dest, src, 0, 17 — clear upper 17 bits (tag), keep lower 47.
-+    as_rldicl(dest, src, 0, 17);
-+  }
-+  void unboxGCThingForGCBarrier(const Address& src, Register dest) {
-+    loadPtr(src, dest);
-+    // Clear tag bits (top 17 bits on 64-bit).
-+    as_rldicl(dest, dest, 0, 64 - JSVAL_TAG_SHIFT);
-+  }
-+  void unboxGCThingForGCBarrier(const ValueOperand& src, Register dest) {
-+    as_rldicl(dest, src.valueReg(), 0, 64 - JSVAL_TAG_SHIFT);
-+  }
-+  void unboxWasmAnyRefGCThingForGCBarrier(const Address& src, Register dest) {
-+    static_assert(wasm::AnyRef::TagShift == 2);
-+    loadPtr(src, dest);
-+    as_rldicr(dest, dest, 0, 61);
-+  }
-+  void getGCThingValueChunk(const Address& src, Register dest) {
-+    loadPtr(src, dest);
-+    as_rldicl(dest, dest, 0, 17);
-+    as_rldicr(dest, dest, 0, 43);
-+  }
-+  void getGCThingValueChunk(const ValueOperand& src, Register dest) {
-+    as_rldicl(dest, src.valueReg(), 0, 17);
-+    as_rldicr(dest, dest, 0, 43);
-+  }
-+
-+  void boxDouble(FloatRegister src, const ValueOperand& dest, FloatRegister) {
-+    as_mfvsrd(dest.valueReg(), src);
-+  }
-+  void boxNonDouble(JSValueType type, Register src, const ValueOperand& dest) {
-+    boxValue(type, src, dest.valueReg());
-+  }
-+  void boxNonDouble(Register type, Register src, const ValueOperand& dest) {
-+    boxValue(type, src, dest.valueReg());
-+  }
-+  void unboxInt32(const ValueOperand& operand, Register dest) {
-+    as_extsw(dest, operand.valueReg());
-+  }
-+  void unboxInt32(const Address& src, Register dest) { load32(src, dest); }
-+  void unboxInt32(const BaseIndex& src, Register dest) { load32(src, dest); }
-+  void unboxBoolean(const ValueOperand& operand, Register dest) {
-+    as_extsw(dest, operand.valueReg());
-+  }
-+  void unboxBoolean(const Address& src, Register dest) { load32(src, dest); }
-+  void unboxBoolean(const BaseIndex& src, Register dest) { load32(src, dest); }
-+  void unboxDouble(const ValueOperand& operand, FloatRegister dest) {
-+    as_mtvsrd(dest, operand.valueReg());
-+  }
-+  void unboxDouble(const Address& src, FloatRegister dest) {
-+    loadDouble(src, dest);
-+  }
-+  void unboxDouble(const BaseIndex& src, FloatRegister dest) {
-+    loadDouble(src, dest);
-+  }
-+  void unboxString(const ValueOperand& operand, Register dest) {
-+    unboxNonDouble(operand, dest, JSVAL_TYPE_STRING);
-+  }
-+  void unboxString(const Address& src, Register dest) {
-+    unboxNonDouble(src, dest, JSVAL_TYPE_STRING);
-+  }
-+  void unboxSymbol(const ValueOperand& operand, Register dest) {
-+    unboxNonDouble(operand, dest, JSVAL_TYPE_SYMBOL);
-+  }
-+  void unboxSymbol(const Address& src, Register dest) {
-+    unboxNonDouble(src, dest, JSVAL_TYPE_SYMBOL);
-+  }
-+  void unboxBigInt(const ValueOperand& operand, Register dest) {
-+    unboxNonDouble(operand, dest, JSVAL_TYPE_BIGINT);
-+  }
-+  void unboxBigInt(const Address& src, Register dest) {
-+    unboxNonDouble(src, dest, JSVAL_TYPE_BIGINT);
-+  }
-+  void unboxObject(const ValueOperand& src, Register dest) {
-+    unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
-+  }
-+  void unboxObject(const Address& src, Register dest) {
-+    unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
-+  }
-+  void unboxObject(const BaseIndex& src, Register dest) {
-+    unboxNonDouble(src, dest, JSVAL_TYPE_OBJECT);
-+  }
-+  void unboxValue(const ValueOperand& src, AnyRegister dest, JSValueType type) {
-+    if (dest.isFloat()) {
-+      unboxDouble(src, dest.fpu());
-+    } else {
-+      unboxNonDouble(src, dest.gpr(), type);
-+    }
-+  }
-+  void unboxObjectOrNull(const Address& src, Register dest) {
-+    loadPtr(src, dest);
-+    // Object pointers have the object tag in high bits; null has a different
-+    // tag. Clear the top bits to get either a valid pointer or zero.
-+    as_rldicl(dest, dest, 0, 64 - JSVAL_TAG_SHIFT);
-+  }
-+
-+  void tagValue(JSValueType type, Register payload, ValueOperand dest) {
-+    MOZ_ASSERT(type != JSVAL_TYPE_UNDEFINED && type != JSVAL_TYPE_NULL);
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != payload && scratch != dest.valueReg());
-+    tagValueWithScratch(type, payload, dest, scratch);
-+  }
-+  void tagValueWithScratch(JSValueType type, Register payload,
-+                           ValueOperand dest, Register scratch) {
-+    movePtr(ImmShiftedTag(type), scratch);
-+    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN ||
-+        type == JSVAL_TYPE_MAGIC) {
-+      if (payload != dest.valueReg()) {
-+        movePtr(payload, dest.valueReg());
-+      }
-+      as_rldicl(dest.valueReg(), dest.valueReg(), 0, 32);
-+      as_or_(dest.valueReg(), dest.valueReg(), scratch);
-+    } else {
-+      if (payload != dest.valueReg()) {
-+        movePtr(payload, dest.valueReg());
-+      }
-+      as_or_(dest.valueReg(), dest.valueReg(), scratch);
-+    }
-+  }
-+  void boxValue(JSValueType type, Register src, Register dest) {
-+    MOZ_ASSERT(src != dest);
-+    MOZ_ASSERT(type != JSVAL_TYPE_UNDEFINED && type != JSVAL_TYPE_NULL);
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    boxValueWithScratch(type, src, dest, scratch);
-+  }
-+  void boxValueWithScratch(JSValueType type, Register src, Register dest,
-+                           Register scratch) {
-+    if (type == JSVAL_TYPE_INT32 || type == JSVAL_TYPE_BOOLEAN ||
-+        type == JSVAL_TYPE_MAGIC) {
-+      as_rldicl(dest, src, 0, 32);
-+      movePtr(ImmShiftedTag(type), scratch);
-+      as_or_(dest, dest, scratch);
-+    } else {
-+      movePtr(ImmShiftedTag(type), scratch);
-+      xs_mr(dest, src);
-+      as_or_(dest, dest, scratch);
-+    }
-+  }
-+  void boxValue(Register type, Register src, Register dest) {
-+    MOZ_ASSERT(src != dest);
-+
-+#ifdef DEBUG
-+    Label done, isNullOrUndefined, isBoolean, isInt32OrMagic;
-+
-+    // Use ma_cmp + ma_b instead of asMasm().branch32() because
-+    // MacroAssembler is not yet fully defined at this point.
-+    Condition cond;
-+    cond = ma_cmp(type, Imm32(JSVAL_TYPE_NULL), Equal, true);
-+    ma_b(cond, &isNullOrUndefined);
-+    cond = ma_cmp(type, Imm32(JSVAL_TYPE_UNDEFINED), Equal, true);
-+    ma_b(cond, &isNullOrUndefined);
-+    cond = ma_cmp(type, Imm32(JSVAL_TYPE_BOOLEAN), Equal, true);
-+    ma_b(cond, &isBoolean);
-+    cond = ma_cmp(type, Imm32(JSVAL_TYPE_INT32), Equal, true);
-+    ma_b(cond, &isInt32OrMagic);
-+    cond = ma_cmp(type, Imm32(JSVAL_TYPE_MAGIC), Equal, true);
-+    ma_b(cond, &isInt32OrMagic);
-+    // GCThing types aren't supported, because as_rldicl truncates
-+    // payloads above UINT32_MAX.
-+    breakpoint();
-+    {
-+      bind(&isNullOrUndefined);
-+
-+      // Ensure no payload for null and undefined.
-+      cond = ma_cmp(src, ImmWord(0), Equal);
-+      ma_b(cond, &done);
-+      breakpoint();
-+    }
-+    {
-+      bind(&isBoolean);
-+
-+      // Ensure boolean values are either 0 or 1.
-+      cond = ma_cmp(src, Imm32(1), BelowOrEqual, true);
-+      ma_b(cond, &done);
-+      breakpoint();
-+    }
-+    {
-+      bind(&isInt32OrMagic);
-+
-+      // Ensure |src| is sign-extended.
-+      UseScratchRegisterScope debugTemps(*this);
-+      Register debugScratch = debugTemps.Acquire();
-+      as_extsw(debugScratch, src);
-+      cond = ma_cmp(src, debugScratch, Equal);
-+      ma_b(cond, &done);
-+      breakpoint();
-+    }
-+    bind(&done);
-+#endif
-+
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != dest && scratch != src && scratch != type);
-+    // Build tag: (type | JSVAL_TAG_MAX_DOUBLE) << JSVAL_TAG_SHIFT
-+    move32(Imm32(JSVAL_TAG_MAX_DOUBLE), scratch);
-+    as_or_(scratch, scratch, type);
-+    x_sldi(scratch, scratch, JSVAL_TAG_SHIFT);
-+    // Insert 32-bit payload.
-+    as_rldicl(dest, src, 0, 32);
-+    as_or_(dest, dest, scratch);
-+  }
-+
-+  // ===============================================================
-+  // Value store/load/push/pop
-+
-+  void storeValue(ValueOperand val, const Address& dest) {
-+    storePtr(val.valueReg(), dest);
-+  }
-+  void storeValue(ValueOperand val, const BaseIndex& dest) {
-+    storePtr(val.valueReg(), dest);
-+  }
-+  void storeValue(JSValueType type, Register reg, Address dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(dest.base != scratch);
-+    boxValue(type, reg, scratch);
-+    storePtr(scratch, dest);
-+  }
-+  void storeValue(JSValueType type, Register reg, BaseIndex dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(dest.base != scratch);
-+    boxValue(type, reg, scratch);
-+    storePtr(scratch, dest);
-+  }
-+  void storeValue(const Value& val, Address dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(dest.base != scratch);
-+    if (val.isGCThing()) {
-+      CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
-+      writeDataRelocation(off, val);
-+    } else {
-+      movePtr(ImmWord(val.asRawBits()), scratch);
-+    }
-+    storePtr(scratch, dest);
-+  }
-+  void storeValue(const Value& val, BaseIndex dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(dest.base != scratch);
-+    if (val.isGCThing()) {
-+      CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
-+      writeDataRelocation(off, val);
-+    } else {
-+      movePtr(ImmWord(val.asRawBits()), scratch);
-+    }
-+    storePtr(scratch, dest);
-+  }
-+  void storeValue(const Address& src, const Address& dest, Register temp) {
-+    loadPtr(src, temp);
-+    storePtr(temp, dest);
-+  }
-+
-+  void storePrivateValue(Register src, const Address& dest) {
-+    storePtr(src, dest);
-+  }
-+  void storePrivateValue(ImmGCPtr imm, const Address& dest) {
-+    storePtr(imm, dest);
-+  }
-+
-+  void loadValue(Address src, ValueOperand val) {
-+    loadPtr(src, val.valueReg());
-+  }
-+  void loadValue(const BaseIndex& src, ValueOperand val) {
-+    loadPtr(src, val.valueReg());
-+  }
-+  void loadUnalignedValue(const Address& src, ValueOperand dest) {
-+    loadPtr(src, dest.valueReg());
-+  }
-+
-+  void pushValue(ValueOperand val) { push(val.valueReg()); }
-+  void popValue(ValueOperand val) { pop(val.valueReg()); }
-+  void pushValue(const Value& val) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    if (val.isGCThing()) {
-+      CodeOffset off = movWithPatch(ImmWord(val.asRawBits()), scratch);
-+      writeDataRelocation(off, val);
-+    } else {
-+      movePtr(ImmWord(val.asRawBits()), scratch);
-+    }
-+    push(scratch);
-+  }
-+  void pushValue(JSValueType type, Register reg) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    boxValue(type, reg, scratch);
-+    push(scratch);
-+  }
-+  void pushValue(const Address& addr) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    loadPtr(addr, scratch);
-+    push(scratch);
-+  }
-+  void pushValue(const BaseIndex& addr, Register scratch) {
-+    loadPtr(addr, scratch);
-+    push(scratch);
-+  }
-+
-+  // ===============================================================
-+  // Load instructions
-+
-+  FaultingCodeOffset load8SignExtend(const Address& address, Register dest) {
-+    FaultingCodeOffset fco;
-+    if (is_intN(address.offset, 16)) {
-+      fco = FaultingCodeOffset(
-+          as_lbz(dest, address.base, address.offset).getOffset());
-+    } else {
-+      UseScratchRegisterScope temps(*this);
-+      Register scratch = temps.Acquire();
-+      MOZ_ASSERT(scratch != dest);
-+      movePtr(ImmWord(address.offset), scratch);
-+      fco =
-+          FaultingCodeOffset(as_lbzx(dest, address.base, scratch).getOffset());
-+    }
-+    as_extsb(dest, dest);
-+    return fco;
-+  }
-+  FaultingCodeOffset load8SignExtend(const BaseIndex& src, Register dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    FaultingCodeOffset fco;
-+    if (is_intN(src.offset, 16)) {
-+      fco = FaultingCodeOffset(as_lbz(dest, scratch, src.offset).getOffset());
-+    } else {
-+      MOZ_ASSERT(scratch != dest);
-+      movePtr(ImmWord(src.offset), dest);
-+      fco = FaultingCodeOffset(as_lbzx(dest, scratch, dest).getOffset());
-+    }
-+    as_extsb(dest, dest);
-+    return fco;
-+  }
-+  FaultingCodeOffset load8ZeroExtend(const Address& address, Register dest) {
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_lbz(dest, address.base, address.offset).getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_lbzx(dest, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset load8ZeroExtend(const BaseIndex& src, Register dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    if (is_intN(src.offset, 16)) {
-+      return FaultingCodeOffset(as_lbz(dest, scratch, src.offset).getOffset());
-+    }
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(src.offset), dest);
-+    return FaultingCodeOffset(as_lbzx(dest, scratch, dest).getOffset());
-+  }
-+  FaultingCodeOffset load16SignExtend(const Address& address, Register dest) {
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_lha(dest, address.base, address.offset).getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_lhax(dest, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset load16SignExtend(const BaseIndex& src, Register dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    if (is_intN(src.offset, 16)) {
-+      return FaultingCodeOffset(as_lha(dest, scratch, src.offset).getOffset());
-+    }
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(src.offset), dest);
-+    return FaultingCodeOffset(as_lhax(dest, scratch, dest).getOffset());
-+  }
-+  template <typename S>
-+  void load16UnalignedSignExtend(const S& src, Register dest) {
-+    load16SignExtend(src, dest);
-+  }
-+  FaultingCodeOffset load16ZeroExtend(const Address& address, Register dest) {
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_lhz(dest, address.base, address.offset).getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_lhzx(dest, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset load16ZeroExtend(const BaseIndex& src, Register dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    if (is_intN(src.offset, 16)) {
-+      return FaultingCodeOffset(as_lhz(dest, scratch, src.offset).getOffset());
-+    }
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(src.offset), dest);
-+    return FaultingCodeOffset(as_lhzx(dest, scratch, dest).getOffset());
-+  }
-+  template <typename S>
-+  void load16UnalignedZeroExtend(const S& src, Register dest) {
-+    load16ZeroExtend(src, dest);
-+  }
-+
-+  FaultingCodeOffset load32(const Address& address, Register dest) {
-+    // lwa is DS-form (14-bit displacement × 4 = 16-bit-signed effective
-+    // range, 4-byte alignment required). lwax is X-form indexed, no
-+    // alignment constraint. Both sign-extend in one instruction; only
-+    // the misaligned 16-bit-fitting case still needs lwz + extsw.
-+    if (is_intN(address.offset, 16) && (address.offset & 3) == 0) {
-+      return FaultingCodeOffset(
-+          as_lwa(dest, address.base, address.offset).getOffset());
-+    }
-+    if (is_intN(address.offset, 16)) {
-+      FaultingCodeOffset fco(
-+          as_lwz(dest, address.base, address.offset).getOffset());
-+      as_extsw(dest, dest);
-+      return fco;
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_lwax(dest, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset load32(const BaseIndex& address, Register dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(address, scratch);
-+    if (is_intN(address.offset, 16) && (address.offset & 3) == 0) {
-+      return FaultingCodeOffset(
-+          as_lwa(dest, scratch, address.offset).getOffset());
-+    }
-+    if (is_intN(address.offset, 16)) {
-+      FaultingCodeOffset fco(as_lwz(dest, scratch, address.offset).getOffset());
-+      as_extsw(dest, dest);
-+      return fco;
-+    }
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(address.offset), dest);
-+    return FaultingCodeOffset(as_lwax(dest, scratch, dest).getOffset());
-+  }
-+  void load32(AbsoluteAddress address, Register dest) {
-+    movePtr(ImmWord((uintptr_t)address.addr), dest);
-+    as_lwa(dest, dest, 0);
-+  }
-+  void load32(wasm::SymbolicAddress address, Register dest) {
-+    movePtr(address, dest);
-+    as_lwa(dest, dest, 0);
-+  }
-+  template <typename S>
-+  void load32Unaligned(const S& src, Register dest) {
-+    load32(src, dest);
-+  }
-+
-+  FaultingCodeOffset load64(const Address& address, Register64 dest) {
-+    return loadPtr(address, dest.reg);
-+  }
-+  FaultingCodeOffset load64(const BaseIndex& address, Register64 dest) {
-+    return loadPtr(address, dest.reg);
-+  }
-+  template <typename S>
-+  void load64Unaligned(const S& src, Register64 dest) {
-+    load64(src, dest);
-+  }
-+
-+  FaultingCodeOffset loadPtr(const Address& address, Register dest) {
-+    // as_ld (DS-form) requires 4-byte aligned offset.
-+    if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
-+      return FaultingCodeOffset(
-+          as_ld(dest, address.base, address.offset).getOffset());
-+    }
-+    if (HasPOWER10() && is_intN((intptr_t)address.offset, 34)) {
-+      return FaultingCodeOffset(
-+          as_pld(dest, address.base, (int64_t)address.offset, /*R=*/false)
-+              .getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_ldx(dest, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset loadPtr(const BaseIndex& src, Register dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    if (is_intN(src.offset, 16) && !(src.offset & 0x3)) {
-+      return FaultingCodeOffset(as_ld(dest, scratch, src.offset).getOffset());
-+    }
-+    MOZ_ASSERT(scratch != dest);
-+    movePtr(ImmWord(src.offset), dest);
-+    return FaultingCodeOffset(as_ldx(dest, scratch, dest).getOffset());
-+  }
-+  void loadPtr(AbsoluteAddress address, Register dest) {
-+    movePtr(ImmWord((uintptr_t)address.addr), dest);
-+    as_ld(dest, dest, 0);
-+  }
-+  void loadPtr(wasm::SymbolicAddress address, Register dest) {
-+    movePtr(address, dest);
-+    as_ld(dest, dest, 0);
-+  }
-+
-+  void loadPrivate(const Address& address, Register dest) {
-+    loadPtr(address, dest);
-+  }
-+
-+  FaultingCodeOffset loadDouble(const Address& addr, FloatRegister dest) {
-+    if (is_intN(addr.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_lfd(dest, addr.base, addr.offset).getOffset());
-+    }
-+    if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
-+      return FaultingCodeOffset(
-+          as_plfd(dest, addr.base, (int64_t)addr.offset, /*R=*/false)
-+              .getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(addr.offset), scratch);
-+    return FaultingCodeOffset(as_lfdx(dest, addr.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset loadDouble(const BaseIndex& src, FloatRegister dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    if (is_intN(src.offset, 16)) {
-+      return FaultingCodeOffset(as_lfd(dest, scratch, src.offset).getOffset());
-+    }
-+    Register scratch2 = temps.Acquire();
-+    movePtr(ImmWord(src.offset), scratch2);
-+    return FaultingCodeOffset(as_lfdx(dest, scratch, scratch2).getOffset());
-+  }
-+  FaultingCodeOffset loadFloat32(const Address& addr, FloatRegister dest) {
-+    if (is_intN(addr.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_lfs(dest, addr.base, addr.offset).getOffset());
-+    }
-+    if (HasPOWER10() && is_intN((intptr_t)addr.offset, 34)) {
-+      return FaultingCodeOffset(
-+          as_plfs(dest, addr.base, (int64_t)addr.offset, /*R=*/false)
-+              .getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(addr.offset), scratch);
-+    return FaultingCodeOffset(as_lfsx(dest, addr.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset loadFloat32(const BaseIndex& src, FloatRegister dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(src, scratch);
-+    if (is_intN(src.offset, 16)) {
-+      return FaultingCodeOffset(as_lfs(dest, scratch, src.offset).getOffset());
-+    }
-+    Register scratch2 = temps.Acquire();
-+    movePtr(ImmWord(src.offset), scratch2);
-+    return FaultingCodeOffset(as_lfsx(dest, scratch, scratch2).getOffset());
-+  }
-+  // Load a FP constant into `dest`.
-+  //
-+  // +0.0 / +0.0f: `xxlxor dest, dest, dest` (1 insn). No register clobbers.
-+  //
-+  // POWER9 non-zero: constant pool load via `addpcis r16, hi; lfd/lfs fD,
-+  // lo(r16); nop`. 2 real insns + nop, no LR clobber, no Return Address
-+  // Stack corruption. lfs auto-expands single-precision to double, so no
-+  // separate xscvspdpn step. Clobbers r16 (SavedScratchRegister). Pool
-+  // entries are shared across duplicate constants.
-+  //
-+  // POWER8 non-zero: inline `movePtr + mtvsrd(+xscvspdpn)` path. We do NOT
-+  // use the bcl-based pool path on POWER8: bcl clobbers LR and corrupts
-+  // the Return Address Stack, which causes catastrophic mispredicts in
-+  // hot FP-constant loops (~200x slowdown observed on cmp-bitselect.js).
-+  //
-+  // Precondition: must not be called inside an `enterNoPool` region when
-+  // HasPOWER9() is true (the pool path calls `allocEntry` which asserts
-+  // `inhibitPools_ == 0`). Audit-verified that no such call site exists
-+  // today; the POWER8 inline path is unaffected.
-+  void loadConstantDouble(double dp, FloatRegister dest) {
-+    if (mozilla::IsPositiveZero(dp)) {
-+      as_xxlxor(dest, dest, dest);
-+      return;
-+    }
-+    if (HasPOWER9()) {
-+      loadFromPoolFloat64(dest, dp);
-+      return;
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    union {
-+      double d;
-+      uint64_t u;
-+    } u;
-+    u.d = dp;
-+    movePtr(ImmWord(u.u), scratch);
-+    as_mtvsrd(dest, scratch);
-+  }
-+  void loadConstantFloat32(float f, FloatRegister dest) {
-+    if (mozilla::IsPositiveZero(f)) {
-+      as_xxlxor(dest, dest, dest);
-+      return;
-+    }
-+    if (HasPOWER9()) {
-+      loadFromPoolFloat32(dest, f);
-+      return;
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    union {
-+      float f;
-+      uint32_t u;
-+    } u;
-+    u.f = f;
-+    movePtr(ImmWord(u.u), scratch);
-+    x_sldi(scratch, scratch, 32);
-+    as_mtvsrd(dest, scratch);
-+    as_xscvspdpn(dest, dest);
-+  }
-+
-+  void notBoolean(const ValueOperand& val) {
-+    as_xori(val.valueReg(), val.valueReg(), 1);
-+  }
-+
-+  [[nodiscard]] Register extractTag(const Address& address, Register scratch) {
-+    loadPtr(address, scratch);
-+    x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractTag(const BaseIndex& address,
-+                                    Register scratch) {
-+    if (scratch == r0) {
-+      // r0 cannot be used as a base register in D-form/X-form loads,
-+      // so we need a separate temp for the intermediate address.
-+      UseScratchRegisterScope temps(*this);
-+      Register base = temps.Acquire();
-+      computeScaledAddress(address, base);
-+      loadPtr(Address(base, address.offset), scratch);
-+    } else {
-+      // scratch is a pool register (r11/r12) or another GPR that can
-+      // serve as a base register, so reuse it for the address computation.
-+      computeScaledAddress(address, scratch);
-+      loadPtr(Address(scratch, address.offset), scratch);
-+    }
-+    x_srdi(scratch, scratch, JSVAL_TAG_SHIFT);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractTag(const ValueOperand& value,
-+                                    Register scratch) {
-+    splitTag(value, scratch);
-+    return scratch;
-+  }
-+
-+  [[nodiscard]] Register extractObject(const Address& address,
-+                                       Register scratch) {
-+    loadPtr(address, scratch);
-+    as_rldicl(scratch, scratch, 0, 64 - JSVAL_TAG_SHIFT);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractObject(const ValueOperand& value,
-+                                       Register scratch) {
-+    unboxObject(value, scratch);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractInt32(const ValueOperand& value,
-+                                      Register scratch) {
-+    unboxInt32(value, scratch);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractString(const ValueOperand& value,
-+                                       Register scratch) {
-+    unboxString(value, scratch);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractSymbol(const ValueOperand& value,
-+                                       Register scratch) {
-+    unboxSymbol(value, scratch);
-+    return scratch;
-+  }
-+  [[nodiscard]] Register extractBoolean(const ValueOperand& value,
-+                                        Register scratch) {
-+    unboxBoolean(value, scratch);
-+    return scratch;
-+  }
-+
-+  void testObjectSet(Condition cond, const ValueOperand& value, Register dest) {
-+    MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+    {
-+      UseScratchRegisterScope temps(*this);
-+      Register tag = temps.Acquire();
-+      splitTag(value, tag);
-+      uint32_t t = JSVAL_TAG_OBJECT;
-+      as_xoris(tag, tag, t >> 16);
-+      as_cmplwi(tag, t & 0xFFFF);
-+    }
-+    ma_cmp_set(dest, cond);
-+  }
-+  void testUndefinedSet(Condition cond, const ValueOperand& value,
-+                        Register dest) {
-+    MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+    {
-+      UseScratchRegisterScope temps(*this);
-+      Register tag = temps.Acquire();
-+      splitTag(value, tag);
-+      // Use xoris+cmplwi to compare without a second scratch.
-+      uint32_t t = JSVAL_TAG_UNDEFINED;
-+      as_xoris(tag, tag, t >> 16);
-+      as_cmplwi(tag, t & 0xFFFF);
-+    }
-+    ma_cmp_set(dest, cond);
-+  }
-+  void testNullSet(Condition cond, const ValueOperand& value, Register dest) {
-+    MOZ_ASSERT(cond == Equal || cond == NotEqual);
-+    {
-+      UseScratchRegisterScope temps(*this);
-+      Register tag = temps.Acquire();
-+      splitTag(value, tag);
-+      uint32_t t = JSVAL_TAG_NULL;
-+      as_xoris(tag, tag, t >> 16);
-+      as_cmplwi(tag, t & 0xFFFF);
-+    }
-+    ma_cmp_set(dest, cond);
-+  }
-+
-+  BufferOffset ret() {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    as_ld(scratch, StackPointer, 0);
-+    as_addi(StackPointer, StackPointer, 8);
-+    xs_mtlr(scratch);
-+    return as_blr();
-+  }
-+
-+  void j(Label* dest) { jump(dest); }
-+
-+  void getWasmAnyRefGCThingChunk(Register anyref, Register dest) {
-+    static_assert(js::gc::ChunkShift == 20);
-+    as_rldicr(dest, anyref, 0, 43);
-+  }
-+
-+  template <typename T>
-+  void loadUnboxedValue(const T& address, MIRType type, AnyRegister dest) {
-+    if (dest.isFloat()) {
-+      loadInt32OrDouble(address, dest.fpu());
-+    } else {
-+      unboxNonDouble(address, dest.gpr(), ValueTypeFromMIRType(type));
-+    }
-+  }
-+
-+  void loadInt32OrDouble(const Address& src, FloatRegister dest);
-+  void loadInt32OrDouble(const BaseIndex& addr, FloatRegister dest);
-+
-+  // ===============================================================
-+  // Store instructions
-+
-+  FaultingCodeOffset store8(Register src, const Address& address) {
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_stb(src, address.base, address.offset).getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_stbx(src, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset store8(Register src, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(address, scratch);
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_stb(src, scratch, address.offset).getOffset());
-+    }
-+    Register scratch2 = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch2);
-+    return FaultingCodeOffset(as_stbx(src, scratch, scratch2).getOffset());
-+  }
-+  void store8(Imm32 imm, const Address& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    store8(scratch, address);
-+  }
-+  void store8(Imm32 imm, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    store8(scratch, address);
-+  }
-+
-+  FaultingCodeOffset store16(Register src, const Address& address) {
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_sth(src, address.base, address.offset).getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_sthx(src, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset store16(Register src, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(address, scratch);
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_sth(src, scratch, address.offset).getOffset());
-+    }
-+    Register scratch2 = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch2);
-+    return FaultingCodeOffset(as_sthx(src, scratch, scratch2).getOffset());
-+  }
-+  void store16(Imm32 imm, const Address& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    store16(scratch, address);
-+  }
-+  void store16(Imm32 imm, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(imm, scratch);
-+    store16(scratch, address);
-+  }
-+  template <typename T>
-+  void store16Unaligned(Register src, const T& dest) {
-+    store16(src, dest);
-+  }
-+
-+  FaultingCodeOffset store32(Register src, const Address& address) {
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_stw(src, address.base, address.offset).getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_stwx(src, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset store32(Register src, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(address, scratch);
-+    if (is_intN(address.offset, 16)) {
-+      return FaultingCodeOffset(
-+          as_stw(src, scratch, address.offset).getOffset());
-+    }
-+    Register scratch2 = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch2);
-+    return FaultingCodeOffset(as_stwx(src, scratch, scratch2).getOffset());
-+  }
-+  void store32(Register src, AbsoluteAddress address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord((uintptr_t)address.addr), scratch);
-+    as_stw(src, scratch, 0);
-+  }
-+  void store32(Imm32 src, const Address& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(src, scratch);
-+    store32(scratch, address);
-+  }
-+  void store32(Imm32 src, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    move32(src, scratch);
-+    store32(scratch, address);
-+  }
-+  template <typename T>
-+  void store32Unaligned(Register src, const T& dest) {
-+    store32(src, dest);
-+  }
-+
-+  void store64(Imm64 imm, Address address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(imm.value), scratch);
-+    storePtr(scratch, address);
-+  }
-+  void store64(Imm64 imm, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(imm.value), scratch);
-+    storePtr(scratch, address);
-+  }
-+  FaultingCodeOffset store64(Register64 src, Address address) {
-+    return storePtr(src.reg, address);
-+  }
-+  FaultingCodeOffset store64(Register64 src, const BaseIndex& address) {
-+    return storePtr(src.reg, address);
-+  }
-+  template <typename T>
-+  void store64Unaligned(Register64 src, const T& dest) {
-+    store64(src, dest);
-+  }
-+
-+  template <typename T>
-+  void storePtr(ImmWord imm, T address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(imm, scratch);
-+    storePtr(scratch, address);
-+  }
-+  template <typename T>
-+  void storePtr(ImmPtr imm, T address) {
-+    storePtr(ImmWord(uintptr_t(imm.value)), address);
-+  }
-+  template <typename T>
-+  void storePtr(ImmGCPtr imm, T address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(imm, scratch);
-+    storePtr(scratch, address);
-+  }
-+  void storePtr(Register src, AbsoluteAddress dest) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord((uintptr_t)dest.addr), scratch);
-+    as_std(src, scratch, 0);
-+  }
-+  FaultingCodeOffset storePtr(Register src, const Address& address) {
-+    // as_std (DS-form) requires 4-byte aligned offset.
-+    if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
-+      return FaultingCodeOffset(
-+          as_std(src, address.base, address.offset).getOffset());
-+    }
-+    if (HasPOWER10() && is_intN((intptr_t)address.offset, 34)) {
-+      return FaultingCodeOffset(
-+          as_pstd(src, address.base, (int64_t)address.offset, /*R=*/false)
-+              .getOffset());
-+    }
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch);
-+    return FaultingCodeOffset(as_stdx(src, address.base, scratch).getOffset());
-+  }
-+  FaultingCodeOffset storePtr(Register src, const BaseIndex& address) {
-+    UseScratchRegisterScope temps(*this);
-+    Register scratch = temps.Acquire();
-+    computeScaledAddress(address, scratch);
-+    if (is_intN(address.offset, 16) && !(address.offset & 0x3)) {
-+      return FaultingCodeOffset(
-+          as_std(src, scratch, address.offset).getOffset());
-+    }
-+    Register scratch2 = temps.Acquire();
-+    movePtr(ImmWord(address.offset), scratch2);
-+    return FaultingCodeOffset(as_stdx(src, scratch, scratch2).getOffset());
-+  }
-+
-+  // ===============================================================
-+  // Misc
-+
-+  void handleFailureWithHandlerTail(Label* profilerExitTail, Label* bailoutTail,
-+                                    uint32_t* returnValueCheckOffset);
-+
-+  inline void incrementInt32Value(const Address& addr);
-+
-+  void zeroDouble(FloatRegister reg) { as_xxlxor(reg, reg, reg); }
-+
-+  void writeCodePointer(CodeLabel* label) {
-+    label->patchAt()->bind(currentOffset());
-+    label->setLinkMode(CodeLabel::RawPointer);
-+    m_buffer.ensureSpace(sizeof(void*));
-+    writeInst(-1);
-+    writeInst(-1);
-+  }
-+  void writeDataRelocation(const Value& val) {
-+    if (val.isGCThing()) {
-+      gc::Cell* cell = val.toGCThing();
-+      if (cell && gc::IsInsideNursery(cell)) {
-+        embedsNurseryPointers_ = true;
-+      }
-+      dataRelocations_.writeUnsigned(currentOffset());
-+    }
-+  }
-+  void writeDataRelocation(CodeOffset off, const Value& val) {
-+    if (val.isGCThing()) {
-+      gc::Cell* cell = val.toGCThing();
-+      if (cell && gc::IsInsideNursery(cell)) {
-+        embedsNurseryPointers_ = true;
-+      }
-+      dataRelocations_.writeUnsigned(off.offset());
-+    }
-+  }
-+
-+  CodeOffset toggledJump(Label* label) {
-+    CodeOffset ret(nextOffset().getOffset());
-+    jump(label);
-+    return ret;
-+  }
-+  CodeOffset toggledCall(JitCode* target, bool enabled);
-+  // 8 instructions for load64 + mtctr + bctrl = 10 instructions total.
-+  static size_t ToggledCallSize(uint8_t* code) { return 10 * sizeof(uint32_t); }
-+
-+  void checkStackAlignment() {}
-+
-+  static void calculateAlignedStackPointer(void** stackPointer) {
-+    *stackPointer = reinterpret_cast<void*>((uintptr_t(*stackPointer)) &
-+                                            ~(ABIStackAlignment - 1));
-+  }
-+
-+  void lea(Operand addr, Register dest) {
-+    // x86-ism; on PPC, compute effective address manually.
-+    MOZ_CRASH("PPC64: lea not supported; use computeEffectiveAddress");
-+  }
-+
-+  void abiret() { as_blr(); }
-+
-+  void profilerEnterFrame(Register framePtr, Register scratch);
-+  void profilerExitFrame();
-+
-+  void outOfLineWasmTruncateToInt32Check(
-+      FloatRegister input, Register output, MIRType fromType, TruncFlags flags,
-+      Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc);
-+  void outOfLineWasmTruncateToInt64Check(
-+      FloatRegister input, Register64 output, MIRType fromType,
-+      TruncFlags flags, Label* rejoin, const wasm::TrapSiteDesc& trapSiteDesc);
-+
-+  void wasmLoadImpl(const wasm::MemoryAccessDesc& access, Register memoryBase,
-+                    Register ptr, Register ptrScratch, AnyRegister output);
-+  void wasmStoreImpl(const wasm::MemoryAccessDesc& access, AnyRegister value,
-+                     Register memoryBase, Register ptr, Register ptrScratch);
-+  void wasmLoadI64Impl(const wasm::MemoryAccessDesc& access,
-+                       Register memoryBase, Register ptr, Register ptrScratch,
-+                       Register64 output);
-+  void wasmStoreI64Impl(const wasm::MemoryAccessDesc& access, Register64 value,
-+                        Register memoryBase, Register ptr, Register ptrScratch);
-+
-+  // Last-byte probing load to enforce wasm-spec atomicity for multi-byte
-+  // wasm accesses on POWER ISA. POWER permits unaligned page-spanning
-+  // accesses to commit one half before the other half takes a DSI; wasm
-+  // requires atomicity. Touching the last byte of the upcoming access
-+  // with a 1-byte lbzx triggers SIGSEGV (→ wasm trap via the signal
-+  // handler) before the actual access executes — POWER's precise-
-+  // interrupt model guarantees the subsequent access is never
-+  // architecturally executed if the probe faults.
-+  //
-+  // Wasm linear memory is one contiguous mapped region followed by an
-+  // mprotect'd guard, so last-byte-mapped ⇒ all-bytes-mapped, and a
-+  // single-byte probe is sufficient regardless of access size.
-+  //
-+  // No-op when HasPOWER9() (real POWER9/POWER10 silicon handles page-
-+  // spanning unaligned stores atomically at the µarch level), and when
-+  // access size is 1. Never called on the atomic path: atomic ops are
-+  // naturally aligned per wasm spec + ISA-enforced lwarx alignment, so
-+  // they cannot span pages; misaligned atomics take a precise SIGBUS
-+  // before any commit.
-+  //
-+  // 2 instructions when emitted (addi + lbzx).
-+  void wasmProbeLastByte(const wasm::MemoryAccessDesc& access,
-+                         Register memoryBase, Register ptr);
-+};
-+
-+typedef MacroAssemblerPPC64Compat MacroAssemblerSpecific;
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_MacroAssembler_ppc64_h */
-diff --git a/js/src/jit/ppc64/MoveEmitter-ppc64.cpp b/js/src/jit/ppc64/MoveEmitter-ppc64.cpp
-new file mode 100644
-index 000000000000..989d3f61f121
---- /dev/null
-+++ b/js/src/jit/ppc64/MoveEmitter-ppc64.cpp
-@@ -0,0 +1,357 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/MoveEmitter-ppc64.h"
-+
-+#include "jit/MacroAssembler-inl.h"
-+
-+using namespace js;
-+using namespace js::jit;
-+
-+void MoveEmitterPPC64::breakCycle(const MoveOperand& from,
-+                                  const MoveOperand& to, MoveOp::Type type,
-+                                  uint32_t slotId) {
-+  switch (type) {
-+    case MoveOp::FLOAT32:
-+      if (to.isMemory()) {
-+        ScratchFloat32Scope fpscratch32(masm);
-+        masm.loadFloat32(getAdjustedAddress(to), fpscratch32);
-+        masm.storeFloat32(fpscratch32, cycleSlot(slotId));
-+      } else {
-+        masm.storeFloat32(to.floatReg(), cycleSlot(slotId));
-+      }
-+      break;
-+    case MoveOp::DOUBLE:
-+      if (to.isMemory()) {
-+        ScratchDoubleScope fpscratch64(masm);
-+        masm.loadDouble(getAdjustedAddress(to), fpscratch64);
-+        masm.storeDouble(fpscratch64, cycleSlot(slotId));
-+      } else {
-+        masm.storeDouble(to.floatReg(), cycleSlot(slotId));
-+      }
-+      break;
-+    case MoveOp::INT32:
-+      if (to.isMemory()) {
-+        UseScratchRegisterScope temps(masm);
-+        Register scratch = temps.Acquire();
-+        masm.load32(getAdjustedAddress(to), scratch);
-+        masm.store32(scratch, cycleSlot(0));
-+      } else {
-+        masm.store32(to.reg(), cycleSlot(0));
-+      }
-+      break;
-+    case MoveOp::GENERAL:
-+      if (to.isMemory()) {
-+        UseScratchRegisterScope temps(masm);
-+        Register scratch = temps.Acquire();
-+        masm.loadPtr(getAdjustedAddress(to), scratch);
-+        masm.storePtr(scratch, cycleSlot(0));
-+      } else {
-+        masm.storePtr(to.reg(), cycleSlot(0));
-+      }
-+      break;
-+    case MoveOp::SIMD128:
-+      if (to.isMemory()) {
-+        ScratchSimd128Scope scratch(masm);
-+        masm.loadUnalignedSimd128(getAdjustedAddress(to), scratch);
-+        masm.storeUnalignedSimd128(scratch, cycleSlot(slotId));
-+      } else {
-+        masm.storeUnalignedSimd128(to.floatReg(), cycleSlot(slotId));
-+      }
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected move type");
-+  }
-+}
-+
-+void MoveEmitterPPC64::completeCycle(const MoveOperand& from,
-+                                     const MoveOperand& to, MoveOp::Type type,
-+                                     uint32_t slotId) {
-+  switch (type) {
-+    case MoveOp::FLOAT32:
-+      if (to.isMemory()) {
-+        ScratchFloat32Scope fpscratch32(masm);
-+        masm.loadFloat32(cycleSlot(slotId), fpscratch32);
-+        masm.storeFloat32(fpscratch32, getAdjustedAddress(to));
-+      } else {
-+        masm.loadFloat32(cycleSlot(slotId), to.floatReg());
-+      }
-+      break;
-+    case MoveOp::DOUBLE:
-+      if (to.isMemory()) {
-+        ScratchDoubleScope fpscratch64(masm);
-+        masm.loadDouble(cycleSlot(slotId), fpscratch64);
-+        masm.storeDouble(fpscratch64, getAdjustedAddress(to));
-+      } else {
-+        masm.loadDouble(cycleSlot(slotId), to.floatReg());
-+      }
-+      break;
-+    case MoveOp::INT32:
-+      MOZ_ASSERT(slotId == 0);
-+      if (to.isMemory()) {
-+        UseScratchRegisterScope temps(masm);
-+        Register scratch = temps.Acquire();
-+        masm.load32(cycleSlot(0), scratch);
-+        masm.store32(scratch, getAdjustedAddress(to));
-+      } else {
-+        masm.load32(cycleSlot(0), to.reg());
-+      }
-+      break;
-+    case MoveOp::GENERAL:
-+      MOZ_ASSERT(slotId == 0);
-+      if (to.isMemory()) {
-+        UseScratchRegisterScope temps(masm);
-+        Register scratch = temps.Acquire();
-+        masm.loadPtr(cycleSlot(0), scratch);
-+        masm.storePtr(scratch, getAdjustedAddress(to));
-+      } else {
-+        masm.loadPtr(cycleSlot(0), to.reg());
-+      }
-+      break;
-+    case MoveOp::SIMD128:
-+      if (to.isMemory()) {
-+        ScratchSimd128Scope scratch(masm);
-+        masm.loadUnalignedSimd128(cycleSlot(slotId), scratch);
-+        masm.storeUnalignedSimd128(scratch, getAdjustedAddress(to));
-+      } else {
-+        masm.loadUnalignedSimd128(cycleSlot(slotId), to.floatReg());
-+      }
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected move type");
-+  }
-+}
-+
-+void MoveEmitterPPC64::emit(const MoveResolver& moves) {
-+  if (moves.numCycles()) {
-+    // SpillSlotSize must be wide enough for the widest cycled value
-+    // (SIMD128 = 16 bytes). The stride below assumes the same. See
-+    // Architecture-ppc64.h for the rationale.
-+    static_assert(SpillSlotSize == 16);
-+    masm.reserveStack(moves.numCycles() * SpillSlotSize);
-+    pushedAtCycle_ = masm.framePushed();
-+  }
-+
-+  for (size_t i = 0; i < moves.numMoves(); i++) {
-+    emit(moves.getMove(i));
-+  }
-+}
-+
-+Address MoveEmitterPPC64::cycleSlot(uint32_t slot, uint32_t subslot) const {
-+  int32_t offset = masm.framePushed() - pushedAtCycle_;
-+  // Stride must match the per-cycle reservation in emit(); using a
-+  // narrower stride causes adjacent SIMD128 slots to overlap.
-+  return Address(StackPointer, offset + slot * SpillSlotSize + subslot);
-+}
-+
-+int32_t MoveEmitterPPC64::getAdjustedOffset(const MoveOperand& operand) {
-+  MOZ_ASSERT(operand.isMemoryOrEffectiveAddress());
-+  if (operand.base() != StackPointer) {
-+    return operand.disp();
-+  }
-+
-+  return operand.disp() + masm.framePushed() - pushedAtStart_;
-+}
-+
-+Address MoveEmitterPPC64::getAdjustedAddress(const MoveOperand& operand) {
-+  return Address(operand.base(), getAdjustedOffset(operand));
-+}
-+
-+void MoveEmitterPPC64::emitMove(const MoveOperand& from,
-+                                const MoveOperand& to) {
-+  if (from.isGeneralReg()) {
-+    if (to.isGeneralReg()) {
-+      masm.movePtr(from.reg(), to.reg());
-+    } else if (to.isMemory()) {
-+      masm.storePtr(from.reg(), getAdjustedAddress(to));
-+    } else {
-+      MOZ_CRASH("Invalid emitMove arguments.");
-+    }
-+  } else if (from.isMemory()) {
-+    if (to.isGeneralReg()) {
-+      masm.loadPtr(getAdjustedAddress(from), to.reg());
-+    } else if (to.isMemory()) {
-+      UseScratchRegisterScope temps(masm);
-+      Register scratch = temps.Acquire();
-+      masm.loadPtr(getAdjustedAddress(from), scratch);
-+      masm.storePtr(scratch, getAdjustedAddress(to));
-+    } else {
-+      MOZ_CRASH("Invalid emitMove arguments.");
-+    }
-+  } else if (from.isEffectiveAddress()) {
-+    if (to.isGeneralReg()) {
-+      masm.computeEffectiveAddress(getAdjustedAddress(from), to.reg());
-+    } else if (to.isMemory()) {
-+      UseScratchRegisterScope temps(masm);
-+      Register scratch = temps.Acquire();
-+      masm.computeEffectiveAddress(getAdjustedAddress(from), scratch);
-+      masm.storePtr(scratch, getAdjustedAddress(to));
-+    } else {
-+      MOZ_CRASH("Invalid emitMove arguments.");
-+    }
-+  } else {
-+    MOZ_CRASH("Invalid emitMove arguments.");
-+  }
-+}
-+
-+void MoveEmitterPPC64::emitInt32Move(const MoveOperand& from,
-+                                     const MoveOperand& to) {
-+  if (from.isGeneralReg()) {
-+    if (to.isGeneralReg()) {
-+      masm.move32(from.reg(), to.reg());
-+    } else if (to.isMemory()) {
-+      masm.store32(from.reg(), getAdjustedAddress(to));
-+    } else {
-+      MOZ_CRASH("Invalid emitInt32Move arguments.");
-+    }
-+  } else if (from.isMemory()) {
-+    if (to.isGeneralReg()) {
-+      masm.load32(getAdjustedAddress(from), to.reg());
-+    } else if (to.isMemory()) {
-+      UseScratchRegisterScope temps(masm);
-+      Register scratch = temps.Acquire();
-+      masm.load32(getAdjustedAddress(from), scratch);
-+      masm.store32(scratch, getAdjustedAddress(to));
-+    } else {
-+      MOZ_CRASH("Invalid emitInt32Move arguments.");
-+    }
-+  } else if (from.isEffectiveAddress()) {
-+    if (to.isGeneralReg()) {
-+      masm.computeEffectiveAddress(getAdjustedAddress(from), to.reg());
-+    } else if (to.isMemory()) {
-+      UseScratchRegisterScope temps(masm);
-+      Register scratch = temps.Acquire();
-+      masm.computeEffectiveAddress(getAdjustedAddress(from), scratch);
-+      masm.store32(scratch, getAdjustedAddress(to));
-+    } else {
-+      MOZ_CRASH("Invalid emitInt32Move arguments.");
-+    }
-+  } else {
-+    MOZ_CRASH("Invalid emitInt32Move arguments.");
-+  }
-+}
-+
-+void MoveEmitterPPC64::emitFloat32Move(const MoveOperand& from,
-+                                       const MoveOperand& to) {
-+  if (from.isFloatReg()) {
-+    if (to.isFloatReg()) {
-+      masm.moveFloat32(from.floatReg(), to.floatReg());
-+    } else {
-+      MOZ_ASSERT(to.isMemory());
-+      masm.storeFloat32(from.floatReg(), getAdjustedAddress(to));
-+    }
-+  } else if (to.isFloatReg()) {
-+    MOZ_ASSERT(from.isMemory());
-+    masm.loadFloat32(getAdjustedAddress(from), to.floatReg());
-+  } else {
-+    MOZ_ASSERT(from.isMemory());
-+    MOZ_ASSERT(to.isMemory());
-+    ScratchFloat32Scope fpscratch32(masm);
-+    masm.loadFloat32(getAdjustedAddress(from), fpscratch32);
-+    masm.storeFloat32(fpscratch32, getAdjustedAddress(to));
-+  }
-+}
-+
-+void MoveEmitterPPC64::emitDoubleMove(const MoveOperand& from,
-+                                      const MoveOperand& to) {
-+  if (from.isFloatReg()) {
-+    if (to.isFloatReg()) {
-+      masm.moveDouble(from.floatReg(), to.floatReg());
-+    } else if (to.isGeneralReg()) {
-+      // FPR -> GPR: use mfvsrd directly.
-+      masm.as_mfvsrd(to.reg(), from.floatReg());
-+    } else {
-+      MOZ_ASSERT(to.isMemory());
-+      masm.storeDouble(from.floatReg(), getAdjustedAddress(to));
-+    }
-+  } else if (to.isFloatReg()) {
-+    if (from.isMemory()) {
-+      masm.loadDouble(getAdjustedAddress(from), to.floatReg());
-+    } else {
-+      // GPR -> FPR: use mtvsrd directly.
-+      masm.as_mtvsrd(to.floatReg(), from.reg());
-+    }
-+  } else {
-+    MOZ_ASSERT(from.isMemory());
-+    MOZ_ASSERT(to.isMemory());
-+    ScratchDoubleScope fpscratch64(masm);
-+    masm.loadDouble(getAdjustedAddress(from), fpscratch64);
-+    masm.storeDouble(fpscratch64, getAdjustedAddress(to));
-+  }
-+}
-+
-+void MoveEmitterPPC64::emitSimd128Move(const MoveOperand& from,
-+                                       const MoveOperand& to) {
-+  if (from.isFloatReg()) {
-+    if (to.isFloatReg()) {
-+      masm.moveSimd128(from.floatReg(), to.floatReg());
-+    } else {
-+      MOZ_ASSERT(to.isMemory());
-+      masm.storeUnalignedSimd128(from.floatReg(), getAdjustedAddress(to));
-+    }
-+  } else if (to.isFloatReg()) {
-+    MOZ_ASSERT(from.isMemory());
-+    masm.loadUnalignedSimd128(getAdjustedAddress(from), to.floatReg());
-+  } else {
-+    MOZ_ASSERT(from.isMemory());
-+    MOZ_ASSERT(to.isMemory());
-+    ScratchSimd128Scope scratch(masm);
-+    masm.loadUnalignedSimd128(getAdjustedAddress(from), scratch);
-+    masm.storeUnalignedSimd128(scratch, getAdjustedAddress(to));
-+  }
-+}
-+
-+void MoveEmitterPPC64::emit(const MoveOp& move) {
-+  const MoveOperand& from = move.from();
-+  const MoveOperand& to = move.to();
-+
-+  if (move.isCycleEnd() && move.isCycleBegin()) {
-+    breakCycle(from, to, move.endCycleType(), move.cycleBeginSlot());
-+    completeCycle(from, to, move.type(), move.cycleEndSlot());
-+    return;
-+  }
-+
-+  if (move.isCycleEnd()) {
-+    MOZ_ASSERT(inCycle_);
-+    completeCycle(from, to, move.type(), move.cycleEndSlot());
-+    MOZ_ASSERT(inCycle_ > 0);
-+    inCycle_--;
-+    return;
-+  }
-+
-+  if (move.isCycleBegin()) {
-+    breakCycle(from, to, move.endCycleType(), move.cycleBeginSlot());
-+    inCycle_++;
-+  }
-+
-+  switch (move.type()) {
-+    case MoveOp::FLOAT32:
-+      emitFloat32Move(from, to);
-+      break;
-+    case MoveOp::DOUBLE:
-+      emitDoubleMove(from, to);
-+      break;
-+    case MoveOp::SIMD128:
-+      emitSimd128Move(from, to);
-+      break;
-+    case MoveOp::INT32:
-+      emitInt32Move(from, to);
-+      break;
-+    case MoveOp::GENERAL:
-+      emitMove(from, to);
-+      break;
-+    default:
-+      MOZ_CRASH("Unexpected move type");
-+  }
-+}
-+
-+void MoveEmitterPPC64::assertDone() { MOZ_ASSERT(inCycle_ == 0); }
-+
-+void MoveEmitterPPC64::finish() {
-+  assertDone();
-+
-+  masm.freeStack(masm.framePushed() - pushedAtStart_);
-+}
-diff --git a/js/src/jit/ppc64/MoveEmitter-ppc64.h b/js/src/jit/ppc64/MoveEmitter-ppc64.h
-new file mode 100644
-index 000000000000..a9faa34de6bb
---- /dev/null
-+++ b/js/src/jit/ppc64/MoveEmitter-ppc64.h
-@@ -0,0 +1,64 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_MoveEmitter_ppc64_h
-+#define jit_ppc64_MoveEmitter_ppc64_h
-+
-+#include "jit/MacroAssembler.h"
-+#include "jit/MoveResolver.h"
-+
-+namespace js {
-+namespace jit {
-+
-+class MoveEmitterPPC64 {
-+  void emitDoubleMove(const MoveOperand& from, const MoveOperand& to);
-+  void emitSimd128Move(const MoveOperand& from, const MoveOperand& to);
-+  void breakCycle(const MoveOperand& from, const MoveOperand& to,
-+                  MoveOp::Type type, uint32_t slot);
-+  void completeCycle(const MoveOperand& from, const MoveOperand& to,
-+                     MoveOp::Type type, uint32_t slot);
-+
-+ protected:
-+  uint32_t inCycle_;
-+  MacroAssembler& masm;
-+
-+  uint32_t pushedAtStart_;
-+
-+  int32_t pushedAtCycle_;
-+
-+  void assertDone();
-+  Address cycleSlot(uint32_t slot, uint32_t subslot = 0) const;
-+  int32_t getAdjustedOffset(const MoveOperand& operand);
-+  Address getAdjustedAddress(const MoveOperand& operand);
-+
-+  void emitMove(const MoveOperand& from, const MoveOperand& to);
-+  void emitInt32Move(const MoveOperand& from, const MoveOperand& to);
-+  void emitFloat32Move(const MoveOperand& from, const MoveOperand& to);
-+  void emit(const MoveOp& move);
-+
-+ public:
-+  explicit MoveEmitterPPC64(MacroAssembler& masm)
-+      : inCycle_(0),
-+        masm(masm),
-+        pushedAtStart_(masm.framePushed()),
-+        pushedAtCycle_(-1) {}
-+
-+  ~MoveEmitterPPC64() { assertDone(); }
-+
-+  void emit(const MoveResolver& moves);
-+  void finish();
-+  // setScratchRegister is part of the cross-arch MoveEmitter interface
-+  // but we never spill, so there's no scratch to set. No-op kept for
-+  // shared-code compatibility.
-+  void setScratchRegister(Register reg) {}
-+};
-+
-+typedef MoveEmitterPPC64 MoveEmitter;
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_MoveEmitter_ppc64_h */
-diff --git a/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h b/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
-new file mode 100644
-index 000000000000..aa874dfd6732
---- /dev/null
-+++ b/js/src/jit/ppc64/SharedICHelpers-ppc64-inl.h
-@@ -0,0 +1,83 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_SharedICHelpers_ppc64_inl_h
-+#define jit_ppc64_SharedICHelpers_ppc64_inl_h
-+
-+#include "jit/BaselineFrame.h"
-+#include "jit/SharedICHelpers.h"
-+
-+#include "jit/MacroAssembler-inl.h"
-+
-+namespace js {
-+namespace jit {
-+
-+inline void EmitBaselineTailCallVM(TrampolinePtr target, MacroAssembler& masm,
-+                                   uint32_t argSize) {
-+#ifdef DEBUG
-+  Register scratch = R2.scratchReg();
-+
-+  // Compute frame size.
-+  masm.movePtr(FramePointer, scratch);
-+  masm.subPtr(StackPointer, scratch);
-+
-+  // Store frame size without VMFunction arguments for debug assertions.
-+  masm.subPtr(Imm32(argSize), scratch);
-+  Address frameSizeAddr(FramePointer,
-+                        BaselineFrame::reverseOffsetOfDebugFrameSize());
-+  masm.store32(scratch, frameSizeAddr);
-+  masm.addPtr(Imm32(argSize), scratch);
-+#endif
-+
-+  // Push frame descriptor and perform the tail call.
-+  masm.push(FrameDescriptor(FrameType::BaselineJS));
-+
-+  // The return address is in LR (set by the original bl/bctrl call).
-+  // The VMWrapper code will push it via pushReturnAddress().
-+
-+  masm.jump(target);
-+}
-+
-+inline void EmitBaselineCallVM(TrampolinePtr target, MacroAssembler& masm) {
-+  masm.push(FrameDescriptor(FrameType::BaselineStub));
-+  masm.call(target);
-+}
-+
-+inline void EmitBaselineEnterStubFrame(MacroAssembler& masm, Register scratch) {
-+  MOZ_ASSERT(scratch != ICTailCallReg);
-+
-+#ifdef DEBUG
-+  // Compute frame size.
-+  masm.movePtr(FramePointer, scratch);
-+  masm.subPtr(StackPointer, scratch);
-+
-+  Address frameSizeAddr(FramePointer,
-+                        BaselineFrame::reverseOffsetOfDebugFrameSize());
-+  masm.store32(scratch, frameSizeAddr);
-+#endif
-+
-+  // Note: when making changes here, don't forget to update
-+  // BaselineStubFrame if needed.
-+
-+  // Push frame descriptor and return address.
-+  // LR holds the return address; read it into ICTailCallReg to push.
-+  masm.Push(FrameDescriptor(FrameType::BaselineJS));
-+  masm.xs_mflr(ICTailCallReg);
-+  masm.Push(ICTailCallReg);
-+
-+  // Save old frame pointer, stack pointer and stub reg.
-+  masm.Push(FramePointer);
-+  masm.movePtr(StackPointer, FramePointer);
-+  masm.Push(ICStubReg);
-+
-+  // Stack should remain aligned.
-+  masm.assertStackAlignment(sizeof(Value), 0);
-+}
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_SharedICHelpers_ppc64_inl_h */
-diff --git a/js/src/jit/ppc64/SharedICHelpers-ppc64.h b/js/src/jit/ppc64/SharedICHelpers-ppc64.h
-new file mode 100644
-index 000000000000..31ba830d2609
---- /dev/null
-+++ b/js/src/jit/ppc64/SharedICHelpers-ppc64.h
-@@ -0,0 +1,97 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_SharedICHelpers_ppc64_h
-+#define jit_ppc64_SharedICHelpers_ppc64_h
-+
-+#include "jit/BaselineIC.h"
-+#include "jit/JitFrames.h"
-+#include "jit/MacroAssembler.h"
-+#include "jit/SharedICRegisters.h"
-+
-+namespace js {
-+namespace jit {
-+
-+// Distance from sp to the top Value inside an IC stub (no return address on
-+// the stack on PPC64).
-+static const size_t ICStackValueOffset = 0;
-+
-+struct BaselineStubFrame {
-+  uintptr_t savedFrame;
-+  uintptr_t savedStub;
-+  uintptr_t returnAddress;
-+  uintptr_t descriptor;
-+};
-+
-+inline void EmitRestoreTailCallReg(MacroAssembler& masm) {
-+  // On PPC64, LR always holds the return address after a bl/bctrl call.
-+  // No-op: LR is the hardware link register, not a GPR on the stack.
-+}
-+
-+inline void EmitRepushTailCallReg(MacroAssembler& masm) {
-+  // No-op: LR already holds the return address.
-+}
-+
-+// Emits an indirect call to the stub code of the IC stub held in ICStubReg.
-+// |callOffset| receives the assembler offset immediately after the emitted
-+// call sequence.
-+inline void EmitCallIC(MacroAssembler& masm, CodeOffset* callOffset) {
-+  // The stub pointer must already be in ICStubReg.
-+  // Load stubcode pointer from the ICStub.
-+  // R2 won't be active when we call ICs, so we can use it as scratch.
-+  masm.loadPtr(Address(ICStubReg, ICStub::offsetOfStubCode()), R2.scratchReg());
-+
-+  // Call the stubcode. On PPC64 call(Register) emits mtctr + bctrl,
-+  // which sets LR to the address after bctrl.
-+  masm.call(R2.scratchReg());
-+  // Record the offset after the call has been emitted.
-+  *callOffset = CodeOffset(masm.currentOffset());
-+}
-+
-+// Emits the return from an IC stub: a single branch-to-link-register.
-+inline void EmitReturnFromIC(MacroAssembler& masm) {
-+  // Return via hardware LR (set by the original bl/bctrl call).
-+  masm.as_blr();
-+}
-+
-+// Emits the teardown of a baseline stub frame: reloads ICStubReg from the
-+// frame, resets the stack pointer to the frame pointer, then pops the saved
-+// frame pointer, the return address (moved back into LR) and the frame
-+// descriptor, in that order.
-+inline void EmitBaselineLeaveStubFrame(MacroAssembler& masm) {
-+  // Reload the stub pointer from its slot relative to the frame pointer
-+  // before the frame is unwound.
-+  masm.loadPtr(
-+      Address(FramePointer, BaselineStubFrameLayout::ICStubOffsetFromFP),
-+      ICStubReg);
-+
-+  // Drop everything below the frame pointer, then restore the caller's
-+  // frame pointer.
-+  masm.movePtr(FramePointer, StackPointer);
-+  masm.Pop(FramePointer);
-+
-+  // Load the return address and restore it to LR.
-+  masm.Pop(ICTailCallReg);
-+  masm.xs_mtlr(ICTailCallReg);
-+
-+  // Discard the frame descriptor.
-+  {
-+    // The scratch register is only needed as a pop target; its value is
-+    // never read.
-+    UseScratchRegisterScope temps(masm);
-+    Register scratch = temps.Acquire();
-+    masm.Pop(scratch);
-+  }
-+}
-+
-+// Emits a guarded pre-barrier call for |addr| of MIR type |type|, wrapped in
-+// a save/restore of LR through r0 and the stack.
-+template <typename AddrType>
-+inline void EmitPreBarrier(MacroAssembler& masm, const AddrType& addr,
-+                           MIRType type) {
-+  // On PPC64, LR is clobbered by guardedCallPreBarrier. Save it first.
-+  masm.xs_mflr(r0);
-+  masm.push(r0);
-+  masm.guardedCallPreBarrier(addr, type);
-+  // Restore LR from the value saved above.
-+  masm.pop(r0);
-+  masm.xs_mtlr(r0);
-+}
-+
-+// Emits the guard-failure path of an IC stub: advances ICStubReg to the next
-+// stub in the chain and jumps to that stub's code.
-+inline void EmitStubGuardFailure(MacroAssembler& masm) {
-+  // Load next stub into ICStubReg.
-+  masm.loadPtr(Address(ICStubReg, ICCacheIRStub::offsetOfNext()), ICStubReg);
-+
-+  // Return address is in LR. Jump to the next stubcode.
-+  masm.jump(Address(ICStubReg, ICStub::offsetOfStubCode()));
-+}
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_SharedICHelpers_ppc64_h */
-diff --git a/js/src/jit/ppc64/SharedICRegisters-ppc64.h b/js/src/jit/ppc64/SharedICRegisters-ppc64.h
-new file mode 100644
-index 000000000000..ddf67342f855
---- /dev/null
-+++ b/js/src/jit/ppc64/SharedICRegisters-ppc64.h
-@@ -0,0 +1,46 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_SharedICRegisters_ppc64_h
-+#define jit_ppc64_SharedICRegisters_ppc64_h
-+
-+#include "jit/ppc64/Assembler-ppc64.h"
-+#include "jit/Registers.h"
-+#include "jit/RegisterSets.h"
-+
-+namespace js {
-+namespace jit {
-+
-+// ValueOperands R0, R1, and R2.
-+// R0 == JSReturnReg, and R2 uses registers not preserved across calls. R1 value
-+// should be preserved across calls.
-+static constexpr ValueOperand R0(r5);
-+static constexpr ValueOperand R1(r15);
-+static constexpr ValueOperand R2(r4);
-+
-+// ICTailCallReg and ICStubReg.
-+// On PPC64, LR is not a GPR, so ICTailCallReg must be a normal GPR.
-+// PPC64 ELFv2 has no volatile non-arg GPRs (r3-r10 are all arg regs), so
-+// using an arg register risks clobbering by ABI calls with enough arguments.
-+// We use callee-saved registers instead, matching MIPS64/RISC-V strategy.
-+// These are excluded from BaselineICAvailableGeneralRegs.
-+static constexpr Register ICTailCallReg = r27;
-+static constexpr Register ICStubReg = r26;
-+
-+// FloatReg0 must be equal to ReturnFloatReg.
-+static constexpr FloatRegister FloatReg0 = {FloatRegisters::f1,
-+                                            FloatRegisters::Double};
-+static constexpr FloatRegister FloatReg1 = {FloatRegisters::f2,
-+                                            FloatRegisters::Double};
-+static constexpr FloatRegister FloatReg2 = {FloatRegisters::f3,
-+                                            FloatRegisters::Double};
-+static constexpr FloatRegister FloatReg3 = {FloatRegisters::f4,
-+                                            FloatRegisters::Double};
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* jit_ppc64_SharedICRegisters_ppc64_h */
-diff --git a/js/src/jit/ppc64/Simulator-ppc64.cpp b/js/src/jit/ppc64/Simulator-ppc64.cpp
-new file mode 100644
-index 000000000000..8b29eb3add04
---- /dev/null
-+++ b/js/src/jit/ppc64/Simulator-ppc64.cpp
-@@ -0,0 +1,7296 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/ppc64/Simulator-ppc64.h"
-+
-+#include <cinttypes>
-+#include <cmath>
-+#include <cstring>
-+#include <float.h>
-+#include <limits>
-+
-+#include "jit/AtomicOperations.h"
-+#include "jit/ppc64/Assembler-ppc64.h"
-+#include "js/Conversions.h"
-+#include "threading/LockGuard.h"
-+#include "vm/Float16.h"
-+#include "vm/JSContext.h"
-+#include "vm/Runtime.h"
-+#include "wasm/WasmInstance.h"
-+#include "wasm/WasmSignalHandlers.h"
-+
-+#define I8(v) static_cast<int8_t>(v)
-+#define I16(v) static_cast<int16_t>(v)
-+#define U16(v) static_cast<uint16_t>(v)
-+#define I32(v) static_cast<int32_t>(v)
-+#define U32(v) static_cast<uint32_t>(v)
-+#define I64(v) static_cast<int64_t>(v)
-+#define U64(v) static_cast<uint64_t>(v)
-+#define I128(v) static_cast<__int128_t>(v)
-+#define U128(v) static_cast<__uint128_t>(v)
-+
-+namespace js {
-+namespace jit {
-+
-+// Returns the upper 64 bits of the full 128-bit signed product u * v.
-+static int64_t MultiplyHighSigned(int64_t u, int64_t v) {
-+  // Widen both operands so the product cannot overflow, then keep the top
-+  // half; the arithmetic right shift carries the sign of the product.
-+  const __int128_t product =
-+      static_cast<__int128_t>(u) * static_cast<__int128_t>(v);
-+  return static_cast<int64_t>(product >> 64);
-+}
-+
-+// Returns the upper 64 bits of the full 128-bit unsigned product u * v.
-+static uint64_t MultiplyHighUnsigned(uint64_t u, uint64_t v) {
-+  // Widen both operands so the product cannot overflow, then keep the top
-+  // half.
-+  const __uint128_t product =
-+      static_cast<__uint128_t>(u) * static_cast<__uint128_t>(v);
-+  return static_cast<uint64_t>(product >> 64);
-+}
-+
-+// Rotates a 32-bit value left by |shift| bit positions.
-+// The count is reduced modulo 32 first: shifting a 32-bit value by 32 or
-+// more is undefined behaviour in C++, so an unmasked count would make
-+// out-of-range callers unpredictable. Counts in 0..31 behave as before.
-+inline constexpr uint32_t RotateLeft32(uint32_t value, uint32_t shift) {
-+  const uint32_t amount = shift & 31;
-+  // The second mask keeps the right shift at 0 (not 32) when amount is 0.
-+  return (value << amount) | (value >> ((32 - amount) & 31));
-+}
-+
-+// Rotates a 64-bit value left by |shift| bit positions.
-+// The count is reduced modulo 64 first: shifting a 64-bit value by 64 or
-+// more is undefined behaviour in C++, so an unmasked count would make
-+// out-of-range callers unpredictable. Counts in 0..63 behave as before.
-+inline constexpr uint64_t RotateLeft64(uint64_t value, uint64_t shift) {
-+  const uint64_t amount = shift & 63;
-+  // The second mask keeps the right shift at 0 (not 64) when amount is 0.
-+  return (value << amount) | (value >> ((64 - amount) & 63));
-+}
-+
-+// Builds a 64-bit mask covering PPC bit positions mb through me, where PPC
-+// bit 0 is the most significant bit (C bit 63). With mb <= me the result is
-+// one contiguous run; with mb > me the run wraps, so PPC bits 0..me and
-+// mb..63 are set.
-+static inline uint64_t MASK64(unsigned mb, unsigned me) {
-+  MOZ_ASSERT(mb < 64 && me < 64);
-+  const uint64_t fromBegin = ~0ULL >> mb;       // PPC bits mb..63
-+  const uint64_t uptoEnd = ~0ULL << (63 - me);  // PPC bits 0..me
-+  return (mb <= me) ? (fromBegin & uptoEnd) : (fromBegin | uptoEnd);
-+}
-+
-+// 32-bit mask covering PPC bit positions mb through me (PPC bit 0 is the
-+// most significant bit). With mb > me the set range wraps around.
-+static inline uint32_t MASK32(unsigned mb, unsigned me) {
-+  MOZ_ASSERT(mb < 32 && me < 32);
-+  const uint32_t fromBegin = ~0U >> mb;       // PPC bits mb..31
-+  const uint32_t uptoEnd = ~0U << (31 - me);  // PPC bits 0..me
-+  return (mb <= me) ? (fromBegin & uptoEnd) : (fromBegin | uptoEnd);
-+}
-+
-+// Number of leading zero bits in a 64-bit value; 64 for a zero input, which
-+// the compiler builtin does not define.
-+static inline int CountLeadingZeros64(uint64_t value) {
-+  return value != 0 ? __builtin_clzll(value) : 64;
-+}
-+
-+// Number of leading zero bits in a 32-bit value; 32 for a zero input, which
-+// the compiler builtin does not define.
-+static inline int CountLeadingZeros32(uint32_t value) {
-+  return value != 0 ? __builtin_clz(value) : 32;
-+}
-+
-+// Number of trailing zero bits in a 64-bit value; 64 for a zero input, which
-+// the compiler builtin does not define.
-+static inline int CountTrailingZeros64(uint64_t value) {
-+  return value != 0 ? __builtin_ctzll(value) : 64;
-+}
-+
-+// Number of trailing zero bits in a 32-bit value; 32 for a zero input, which
-+// the compiler builtin does not define.
-+static inline int CountTrailingZeros32(uint32_t value) {
-+  return value != 0 ? __builtin_ctz(value) : 32;
-+}
-+
-+// Number of set bits in a 64-bit value.
-+static inline int PopCount64(uint64_t value) {
-+  const int setBits = __builtin_popcountll(value);
-+  return setBits;
-+}
-+
-+// Number of set bits in a 32-bit value.
-+static inline int PopCount32(uint32_t value) {
-+  const int setBits = __builtin_popcount(value);
-+  return setBits;
-+}
-+
-+// Counts the set bits of each of the eight bytes of |value| independently
-+// and returns the eight counts packed into the corresponding byte lanes.
-+static inline uint64_t PopCountPerByte(uint64_t value) {
-+  uint64_t packed = 0;
-+  for (int shift = 0; shift < 64; shift += 8) {
-+    const uint8_t lane = static_cast<uint8_t>((value >> shift) & 0xFF);
-+    const uint64_t ones = static_cast<uint64_t>(__builtin_popcount(lane));
-+    packed |= ones << shift;
-+  }
-+  return packed;
-+}
-+
-+// PPC64 C argument slots: PPC64 ELFv2 ABI does not require C argument
-+// slots on the stack for register-passed arguments, but we reserve the
-+// link area (32 bytes).
-+const int kCArgSlotCount = 0;
-+const int kCArgsSlotsSize = kCArgSlotCount * sizeof(uintptr_t);
-+
-+// -----------------------------------------------------------------------------
-+// PPC64 SimInstruction.
-+
-+// Read/write view over one 32-bit instruction word in memory. The class is
-+// never constructed (its constructors are deleted); a pointer to instruction
-+// memory is reinterpreted as a SimInstruction* and the accessors below
-+// extract encoded fields. Bit indices passed to bit()/bits() are C-style:
-+// bit 0 is the least significant bit of the word.
-+class SimInstruction {
-+ public:
-+  enum {
-+    kInstrSize = 4,
-+    kPCReadOffset = 0
-+  };
-+
-+  // Raw 32-bit encoding stored at this address.
-+  inline Instr instructionBits() const {
-+    return *reinterpret_cast<const Instr*>(this);
-+  }
-+
-+  // Overwrites the instruction word stored at this address.
-+  inline void setInstructionBits(Instr value) {
-+    *reinterpret_cast<Instr*>(this) = value;
-+  }
-+
-+  // Single bit |nr| of the encoding (0 = least significant).
-+  inline int bit(int nr) const { return (instructionBits() >> nr) & 1; }
-+
-+  // Inclusive bit range [lo, hi] of the encoding, right-justified.
-+  // The mask is built as (2 << (hi - lo)) - 1 so that a full 32-bit range
-+  // wraps to all ones instead of requiring a shift by 32.
-+  inline uint32_t bits(int hi, int lo) const {
-+    return (instructionBits() >> lo) & ((2U << (hi - lo)) - 1);
-+  }
-+
-+  // Primary opcode (top six bits).
-+  inline uint32_t opcode() const { return bits(31, 26); }
-+
-+  // Register fields. rt and rs share the same encoding bits.
-+  inline uint32_t rtValue() const { return bits(25, 21); }
-+  inline uint32_t rsValue() const { return bits(25, 21); }
-+  inline uint32_t raValue() const { return bits(20, 16); }
-+  inline uint32_t rbValue() const { return bits(15, 11); }
-+  inline uint32_t rcValue() const { return bits(10, 6); }
-+
-+  // Branch BO/BI fields; they occupy the same bits as rt/ra.
-+  inline uint32_t boValue() const { return bits(25, 21); }
-+  inline uint32_t biValue() const { return bits(20, 16); }
-+
-+  // D-form 16-bit immediate (sign-extend to get signed value).
-+  inline int16_t imm16Value() const { return I16(bits(15, 0)); }
-+  inline uint16_t uimm16Value() const { return U16(bits(15, 0)); }
-+
-+  // DS-form 14-bit displacement (bits 2..15, 4-byte aligned).
-+  // The field is shifted back into place and narrowed to int16_t, which
-+  // yields the signed byte displacement.
-+  inline int16_t ds14Value() const {
-+    return I16(bits(15, 2) << 2);
-+  }
-+
-+  // B-form 14-bit branch displacement (bits 2..15, 4-byte aligned).
-+  // Returned as a sign-extended byte offset.
-+  inline int32_t bd16Value() const {
-+    int16_t raw = I16(bits(15, 2) << 2);
-+    return (int32_t)raw;
-+  }
-+
-+  // I-form 24-bit branch offset (bits 2..25, sign-extended, 4-byte aligned).
-+  inline int32_t li26Value() const {
-+    int32_t raw = I32(bits(25, 2) << 2);
-+    // Sign-extend from 26 bits.
-+    return (raw << 6) >> 6;
-+  }
-+
-+  // Extended opcode for X-form / XO-form (bits 1..10).
-+  inline uint32_t xoValue() const { return bits(10, 1); }
-+
-+  // Extended opcode for XL-form (bits 1..10).
-+  inline uint32_t xlValue() const { return bits(10, 1); }
-+
-+  // MD-form SH field: sh[0:4] in instruction bits 15:11, sh[5] in bit 1.
-+  // Assembler encodes: ((sh & 0x1f) << 11) | ((sh & 0x20) >> 4).
-+  inline uint32_t mdSHValue() const {
-+    return bits(15, 11) | (bit(1) << 5);
-+  }
-+  // mb/me for MD-form (rldicl/rldicr/rldic/rldimi): 6-bit field split as
-+  // mb[0:4] in instruction bits 10:6 and mb[5] in bit 5.
-+  inline uint32_t mdMBValue() const {
-+    return bits(10, 6) | (bit(5) << 5);
-+  }
-+  // The me field uses the same encoding bits as mb.
-+  inline uint32_t mdMEValue() const { return mdMBValue(); }
-+
-+  // MD-form XO (bits 2..4).
-+  inline uint32_t mdXOValue() const { return bits(4, 2); }
-+
-+  // MDS-form (rldcl, rldcr): mb[0:4] in bits 10:6, mb[5] in bit 5.
-+  inline uint32_t mdsMBValue() const {
-+    return bits(10, 6) | (bit(5) << 5);
-+  }
-+
-+  // M-form fields (32-bit rotate/mask).
-+  inline uint32_t mSHValue() const { return bits(15, 11); }
-+  inline uint32_t mMBValue() const { return bits(10, 6); }
-+  inline uint32_t mMEValue() const { return bits(5, 1); }
-+
-+  // Rc bit.
-+  inline bool rcBit() const { return bit(0); }
-+
-+  // AA bit for branch instructions.
-+  inline bool aaBit() const { return bit(1); }
-+
-+  // LK bit for branch instructions.
-+  inline bool lkBit() const { return bit(0); }
-+
-+  // OE bit for XO-form arithmetic.
-+  inline bool oeBit() const { return bit(10); }
-+
-+  // L bit for compare instructions (bit 21).
-+  inline bool lBit() const { return bit(21); }
-+
-+  // BF field (bits 23..25) for compares.
-+  inline uint32_t bfValue() const { return bits(25, 23); }
-+
-+  // Reports whether this word is a trap the debugger should step over.
-+  bool isTrap() const {
-+    uint32_t instr = instructionBits();
-+    // PPC_trap = 0x7FE00008 (tw 31,0,0).
-+    // Don't treat the call-redirection instruction or wasm trap as a
-+    // debugger trap.
-+    if (instr == kCallRedirInstr) return false;
-+    if (instr == 0x7FE00008) return false;
-+    // Any other primary-opcode-31 word with extended opcode 4 (tw) is
-+    // reported as a trap. NOTE(review): the TO field is not examined here.
-+    if (opcode() == 31 && (xoValue() == 4)) return true;
-+    return false;
-+  }
-+
-+ private:
-+  // Not constructible or copyable: instances only exist as reinterpreted
-+  // pointers into instruction memory.
-+  SimInstruction() = delete;
-+  SimInstruction(const SimInstruction& other) = delete;
-+  void operator=(const SimInstruction& other) = delete;
-+};
-+
-+// -----------------------------------------------------------------------------
-+// ICache.
-+
-+// One page of the simulated instruction cache: a copy of the page's bytes
-+// plus one validity byte per cache line. Lines are 4 bytes (kLineShift == 2)
-+// and pages are 4096 bytes (kPageShift == 12).
-+class CachePage {
-+ public:
-+  // Values stored in the validity map.
-+  static const int LINE_VALID = 0;
-+  static const int LINE_INVALID = 1;
-+
-+  static const int kPageShift = 12;
-+  static const int kPageSize = 1 << kPageShift;
-+  static const int kPageMask = kPageSize - 1;
-+  static const int kLineShift = 2;
-+  static const int kLineLength = 1 << kLineShift;
-+  static const int kLineMask = kLineLength - 1;
-+
-+  // A new page starts with every line marked invalid.
-+  CachePage() { memset(&validity_map_, LINE_INVALID, sizeof(validity_map_)); }
-+
-+  // Validity byte of the line containing byte |offset| within the page.
-+  char* validityByte(int offset) {
-+    return &validity_map_[offset >> kLineShift];
-+  }
-+
-+  // Cached copy of the byte at |offset| within the page.
-+  char* cachedData(int offset) { return &data_[offset]; }
-+
-+ private:
-+  char data_[kPageSize];  // cached page contents
-+  static const int kValidityMapSize = kPageSize >> kLineShift;
-+  char validity_map_[kValidityMapSize];  // one entry per line
-+};
-+
-+// Scoped guard that holds the process-wide simulator cache lock
-+// (SimulatorProcess::singleton_->cacheLock_) for its lifetime.
-+class AutoLockSimulatorCache : public LockGuard<Mutex> {
-+  using Base = LockGuard<Mutex>;
-+
-+ public:
-+  // Dereferences SimulatorProcess::singleton_, so the singleton must exist.
-+  explicit AutoLockSimulatorCache()
-+      : Base(SimulatorProcess::singleton_->cacheLock_) {}
-+};
-+
-+mozilla::Atomic<size_t, mozilla::ReleaseAcquire>
-+    SimulatorProcess::ICacheCheckingDisableCount(1);
-+SimulatorProcess* SimulatorProcess::singleton_ = nullptr;
-+
-+int64_t Simulator::StopSimAt = -1;
-+
-+// -----------------------------------------------------------------------------
-+// Simulator Create / Destroy.
-+
-+// Allocates and initializes a simulator. Returns nullptr when allocation or
-+// init() fails; otherwise ownership of the instance passes to the caller.
-+Simulator* Simulator::Create() {
-+  auto simulator = MakeUnique<Simulator>();
-+  if (!simulator || !simulator->init()) {
-+    return nullptr;
-+  }
-+
-+  // Optional debugging aid: PPC64_SIM_STOP_AT holds an instruction count
-+  // that is stored in Simulator::StopSimAt when it parses as an integer.
-+  if (char* limitText = getenv("PPC64_SIM_STOP_AT")) {
-+    int64_t limit;
-+    if (sscanf(limitText, "%" PRIi64, &limit) == 1) {
-+      fprintf(stderr, "\nStopping simulation at icount %" PRIi64 "\n", limit);
-+      Simulator::StopSimAt = limit;
-+    }
-+  }
-+
-+  return simulator.release();
-+}
-+
-+// Releases a simulator by handing it to js_delete.
-+void Simulator::Destroy(Simulator* sim) { js_delete(sim); }
-+
-+// -----------------------------------------------------------------------------
-+// Debugger.
-+
-+// Interactive command-line debugger attached to a Simulator instance. It
-+// reads commands from stdin and inspects or steps the simulator it was
-+// constructed with.
-+class ppc64Debugger {
-+ public:
-+  explicit ppc64Debugger(Simulator* sim) : sim_(sim) {}
-+
-+  // Reports a stop reached at |instr| and enters the command loop.
-+  void stop(SimInstruction* instr);
-+  // Runs the interactive command loop.
-+  void debug();
-+  // Print the general registers plus pc/lr/ctr/cr/xer, optionally followed
-+  // by the floating-point registers.
-+  void printAllRegs();
-+  void printAllRegsIncludingFPU();
-+
-+ private:
-+  static const Instr kBreakpointInstr = 0x7FE00008;  // PPC_trap
-+  static const Instr kNopInstr = 0x60000000;          // PPC_nop
-+
-+  Simulator* sim_;  // simulator under inspection; not owned
-+
-+  // Register accessors used by the print commands.
-+  int64_t getRegisterValue(int regnum);
-+  int64_t getFPURegisterValueLong(int regnum);
-+  float getFPURegisterValueFloat(int regnum);
-+  double getFPURegisterValueDouble(int regnum);
-+  // Parses |desc| as a register name or a number; true on success.
-+  bool getValue(const char* desc, int64_t* value);
-+
-+  // The simulator tracks a single breakpoint (break_pc_ / break_instr_).
-+  bool setBreakpoint(SimInstruction* breakpc);
-+  bool deleteBreakpoint(SimInstruction* breakpc);
-+
-+  // Swap the breakpoint location between its original instruction word
-+  // (undo) and the breakpoint instruction (redo).
-+  void undoBreakpoints();
-+  void redoBreakpoints();
-+};
-+
-+// Prints a diagnostic and crashes; for instructions the simulator does not
-+// implement.
-+[[maybe_unused]] static void UNIMPLEMENTED() {
-+  printf("UNIMPLEMENTED instruction.\n");
-+  MOZ_CRASH();
-+}
-+// Prints a diagnostic and crashes; for code paths that must never execute.
-+[[maybe_unused]] static void UNREACHABLE() {
-+  printf("UNREACHABLE instruction.\n");
-+  MOZ_CRASH();
-+}
-+// Prints a diagnostic and crashes; for instructions that are not supported.
-+[[maybe_unused]] static void UNSUPPORTED() {
-+  printf("Unsupported instruction.\n");
-+  MOZ_CRASH();
-+}
-+
-+// Handles a simulator stop: prints the stop message, advances the pc past
-+// the stop sequence, and enters the interactive debugger.
-+// |instr| is not read by this function.
-+void ppc64Debugger::stop(SimInstruction* instr) {
-+  // The stop code is fixed at 0 here; it is never decoded from |instr|.
-+  uint32_t code = 0;
-+  // The message pointer is read from the memory that immediately follows
-+  // the instruction at the current pc.
-+  char* msg = *reinterpret_cast<char**>(sim_->get_pc() +
-+                                        SimInstruction::kInstrSize);
-+  // Remember the first message seen for this stop code.
-+  if (!sim_->watchedStops_[code].desc_) {
-+    sim_->watchedStops_[code].desc_ = msg;
-+  }
-+  if (code != kMaxStopCode) {
-+    printf("Simulator hit stop %u: %s\n", code, msg);
-+  } else {
-+    printf("Simulator hit %s\n", msg);
-+  }
-+  // Resume two instruction slots past the stop.
-+  // NOTE(review): the message is read as a full pointer but only one extra
-+  // 4-byte slot is skipped — confirm this matches what the assembler emits.
-+  sim_->set_pc(sim_->get_pc() + 2 * SimInstruction::kInstrSize);
-+  debug();
-+}
-+
-+// Value of general register |regnum|; the pseudo register kPCRegister maps
-+// to the simulator's program counter.
-+int64_t ppc64Debugger::getRegisterValue(int regnum) {
-+  if (regnum != kPCRegister) {
-+    return sim_->getRegister(regnum);
-+  }
-+  return sim_->get_pc();
-+}
-+
-+// Contents of FPU register |regnum| as returned by
-+// Simulator::getFpuRegister.
-+int64_t ppc64Debugger::getFPURegisterValueLong(int regnum) {
-+  return sim_->getFpuRegister(regnum);
-+}
-+
-+// Contents of FPU register |regnum| as returned by
-+// Simulator::getFpuRegisterFloat.
-+float ppc64Debugger::getFPURegisterValueFloat(int regnum) {
-+  return sim_->getFpuRegisterFloat(regnum);
-+}
-+
-+// Contents of FPU register |regnum| as returned by
-+// Simulator::getFpuRegisterDouble.
-+double ppc64Debugger::getFPURegisterValueDouble(int regnum) {
-+  return sim_->getFpuRegisterDouble(regnum);
-+}
-+
-+// Resolves a debugger argument to a 64-bit value. |desc| may be a register
-+// name, a hexadecimal literal prefixed with "0x", or an unsigned decimal
-+// literal. Returns true and stores the result in |*value| on success.
-+bool ppc64Debugger::getValue(const char* desc, int64_t* value) {
-+  // Register names take precedence over numeric literals.
-+  Register named = Register::FromName(desc);
-+  if (named != InvalidReg) {
-+    *value = getRegisterValue(named.code());
-+    return true;
-+  }
-+
-+  uint64_t* out = reinterpret_cast<uint64_t*>(value);
-+  int converted;
-+  if (strncmp(desc, "0x", 2) == 0) {
-+    converted = sscanf(desc + 2, "%" PRIx64, out);
-+  } else {
-+    converted = sscanf(desc, "%" PRIu64, out);
-+  }
-+  return converted == 1;
-+}
-+
-+// Registers |breakpc| as the breakpoint and remembers the instruction word
-+// currently stored there. Only one breakpoint is tracked: returns false,
-+// changing nothing, when one is already set.
-+bool ppc64Debugger::setBreakpoint(SimInstruction* breakpc) {
-+  const bool slotFree = (sim_->break_pc_ == nullptr);
-+  if (slotFree) {
-+    sim_->break_pc_ = breakpc;
-+    sim_->break_instr_ = breakpc->instructionBits();
-+  }
-+  return slotFree;
-+}
-+
-+// Removes the tracked breakpoint, if any, restoring the saved instruction
-+// word at its address. |breakpc| is ignored; the call always returns true.
-+bool ppc64Debugger::deleteBreakpoint(SimInstruction* breakpc) {
-+  if (auto* active = sim_->break_pc_) {
-+    active->setInstructionBits(sim_->break_instr_);
-+  }
-+
-+  sim_->break_instr_ = 0;
-+  sim_->break_pc_ = nullptr;
-+  return true;
-+}
-+
-+// Writes the saved original instruction word back at the breakpoint
-+// address; does nothing when no breakpoint is set.
-+void ppc64Debugger::undoBreakpoints() {
-+  if (!sim_->break_pc_) {
-+    return;
-+  }
-+  sim_->break_pc_->setInstructionBits(sim_->break_instr_);
-+}
-+
-+// Writes the breakpoint instruction at the breakpoint address; does nothing
-+// when no breakpoint is set.
-+void ppc64Debugger::redoBreakpoints() {
-+  if (!sim_->break_pc_) {
-+    return;
-+  }
-+  sim_->break_pc_->setInstructionBits(kBreakpointInstr);
-+}
-+
-+// Dumps every general register (two per output row, in hex and decimal),
-+// followed by pc, lr, ctr, cr and xer.
-+void ppc64Debugger::printAllRegs() {
-+  for (uint32_t reg = 0; reg < Registers::Total; reg++) {
-+    const int64_t contents = getRegisterValue(reg);
-+    printf("%3s: 0x%016" PRIx64 " %20" PRIi64 "   ", Registers::GetName(reg),
-+           contents, contents);
-+
-+    // End the row after every odd-numbered register.
-+    if ((reg & 1) != 0) {
-+      printf("\n");
-+    }
-+  }
-+  printf("\n");
-+
-+  const int64_t pcValue = getRegisterValue(Simulator::pc);
-+  printf("  pc: 0x%016" PRIx64 "\n", pcValue);
-+  printf("  lr: 0x%016" PRIx64 "\n", sim_->getLR());
-+  printf(" ctr: 0x%016" PRIx64 "\n", sim_->getCTR());
-+  printf("  cr: 0x%08x\n", sim_->getCR());
-+  printf(" xer: 0x%016" PRIx64 "\n", sim_->getXER());
-+}
-+
-+// Dumps the general registers and then every physical FPU register, each
-+// shown as raw bits, as a float and as a double.
-+void ppc64Debugger::printAllRegsIncludingFPU() {
-+  printAllRegs();
-+  printf("\n\n");
-+
-+  uint32_t fpr = 0;
-+  while (fpr < FloatRegisters::TotalPhys) {
-+    printf("%3s: 0x%016" PRIx64 "\tflt: %-8.4g\tdbl: %-16.4g\n",
-+           FloatRegisters::GetName(fpr), getFPURegisterValueLong(fpr),
-+           getFPURegisterValueFloat(fpr), getFPURegisterValueDouble(fpr));
-+    fpr++;
-+  }
-+}
-+
-+// Prints |prompt| and reads one line of any length from stdin. Returns a
-+// NUL-terminated buffer allocated with js_pod_malloc (the trailing newline
-+// is kept) that the caller owns, or nullptr on end of input, read error or
-+// allocation failure.
-+static char* ReadLine(const char* prompt) {
-+  UniqueChars text;
-+  char chunk[256];
-+  int used = 0;
-+
-+  fprintf(stdout, "%s", prompt);
-+  fflush(stdout);
-+
-+  // Accumulate fixed-size chunks until one of them ends with a newline.
-+  bool sawNewline = false;
-+  do {
-+    if (fgets(chunk, sizeof(chunk), stdin) == nullptr) {
-+      return nullptr;
-+    }
-+    int chunkLen = strlen(chunk);
-+    sawNewline = (chunkLen > 0 && chunk[chunkLen - 1] == '\n');
-+
-+    if (text) {
-+      // Grow: allocate a larger buffer and carry over what was read so far.
-+      char* grown = js_pod_malloc<char>(used + chunkLen + 1);
-+      if (!grown) {
-+        return nullptr;
-+      }
-+      memcpy(grown, text.get(), used * sizeof(char));
-+      text.reset(grown);
-+    } else {
-+      text.reset(js_pod_malloc<char>(chunkLen + 1));
-+      if (!text) {
-+        return nullptr;
-+      }
-+    }
-+
-+    memcpy(text.get() + used, chunk, chunkLen * sizeof(char));
-+    used += chunkLen;
-+  } while (!sawNewline);
-+
-+  MOZ_ASSERT(text);
-+  text[used] = '\0';
-+  return text.release();
-+}
-+
-+// Prints the address |pc| and the raw 32-bit instruction word stored there.
-+// No mnemonic decoding is performed.
-+static void DisassembleInstruction(uint64_t pc) {
-+  const uint32_t word = *reinterpret_cast<uint32_t*>(pc);
-+  printf("  0x%016" PRIx64 ":  %08x\n", pc, word);
-+}
-+
-+// Interactive debugger loop. Temporarily removes the breakpoint instruction,
-+// then reads commands from stdin until the user continues, input ends, or
-+// the pc reaches Simulator::end_sim_pc; finally re-installs the breakpoint.
-+// An empty line repeats the previous command. Type "help" for the list of
-+// commands.
-+void ppc64Debugger::debug() {
-+  intptr_t lastPC = -1;
-+  bool done = false;
-+
-+#define COMMAND_SIZE 63
-+#define ARG_SIZE 255
-+
-+#define STR(a) #a
-+#define XSTR(a) STR(a)
-+
-+  char cmd[COMMAND_SIZE + 1];
-+  char arg1[ARG_SIZE + 1];
-+  char arg2[ARG_SIZE + 1];
-+  char* argv[3] = {cmd, arg1, arg2};
-+
-+  // Guarantee termination even if sscanf fills a buffer completely.
-+  cmd[COMMAND_SIZE] = 0;
-+  arg1[ARG_SIZE] = 0;
-+  arg2[ARG_SIZE] = 0;
-+
-+  // Show the real instruction at the breakpoint while the user inspects code.
-+  undoBreakpoints();
-+
-+  while (!done && (sim_->get_pc() != Simulator::end_sim_pc)) {
-+    // Only re-print the current instruction when the pc has moved.
-+    if (lastPC != sim_->get_pc()) {
-+      DisassembleInstruction(sim_->get_pc());
-+      lastPC = sim_->get_pc();
-+    }
-+    char* line = ReadLine("sim> ");
-+    if (line == nullptr) {
-+      break;
-+    } else {
-+      char* last_input = sim_->lastDebuggerInput();
-+      if (strcmp(line, "\n") == 0 && last_input != nullptr) {
-+        // An empty line repeats the previous command. The buffer that was
-+        // just read is no longer needed; free it instead of leaking it.
-+        js_free(line);
-+        line = last_input;
-+      } else {
-+        // Ownership of |line| passes to the simulator.
-+        sim_->setLastDebuggerInput(line);
-+      }
-+      // Clear the token buffers so that a line with fewer than three tokens
-+      // cannot expose uninitialized or stale contents to the checks below.
-+      cmd[0] = 0;
-+      arg1[0] = 0;
-+      arg2[0] = 0;
-+      int argc = sscanf(line,
-+                        "%" XSTR(COMMAND_SIZE) "s "
-+                        "%" XSTR(ARG_SIZE) "s "
-+                        "%" XSTR(ARG_SIZE) "s",
-+                        cmd, arg1, arg2);
-+      if ((strcmp(cmd, "si") == 0) || (strcmp(cmd, "stepi") == 0)) {
-+        // Single step; generated trap instructions are skipped, not run.
-+        SimInstruction* instr =
-+            reinterpret_cast<SimInstruction*>(sim_->get_pc());
-+        if (!instr->isTrap()) {
-+          sim_->instructionDecode(instr);
-+        } else {
-+          printf("/!\\ Jumping over generated breakpoint.\n");
-+          sim_->set_pc(sim_->get_pc() + SimInstruction::kInstrSize);
-+        }
-+        sim_->icount_++;
-+      } else if ((strcmp(cmd, "c") == 0) || (strcmp(cmd, "cont") == 0)) {
-+        // Execute the current instruction, then leave the debugger.
-+        sim_->instructionDecode(
-+            reinterpret_cast<SimInstruction*>(sim_->get_pc()));
-+        sim_->icount_++;
-+        done = true;
-+      } else if ((strcmp(cmd, "p") == 0) || (strcmp(cmd, "print") == 0)) {
-+        if (argc == 2) {
-+          int64_t value;
-+          if (strcmp(arg1, "all") == 0) {
-+            printAllRegs();
-+          } else if (strcmp(arg1, "allf") == 0) {
-+            printAllRegsIncludingFPU();
-+          } else {
-+            Register reg = Register::FromName(arg1);
-+            FloatRegisters::Code fReg = FloatRegisters::FromName(arg1);
-+            if (reg != InvalidReg) {
-+              value = getRegisterValue(reg.code());
-+              printf("%s: 0x%016" PRIx64 " %20" PRIi64 " \n", arg1, value,
-+                     value);
-+            } else if (fReg != FloatRegisters::Invalid) {
-+              printf("%3s: 0x%016" PRIx64 "\tflt: %-8.4g\tdbl: %-16.4g\n",
-+                     FloatRegisters::GetName(fReg),
-+                     getFPURegisterValueLong(fReg),
-+                     getFPURegisterValueFloat(fReg),
-+                     getFPURegisterValueDouble(fReg));
-+            } else {
-+              printf("%s unrecognized\n", arg1);
-+            }
-+          }
-+        } else {
-+          printf("print <register> or print <fpu register> single\n");
-+        }
-+      } else if (strcmp(cmd, "stack") == 0 || strcmp(cmd, "mem") == 0) {
-+        int64_t* cur = nullptr;
-+        int64_t* end = nullptr;
-+        int next_arg = 1;
-+
-+        // "stack" starts at sp; "mem" takes the start address from arg1.
-+        if (strcmp(cmd, "stack") == 0) {
-+          cur = reinterpret_cast<int64_t*>(sim_->getRegister(Simulator::sp));
-+        } else {
-+          int64_t value;
-+          if (!getValue(arg1, &value)) {
-+            printf("%s unrecognized\n", arg1);
-+            continue;
-+          }
-+          cur = reinterpret_cast<int64_t*>(value);
-+          next_arg++;
-+        }
-+
-+        // Optional word count; defaults to 10 when absent or unparsable.
-+        int64_t words;
-+        if (argc == next_arg) {
-+          words = 10;
-+        } else {
-+          if (!getValue(argv[next_arg], &words)) {
-+            words = 10;
-+          }
-+        }
-+        end = cur + words;
-+
-+        while (cur < end) {
-+          printf("  %p:  0x%016" PRIx64 " %20" PRIi64, cur, *cur, *cur);
-+          printf("\n");
-+          cur++;
-+        }
-+
-+      } else if ((strcmp(cmd, "disasm") == 0) || (strcmp(cmd, "dpc") == 0) ||
-+                 (strcmp(cmd, "di") == 0)) {
-+        uint8_t* cur = nullptr;
-+        uint8_t* end = nullptr;
-+
-+        if (argc == 1) {
-+          // No argument: ten instructions starting at pc.
-+          cur = reinterpret_cast<uint8_t*>(sim_->get_pc());
-+          end = cur + (10 * SimInstruction::kInstrSize);
-+        } else if (argc == 2) {
-+          // One argument: a register or 0x-address selects the start;
-+          // anything else is an instruction count from pc.
-+          Register reg = Register::FromName(arg1);
-+          if (reg != InvalidReg || strncmp(arg1, "0x", 2) == 0) {
-+            int64_t value;
-+            if (getValue(arg1, &value)) {
-+              cur = reinterpret_cast<uint8_t*>(value);
-+              end = cur + (10 * SimInstruction::kInstrSize);
-+            }
-+          } else {
-+            int64_t value;
-+            if (getValue(arg1, &value)) {
-+              cur = reinterpret_cast<uint8_t*>(sim_->get_pc());
-+              end = cur + (value * SimInstruction::kInstrSize);
-+            }
-+          }
-+        } else {
-+          // Two arguments: start address and instruction count.
-+          int64_t value1;
-+          int64_t value2;
-+          if (getValue(arg1, &value1) && getValue(arg2, &value2)) {
-+            cur = reinterpret_cast<uint8_t*>(value1);
-+            end = cur + (value2 * SimInstruction::kInstrSize);
-+          }
-+        }
-+
-+        while (cur < end) {
-+          DisassembleInstruction(uint64_t(cur));
-+          cur += SimInstruction::kInstrSize;
-+        }
-+      } else if (strcmp(cmd, "gdb") == 0) {
-+        // Raise a host breakpoint; a no-op on hosts other than those below.
-+        printf("relinquishing control to gdb\n");
-+#if defined(__x86_64__)
-+        asm("int $3");
-+#elif defined(__aarch64__)
-+        asm("brk #0xf000");
-+#endif
-+        printf("regaining control from gdb\n");
-+      } else if (strcmp(cmd, "break") == 0) {
-+        if (argc == 2) {
-+          int64_t value;
-+          if (getValue(arg1, &value)) {
-+            if (!setBreakpoint(reinterpret_cast<SimInstruction*>(value))) {
-+              printf("setting breakpoint failed\n");
-+            }
-+          } else {
-+            printf("%s unrecognized\n", arg1);
-+          }
-+        } else {
-+          printf("break <address>\n");
-+        }
-+      } else if (strcmp(cmd, "del") == 0) {
-+        if (!deleteBreakpoint(nullptr)) {
-+          printf("deleting breakpoint failed\n");
-+        }
-+      } else if (strcmp(cmd, "flags") == 0) {
-+        printf("CR: 0x%08x   XER: 0x%016" PRIx64 "\n", sim_->getCR(),
-+               sim_->getXER());
-+      } else if (strcmp(cmd, "stop") == 0) {
-+        int64_t value;
-+        // stop() advanced the pc by two slots, so the stop sequence begins
-+        // two instruction slots before the current pc.
-+        intptr_t stop_pc = sim_->get_pc() - 2 * SimInstruction::kInstrSize;
-+        SimInstruction* stop_instr =
-+            reinterpret_cast<SimInstruction*>(stop_pc);
-+        SimInstruction* msg_address = reinterpret_cast<SimInstruction*>(
-+            stop_pc + SimInstruction::kInstrSize);
-+        if ((argc == 2) && (strcmp(arg1, "unstop") == 0)) {
-+          // Permanently remove the stop by overwriting it with nops.
-+          if (sim_->isStopInstruction(stop_instr)) {
-+            stop_instr->setInstructionBits(kNopInstr);
-+            msg_address->setInstructionBits(kNopInstr);
-+          } else {
-+            printf("Not at debugger stop.\n");
-+          }
-+        } else if (argc == 3) {
-+          if (strcmp(arg1, "info") == 0) {
-+            if (strcmp(arg2, "all") == 0) {
-+              printf("Stop information:\n");
-+              for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
-+                   i++) {
-+                sim_->printStopInfo(i);
-+              }
-+            } else if (getValue(arg2, &value)) {
-+              sim_->printStopInfo(value);
-+            } else {
-+              printf("Unrecognized argument.\n");
-+            }
-+          } else if (strcmp(arg1, "enable") == 0) {
-+            if (strcmp(arg2, "all") == 0) {
-+              for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
-+                   i++) {
-+                sim_->enableStop(i);
-+              }
-+            } else if (getValue(arg2, &value)) {
-+              sim_->enableStop(value);
-+            } else {
-+              printf("Unrecognized argument.\n");
-+            }
-+          } else if (strcmp(arg1, "disable") == 0) {
-+            if (strcmp(arg2, "all") == 0) {
-+              for (uint32_t i = kMaxWatchpointCode + 1; i <= kMaxStopCode;
-+                   i++) {
-+                sim_->disableStop(i);
-+              }
-+            } else if (getValue(arg2, &value)) {
-+              sim_->disableStop(value);
-+            } else {
-+              printf("Unrecognized argument.\n");
-+            }
-+          }
-+        } else {
-+          printf("Wrong usage. Use help command for more information.\n");
-+        }
-+      } else if ((strcmp(cmd, "h") == 0) || (strcmp(cmd, "help") == 0)) {
-+        printf("cont\n");
-+        printf("  continue execution (alias 'c')\n");
-+        printf("stepi\n");
-+        printf("  step one instruction (alias 'si')\n");
-+        printf("print <register>\n");
-+        printf("  print register content (alias 'p')\n");
-+        printf("  use register name 'all' to print all registers\n");
-+        printf("stack [<words>]\n");
-+        printf("  dump stack content, default dump 10 words)\n");
-+        printf("mem <address> [<words>]\n");
-+        printf("  dump memory content, default dump 10 words)\n");
-+        printf("flags\n");
-+        printf("  print CR and XER\n");
-+        printf("disasm [<instructions>]\n");
-+        printf("disasm [<address/register>]\n");
-+        printf("disasm [[<address/register>] <instructions>]\n");
-+        printf("  disassemble code, default is 10 instructions\n");
-+        printf("  from pc (alias 'di')\n");
-+        printf("gdb\n");
-+        printf("  enter gdb\n");
-+        printf("break <address>\n");
-+        printf("  set a break point on the address\n");
-+        printf("del\n");
-+        printf("  delete the breakpoint\n");
-+      } else {
-+        printf("Unknown command: %s\n", cmd);
-+      }
-+    }
-+  }
-+
-+  // Re-install the breakpoint instruction before execution resumes.
-+  redoBreakpoints();
-+
-+#undef COMMAND_SIZE
-+#undef ARG_SIZE
-+
-+#undef STR
-+#undef XSTR
-+}
-+
-+// -----------------------------------------------------------------------------
-+// ICache helpers.
-+
-+// True when addresses |start| and |start + size| lie on the same
-+// CachePage-sized page.
-+static bool AllOnOnePage(uintptr_t start, int size) {
-+  const intptr_t firstPage = (start & ~CachePage::kPageMask);
-+  const intptr_t lastPage = ((start + size) & ~CachePage::kPageMask);
-+  return !(firstPage != lastPage);
-+}
-+
-+// Replaces the remembered debugger command line. The previous buffer is
-+// released with js_free and the simulator takes ownership of |input|.
-+void Simulator::setLastDebuggerInput(char* input) {
-+  js_free(lastDebuggerInput_);
-+  lastDebuggerInput_ = input;
-+}
-+
-+// Looks up the CachePage registered for |page| in |i_cache|, creating and
-+// inserting a new one when none exists. Allocation or insertion failure
-+// crashes through the OOM-unsafe region rather than returning null.
-+static CachePage* GetCachePageLocked(SimulatorProcess::ICacheMap& i_cache,
-+                                     void* page) {
-+  SimulatorProcess::ICacheMap::AddPtr entry = i_cache.lookupForAdd(page);
-+  if (!entry) {
-+    AutoEnterOOMUnsafeRegion oomUnsafe;
-+    CachePage* fresh = js_new<CachePage>();
-+    if (!fresh || !i_cache.add(entry, page, fresh)) {
-+      oomUnsafe.crash("Simulator CachePage");
-+    }
-+    return fresh;
-+  }
-+  return entry->value();
-+}
-+
-+// Marks the cache lines covering [start, start + size) as invalid. The range
-+// must be line-aligned, a whole number of lines long, and contained in a
-+// single page.
-+static void FlushOnePageLocked(SimulatorProcess::ICacheMap& i_cache,
-+                               intptr_t start, int size) {
-+  MOZ_ASSERT(size <= CachePage::kPageSize);
-+  MOZ_ASSERT(AllOnOnePage(start, size - 1));
-+  MOZ_ASSERT((start & CachePage::kLineMask) == 0);
-+  MOZ_ASSERT((size & CachePage::kLineMask) == 0);
-+
-+  void* pageBase = reinterpret_cast<void*>(start & (~CachePage::kPageMask));
-+  int offsetInPage = (start & CachePage::kPageMask);
-+  int lineCount = size >> CachePage::kLineShift;
-+
-+  CachePage* cachePage = GetCachePageLocked(i_cache, pageBase);
-+  memset(cachePage->validityByte(offsetInPage), CachePage::LINE_INVALID,
-+         lineCount);
-+}
-+
-+static void FlushICacheLocked(SimulatorProcess::ICacheMap& i_cache,
-+                              void* start_addr, size_t size) {
-+  intptr_t start = reinterpret_cast<intptr_t>(start_addr);
-+  int intra_line = (start & CachePage::kLineMask);
-+  start -= intra_line;
-+  size += intra_line;
-+  size = ((size - 1) | CachePage::kLineMask) + 1;
-+  int offset = (start & CachePage::kPageMask);
-+  while (!AllOnOnePage(start, size - 1)) {
-+    int bytes_to_flush = CachePage::kPageSize - offset;
-+    FlushOnePageLocked(i_cache, start, bytes_to_flush);
-+    start += bytes_to_flush;
-+    size -= bytes_to_flush;
-+    MOZ_ASSERT((start & CachePage::kPageMask) == 0);
-+    offset = 0;
-+  }
-+  if (size != 0) {
-+    FlushOnePageLocked(i_cache, start, size);
-+  }
-+}
-+
-+/* static */
-+void SimulatorProcess::checkICacheLocked(SimInstruction* instr) {
-+  intptr_t address = reinterpret_cast<intptr_t>(instr);
-+  void* page = reinterpret_cast<void*>(address & (~CachePage::kPageMask));
-+  void* line = reinterpret_cast<void*>(address & (~CachePage::kLineMask));
-+  int offset = (address & CachePage::kPageMask);
-+  CachePage* cache_page = GetCachePageLocked(icache(), page);
-+  char* cache_valid_byte = cache_page->validityByte(offset);
-+  bool cache_hit = (*cache_valid_byte == CachePage::LINE_VALID);
-+  char* cached_line = cache_page->cachedData(offset & ~CachePage::kLineMask);
-+
-+  if (cache_hit) {
-+    mozilla::DebugOnly<int> cmpret =
-+        memcmp(reinterpret_cast<void*>(instr), cache_page->cachedData(offset),
-+               SimInstruction::kInstrSize);
-+    MOZ_ASSERT(cmpret == 0);
-+  } else {
-+    memcpy(cached_line, line, CachePage::kLineLength);
-+    *cache_valid_byte = CachePage::LINE_VALID;
-+  }
-+}
-+
-+HashNumber SimulatorProcess::ICacheHasher::hash(const Lookup& l) {
-+  return U32(reinterpret_cast<uintptr_t>(l)) >> 2;
-+}
-+
-+bool SimulatorProcess::ICacheHasher::match(const Key& k, const Lookup& l) {
-+  MOZ_ASSERT((reinterpret_cast<intptr_t>(k) & CachePage::kPageMask) == 0);
-+  MOZ_ASSERT((reinterpret_cast<intptr_t>(l) & CachePage::kPageMask) == 0);
-+  return k == l;
-+}
-+
-+/* static */
-+void SimulatorProcess::FlushICache(void* start_addr, size_t size) {
-+  if (!ICacheCheckingDisableCount) {
-+    AutoLockSimulatorCache als;
-+    js::jit::FlushICacheLocked(icache(), start_addr, size);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// Redirection.
-+
-+class Redirection {
-+  friend class SimulatorProcess;
-+
-+  Redirection(void* nativeFunction, ABIFunctionType type)
-+      : nativeFunction_(nativeFunction),
-+        swiInstruction_(kCallRedirInstr),
-+        type_(type),
-+        next_(nullptr) {
-+    next_ = SimulatorProcess::redirection();
-+    if (!SimulatorProcess::ICacheCheckingDisableCount) {
-+      FlushICacheLocked(SimulatorProcess::icache(), addressOfSwiInstruction(),
-+                        SimInstruction::kInstrSize);
-+    }
-+    SimulatorProcess::setRedirection(this);
-+  }
-+
-+ public:
-+  void* addressOfSwiInstruction() { return &swiInstruction_; }
-+  void* nativeFunction() const { return nativeFunction_; }
-+  ABIFunctionType type() const { return type_; }
-+
-+  static Redirection* Get(void* nativeFunction, ABIFunctionType type) {
-+    AutoLockSimulatorCache als;
-+
-+    Redirection* current = SimulatorProcess::redirection();
-+    for (; current != nullptr; current = current->next_) {
-+      if (current->nativeFunction_ == nativeFunction) {
-+        MOZ_ASSERT(current->type() == type);
-+        return current;
-+      }
-+    }
-+
-+    AutoEnterOOMUnsafeRegion oomUnsafe;
-+    Redirection* redir = js_pod_malloc<Redirection>(1);
-+    if (!redir) {
-+      oomUnsafe.crash("Simulator redirection");
-+    }
-+    new (redir) Redirection(nativeFunction, type);
-+    return redir;
-+  }
-+
-+  static Redirection* FromSwiInstruction(SimInstruction* swiInstruction) {
-+    uint8_t* addrOfSwi = reinterpret_cast<uint8_t*>(swiInstruction);
-+    uint8_t* addrOfRedirection =
-+        addrOfSwi - offsetof(Redirection, swiInstruction_);
-+    return reinterpret_cast<Redirection*>(addrOfRedirection);
-+  }
-+
-+ private:
-+  void* nativeFunction_;
-+  uint32_t swiInstruction_;
-+  ABIFunctionType type_;
-+  Redirection* next_;
-+};
-+
-+// -----------------------------------------------------------------------------
-+// Simulator constructor / destructor / init.
-+
-+Simulator::Simulator() {
-+  stack_ = nullptr;
-+  stackLimit_ = 0;
-+  pc_modified_ = false;
-+  icount_ = 0;
-+  break_count_ = 0;
-+  break_pc_ = nullptr;
-+  break_instr_ = 0;
-+  single_stepping_ = false;
-+  single_step_callback_ = nullptr;
-+  single_step_callback_arg_ = nullptr;
-+
-+  for (int i = 0; i < Register::kNumSimuRegisters; i++) {
-+    registers_[i] = 0;
-+  }
-+  for (int i = 0; i < Simulator::FPURegister::kNumFPURegisters; i++) {
-+    FPUregisters_[i] = 0;
-+  }
-+
-+  LR_ = 0;
-+  CTR_ = 0;
-+  CR_ = 0;
-+  XER_ = 0;
-+  FPSCR_ = 0;
-+  LLBit_ = false;
-+  LLAddr_ = 0;
-+  lastLLValue_ = 0;
-+
-+  // Initialize PC and LR to a known bad value that will cause an
-+  // access violation if the simulator ever tries to execute it.
-+  registers_[pc] = bad_ra;
-+  LR_ = bad_ra;
-+
-+  lastDebuggerInput_ = nullptr;
-+}
-+
-+bool Simulator::init() {
-+  static const size_t stackSize = 2 * 1024 * 1024;
-+  stack_ = js_pod_malloc<char>(stackSize);
-+  if (!stack_) {
-+    return false;
-+  }
-+
-+  // Leave a safety margin of 1MB to prevent overrunning the stack.
-+  stackLimit_ = reinterpret_cast<uintptr_t>(stack_) + 1024 * 1024;
-+
-+  // The sp is initialized to point to the bottom (high address) of the
-+  // allocated stack area.
-+  registers_[sp] = reinterpret_cast<int64_t>(stack_) + stackSize - 64;
-+
-+  // Zero-initialize VR namespace. Simulated PPC64 does not guarantee any
-+  // value in VRs at entry, but zeroing avoids uninitialized-read false
-+  // positives in tools and makes regression traces deterministic.
-+  memset(VRregisters_, 0, sizeof(VRregisters_));
-+
-+  return true;
-+}
-+
-+Simulator::~Simulator() { js_free(stack_); }
-+
-+SimulatorProcess::SimulatorProcess()
-+    : cacheLock_(mutexid::SimulatorCacheLock), redirection_(nullptr) {
-+  if (getenv("PPC64_SIM_ICACHE_CHECKS")) {
-+    ICacheCheckingDisableCount = 0;
-+  }
-+}
-+
-+SimulatorProcess::~SimulatorProcess() {
-+  Redirection* r = redirection_;
-+  while (r) {
-+    Redirection* next = r->next_;
-+    js_delete(r);
-+    r = next;
-+  }
-+}
-+
-+/* static */
-+void* Simulator::RedirectNativeFunction(void* nativeFunction,
-+                                        ABIFunctionType type) {
-+  Redirection* redirection = Redirection::Get(nativeFunction, type);
-+  return redirection->addressOfSwiInstruction();
-+}
-+
-+Simulator* Simulator::Current() {
-+  JSContext* cx = TlsContext.get();
-+  MOZ_ASSERT(CurrentThreadCanAccessRuntime(cx->runtime()));
-+  return cx->simulator();
-+}
-+
-+// -----------------------------------------------------------------------------
-+// Register accessors.
-+
-+void Simulator::setRegister(int reg, int64_t value) {
-+  MOZ_ASSERT((reg >= 0) && (reg < Register::kNumSimuRegisters));
-+  if (reg == pc) {
-+    pc_modified_ = true;
-+  }
-+  registers_[reg] = value;
-+}
-+
-+int64_t Simulator::getRegister(int reg) const {
-+  MOZ_ASSERT((reg >= 0) && (reg < Register::kNumSimuRegisters));
-+  return registers_[reg] + ((reg == pc) ? SimInstruction::kPCReadOffset : 0);
-+}
-+
-+void Simulator::setFpuRegister(int fpureg, int64_t value) {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  FPUregisters_[fpureg] = value;
-+}
-+
-+void Simulator::setFpuRegisterWord(int fpureg, int32_t value) {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  int32_t* pword;
-+  pword = reinterpret_cast<int32_t*>(&FPUregisters_[fpureg]);
-+  *pword = value;
-+}
-+
-+// Promote f32 → f64 preserving NaN payload, like PPC64's `lfs` and
-+// `xscvspdpn`. The plain C cast `(double)f32_nan` is permitted by the
-+// standard to quiet a signaling NaN, which on x86/ARM hosts visibly
-+// transforms 0x7FA00000 (sNaN) into a qNaN such as 0x7FE00000 — breaking
-+// every wasm test that loads a constant sNaN bit pattern. Manually
-+// reconstruct the f64 NaN with the same sign + payload (payload shifted
-+// left by 29 to fill the wider mantissa).
-+static double promoteFloatPreservingNaN(float f) {
-+  uint32_t bits;
-+  memcpy(&bits, &f, sizeof(bits));
-+  if ((bits & 0x7F800000u) == 0x7F800000u && (bits & 0x007FFFFFu) != 0u) {
-+    uint64_t sign = uint64_t(bits >> 31) & 1u;
-+    uint64_t payload = uint64_t(bits & 0x007FFFFFu);
-+    uint64_t dbits = (sign << 63) | (uint64_t(0x7FFu) << 52) | (payload << 29);
-+    double d;
-+    memcpy(&d, &dbits, sizeof(d));
-+    return d;
-+  }
-+  return (double)f;
-+}
-+
-+// Demote f64 → f32 preserving NaN payload (non-signaling: matches PPC64
-+// `stfs` / `xscvdpspn`, and wasm `lfs`-equivalent stores). Truncates the
-+// lower 29 bits of the f64 payload (those bits cannot be represented in
-+// the narrower f32 mantissa); if the truncation would yield a payload of
-+// zero (which would degrade the NaN to an Infinity), force the LSB so
-+// the result is still a NaN. This intentionally does NOT set the quiet
-+// bit — that's the job of the explicit-quieting op `xscvdpsp` and
-+// f32.demote_f64's wasm-level lowering.
-+static float demoteDoublePreservingNaN(double d) {
-+  uint64_t bits;
-+  memcpy(&bits, &d, sizeof(bits));
-+  if ((bits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
-+      (bits & 0x000FFFFFFFFFFFFFULL) != 0) {
-+    uint32_t sign = uint32_t(bits >> 63) & 1u;
-+    uint32_t payload = uint32_t((bits >> 29) & 0x007FFFFFu);
-+    if (payload == 0) payload = 1;
-+    uint32_t fbits = (sign << 31) | 0x7F800000u | payload;
-+    float f;
-+    memcpy(&f, &fbits, sizeof(f));
-+    return f;
-+  }
-+  return (float)d;
-+}
-+
-+void Simulator::setFpuRegisterFloat(int fpureg, float value) {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  // ELFv2 ABI: single-precision values in FPRs are stored as their
-+  // double-precision representation. Promote and store the full 8 bytes,
-+  // not just the low 4. (Otherwise the upper 4 bytes are stale, matching
-+  // the layout that fctid/fcfid/lfd would read but NOT what the JIT and
-+  // the C ABI expect for a 'float' parameter.) Use the NaN-preserving
-+  // helper so a signaling-NaN return value isn't quieted into a qNaN.
-+  double promoted = promoteFloatPreservingNaN(value);
-+  memcpy(&FPUregisters_[fpureg], &promoted, sizeof(promoted));
-+}
-+
-+void Simulator::setFpuRegisterDouble(int fpureg, double value) {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  *mozilla::BitwiseCast<double*>(&FPUregisters_[fpureg]) = value;
-+}
-+
-+int64_t Simulator::getFpuRegister(int fpureg) const {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  return FPUregisters_[fpureg];
-+}
-+
-+int32_t Simulator::getFpuRegisterWord(int fpureg) const {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  return *mozilla::BitwiseCast<int32_t*>(&FPUregisters_[fpureg]);
-+}
-+
-+int32_t Simulator::getFpuRegisterSignedWord(int fpureg) const {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  return *mozilla::BitwiseCast<int32_t*>(&FPUregisters_[fpureg]);
-+}
-+
-+float Simulator::getFpuRegisterFloat(int fpureg) const {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  // ELFv2 ABI: single-precision values are passed/returned in FPRs as their
-+  // double-precision representation. Read the full 8 bytes as double, then
-+  // narrow to float — matching the `frsp` the C callee would do, and matching
-+  // what real PPC64 hardware sees when the FPR was loaded via `lfs`. Use the
-+  // NaN-preserving helper so a signaling-NaN parameter isn't quieted.
-+  double promoted;
-+  memcpy(&promoted, &FPUregisters_[fpureg], sizeof(promoted));
-+  return demoteDoublePreservingNaN(promoted);
-+}
-+
-+double Simulator::getFpuRegisterDouble(int fpureg) const {
-+  MOZ_ASSERT((fpureg >= 0) &&
-+             (fpureg < Simulator::FPURegister::kNumFPURegisters));
-+  return *mozilla::BitwiseCast<double*>(&FPUregisters_[fpureg]);
-+}
-+
-+void Simulator::setVRBytes(int vreg, const uint8_t bytes[16]) {
-+  MOZ_ASSERT((vreg >= 0) && (vreg < kNumVRRegisters));
-+  memcpy(VRregisters_[vreg], bytes, 16);
-+}
-+
-+void Simulator::getVRBytes(int vreg, uint8_t bytes[16]) const {
-+  MOZ_ASSERT((vreg >= 0) && (vreg < kNumVRRegisters));
-+  memcpy(bytes, VRregisters_[vreg], 16);
-+}
-+
-+void Simulator::getVSR128(int vsr, uint8_t bytes[16]) const {
-+  MOZ_ASSERT((vsr >= 0) && (vsr < kNumFPURegisters + kNumVRRegisters));
-+  if (vsr < kNumFPURegisters) {
-+    // VSR 0-31: FPR view. The FPR scalar lives in BE DW0 of the VSR,
-+    // which on PPC64LE register storage maps to LE bytes 8-15.
-+    // DW1 is undefined per ISA; we model it as zero.
-+    // `lfd f0,(mem); xxlor <vr>,f0,f0; stxvx <vr>,...` writes the
-+    // double's 8 bytes to the HIGH half of the 16-byte store (LE
-+    // bytes 8-15).
-+    int64_t val = FPUregisters_[vsr];
-+    memset(bytes, 0, 8);
-+    memcpy(bytes + 8, &val, 8);
-+  } else {
-+    memcpy(bytes, VRregisters_[vsr - kNumFPURegisters], 16);
-+  }
-+}
-+
-+void Simulator::setVSR128(int vsr, const uint8_t bytes[16]) {
-+  MOZ_ASSERT((vsr >= 0) && (vsr < kNumFPURegisters + kNumVRRegisters));
-+  if (vsr < kNumFPURegisters) {
-+    // FPR scalar at BE DW0 = LE bytes 8-15. DW1 is architecturally
-+    // discarded on VSR-to-FPR writes.
-+    int64_t val;
-+    memcpy(&val, bytes + 8, 8);
-+    FPUregisters_[vsr] = val;
-+  } else {
-+    memcpy(VRregisters_[vsr - kNumFPURegisters], bytes, 16);
-+  }
-+}
-+
-+void Simulator::setCallResultDouble(double result) {
-+  setFpuRegisterDouble(Simulator::f1, result);
-+}
-+
-+void Simulator::setCallResultFloat(float result) {
-+  setFpuRegisterFloat(Simulator::f1, result);
-+}
-+
-+void Simulator::setCallResult(int64_t res) { setRegister(r3, res); }
-+
-+#ifdef XP_DARWIN
-+void Simulator::setCallResult(intptr_t res) {
-+  setRegister(r3, I64(res));
-+}
-+#endif
-+
-+void Simulator::setCallResult(__int128 res) {
-+  setRegister(r3, I64(res));
-+  setRegister(r4, I64(res >> 64));
-+}
-+
-+void Simulator::set_pc(int64_t value) {
-+  pc_modified_ = true;
-+  registers_[pc] = value;
-+}
-+
-+bool Simulator::has_bad_pc() const {
-+  return ((registers_[pc] == bad_ra) || (registers_[pc] == end_sim_pc));
-+}
-+
-+int64_t Simulator::get_pc() const { return registers_[pc]; }
-+
-+JS::ProfilingFrameIterator::RegisterState Simulator::registerState() {
-+  wasm::RegisterState state;
-+  state.pc = (void*)get_pc();
-+  state.fp = (void*)getRegister(fp);
-+  state.sp = (void*)getRegister(sp);
-+  state.lr = (void*)getLR();
-+  return state;
-+}
-+
-+// -----------------------------------------------------------------------------
-+// Memory access helpers.
-+
-+uint8_t Simulator::readBU(uint64_t addr) {
-+  if (handleWasmSegFault(addr, 1)) {
-+    return 0xff;
-+  }
-+  uint8_t* ptr = reinterpret_cast<uint8_t*>(addr);
-+  return *ptr;
-+}
-+
-+int8_t Simulator::readB(uint64_t addr) {
-+  if (handleWasmSegFault(addr, 1)) {
-+    return -1;
-+  }
-+  int8_t* ptr = reinterpret_cast<int8_t*>(addr);
-+  return *ptr;
-+}
-+
-+void Simulator::writeB(uint64_t addr, uint8_t value) {
-+  if (handleWasmSegFault(addr, 1)) {
-+    return;
-+  }
-+  uint8_t* ptr = reinterpret_cast<uint8_t*>(addr);
-+  *ptr = value;
-+}
-+
-+void Simulator::writeB(uint64_t addr, int8_t value) {
-+  if (handleWasmSegFault(addr, 1)) {
-+    return;
-+  }
-+  int8_t* ptr = reinterpret_cast<int8_t*>(addr);
-+  *ptr = value;
-+}
-+
-+uint16_t Simulator::readHU(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 2)) {
-+    return 0xffff;
-+  }
-+  uint16_t* ptr = reinterpret_cast<uint16_t*>(addr);
-+  return *ptr;
-+}
-+
-+int16_t Simulator::readH(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 2)) {
-+    return -1;
-+  }
-+  int16_t* ptr = reinterpret_cast<int16_t*>(addr);
-+  return *ptr;
-+}
-+
-+void Simulator::writeH(uint64_t addr, uint16_t value, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 2)) {
-+    return;
-+  }
-+  uint16_t* ptr = reinterpret_cast<uint16_t*>(addr);
-+  LLBit_ = false;
-+  *ptr = value;
-+}
-+
-+void Simulator::writeH(uint64_t addr, int16_t value, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 2)) {
-+    return;
-+  }
-+  int16_t* ptr = reinterpret_cast<int16_t*>(addr);
-+  LLBit_ = false;
-+  *ptr = value;
-+}
-+
-+uint32_t Simulator::readWU(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 4)) {
-+    return -1;
-+  }
-+  uint32_t* ptr = reinterpret_cast<uint32_t*>(addr);
-+  return *ptr;
-+}
-+
-+int32_t Simulator::readW(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 4)) {
-+    return -1;
-+  }
-+  int32_t* ptr = reinterpret_cast<int32_t*>(addr);
-+  return *ptr;
-+}
-+
-+void Simulator::writeW(uint64_t addr, uint32_t value, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 4)) {
-+    return;
-+  }
-+  uint32_t* ptr = reinterpret_cast<uint32_t*>(addr);
-+  LLBit_ = false;
-+  *ptr = value;
-+}
-+
-+void Simulator::writeW(uint64_t addr, int32_t value, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 4)) {
-+    return;
-+  }
-+  int32_t* ptr = reinterpret_cast<int32_t*>(addr);
-+  LLBit_ = false;
-+  *ptr = value;
-+}
-+
-+int64_t Simulator::readDW(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 8)) {
-+    return -1;
-+  }
-+  int64_t* ptr = reinterpret_cast<int64_t*>(addr);
-+  return *ptr;
-+}
-+
-+void Simulator::writeDW(uint64_t addr, int64_t value, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 8)) {
-+    return;
-+  }
-+  int64_t* ptr = reinterpret_cast<int64_t*>(addr);
-+  LLBit_ = false;
-+  *ptr = value;
-+}
-+
-+double Simulator::readD(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 8)) {
-+    return NAN;
-+  }
-+  double* ptr = reinterpret_cast<double*>(addr);
-+  return *ptr;
-+}
-+
-+void Simulator::writeD(uint64_t addr, double value, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 8)) {
-+    return;
-+  }
-+  double* ptr = reinterpret_cast<double*>(addr);
-+  LLBit_ = false;
-+  *ptr = value;
-+}
-+
-+// Byte-wide load-reserve / store-conditional (lbarx / stbcx.).
-+// Byte accesses have no alignment requirement.
-+uint8_t Simulator::loadLinkedB(uint64_t addr, SimInstruction* instr) {
-+  if (handleWasmSegFault(addr, 1)) {
-+    return 0;
-+  }
-+  volatile uint8_t* ptr = reinterpret_cast<volatile uint8_t*>(addr);
-+  uint8_t value = *ptr;
-+  lastLLValue_ = value;
-+  LLAddr_ = addr;
-+  LLBit_ = true;
-+  return value;
-+}
-+
-+int Simulator::storeConditionalB(uint64_t addr, uint8_t value,
-+                                 SimInstruction* instr) {
-+  if (addr != LLAddr_) {
-+    printf("stbcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
-+           ", expected: 0x%016" PRIxPTR "\n",
-+           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
-+    MOZ_CRASH();
-+  }
-+  SharedMem<uint8_t*> ptr =
-+      SharedMem<uint8_t*>::shared(reinterpret_cast<uint8_t*>(addr));
-+  if (!LLBit_) {
-+    return 0;
-+  }
-+  LLBit_ = false;
-+  LLAddr_ = 0;
-+  uint8_t expected = uint8_t(lastLLValue_);
-+  uint8_t old =
-+      AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
-+  return (old == expected) ? 1 : 0;
-+}
-+
-+// Halfword-wide load-reserve / store-conditional (lharx / sthcx.).
-+// 2-byte aligned per ISA.
-+uint16_t Simulator::loadLinkedH(uint64_t addr, SimInstruction* instr) {
-+  if ((addr & 1) == 0) {
-+    if (handleWasmSegFault(addr, 2)) {
-+      return 0;
-+    }
-+    volatile uint16_t* ptr = reinterpret_cast<volatile uint16_t*>(addr);
-+    uint16_t value = *ptr;
-+    lastLLValue_ = value;
-+    LLAddr_ = addr;
-+    LLBit_ = true;
-+    return value;
-+  }
-+  printf("Unaligned lharx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
-+         reinterpret_cast<intptr_t>(instr));
-+  MOZ_CRASH();
-+  return 0;
-+}
-+
-+int Simulator::storeConditionalH(uint64_t addr, uint16_t value,
-+                                 SimInstruction* instr) {
-+  if (addr != LLAddr_) {
-+    printf("sthcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
-+           ", expected: 0x%016" PRIxPTR "\n",
-+           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
-+    MOZ_CRASH();
-+  }
-+  if ((addr & 1) == 0) {
-+    SharedMem<uint16_t*> ptr =
-+        SharedMem<uint16_t*>::shared(reinterpret_cast<uint16_t*>(addr));
-+    if (!LLBit_) {
-+      return 0;
-+    }
-+    LLBit_ = false;
-+    LLAddr_ = 0;
-+    uint16_t expected = uint16_t(lastLLValue_);
-+    uint16_t old =
-+        AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
-+    return (old == expected) ? 1 : 0;
-+  }
-+  printf("Unaligned sthcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
-+         reinterpret_cast<intptr_t>(instr));
-+  MOZ_CRASH();
-+  return 0;
-+}
-+
-+int32_t Simulator::loadLinkedW(uint64_t addr, SimInstruction* instr) {
-+  if ((addr & 3) == 0) {
-+    if (handleWasmSegFault(addr, 4)) {
-+      return -1;
-+    }
-+
-+    volatile int32_t* ptr = reinterpret_cast<volatile int32_t*>(addr);
-+    int32_t value = *ptr;
-+    lastLLValue_ = value;
-+    LLAddr_ = addr;
-+    LLBit_ = true;
-+    return value;
-+  }
-+  printf("Unaligned lwarx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
-+         reinterpret_cast<intptr_t>(instr));
-+  MOZ_CRASH();
-+  return 0;
-+}
-+
-+int Simulator::storeConditionalW(uint64_t addr, int32_t value,
-+                                 SimInstruction* instr) {
-+  if (addr != LLAddr_) {
-+    printf("stwcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
-+           ", expected: 0x%016" PRIxPTR "\n",
-+           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
-+    MOZ_CRASH();
-+  }
-+
-+  if ((addr & 3) == 0) {
-+    SharedMem<int32_t*> ptr =
-+        SharedMem<int32_t*>::shared(reinterpret_cast<int32_t*>(addr));
-+
-+    if (!LLBit_) {
-+      return 0;
-+    }
-+
-+    LLBit_ = false;
-+    LLAddr_ = 0;
-+    int32_t expected = int32_t(lastLLValue_);
-+    int32_t old =
-+        AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
-+    return (old == expected) ? 1 : 0;
-+  }
-+  printf("Unaligned stwcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
-+         reinterpret_cast<intptr_t>(instr));
-+  MOZ_CRASH();
-+  return 0;
-+}
-+
-+int64_t Simulator::loadLinkedD(uint64_t addr, SimInstruction* instr) {
-+  if ((addr & kPointerAlignmentMask) == 0) {
-+    if (handleWasmSegFault(addr, 8)) {
-+      return -1;
-+    }
-+
-+    volatile int64_t* ptr = reinterpret_cast<volatile int64_t*>(addr);
-+    int64_t value = *ptr;
-+    lastLLValue_ = value;
-+    LLAddr_ = addr;
-+    LLBit_ = true;
-+    return value;
-+  }
-+  printf("Unaligned ldarx at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
-+         reinterpret_cast<intptr_t>(instr));
-+  MOZ_CRASH();
-+  return 0;
-+}
-+
-+int Simulator::storeConditionalD(uint64_t addr, int64_t value,
-+                                 SimInstruction* instr) {
-+  if (addr != LLAddr_) {
-+    printf("stdcx. to bad address: 0x%016" PRIx64 ", pc=0x%016" PRIxPTR
-+           ", expected: 0x%016" PRIxPTR "\n",
-+           addr, reinterpret_cast<intptr_t>(instr), LLAddr_);
-+    MOZ_CRASH();
-+  }
-+
-+  if ((addr & kPointerAlignmentMask) == 0) {
-+    SharedMem<int64_t*> ptr =
-+        SharedMem<int64_t*>::shared(reinterpret_cast<int64_t*>(addr));
-+
-+    if (!LLBit_) {
-+      return 0;
-+    }
-+
-+    LLBit_ = false;
-+    LLAddr_ = 0;
-+    int64_t expected = lastLLValue_;
-+    int64_t old =
-+        AtomicOperations::compareExchangeSeqCst(ptr, expected, value);
-+    return (old == expected) ? 1 : 0;
-+  }
-+  printf("Unaligned stdcx. at 0x%016" PRIx64 ", pc=0x%016" PRIxPTR "\n", addr,
-+         reinterpret_cast<intptr_t>(instr));
-+  MOZ_CRASH();
-+  return 0;
-+}
-+
-+// -----------------------------------------------------------------------------
-+// Stack limit / recursion helpers.
-+
-+uintptr_t Simulator::stackLimit() const { return stackLimit_; }
-+
-+uintptr_t* Simulator::addressOfStackLimit() { return &stackLimit_; }
-+
-+bool Simulator::overRecursed(uintptr_t newsp) const {
-+  if (newsp == 0) {
-+    newsp = getRegister(sp);
-+  }
-+  return newsp <= stackLimit();
-+}
-+
-+bool Simulator::overRecursedWithExtra(uint32_t extra) const {
-+  uintptr_t newsp = getRegister(sp) - extra;
-+  return newsp <= stackLimit();
-+}
-+
-+void Simulator::format(SimInstruction* instr, const char* format) {
-+  printf("Simulator found unsupported instruction:\n 0x%016" PRIxPTR
-+         ": %08x %s\n",
-+         reinterpret_cast<intptr_t>(instr), instr->instructionBits(), format);
-+  MOZ_CRASH();
-+}
-+
-+// -----------------------------------------------------------------------------
-+// softwareInterrupt - handle kCallRedirInstr (PPC_stop) and PPC_trap.
-+
-+ABI_FUNCTION_TYPE_SIM_PROTOTYPES
-+
-+void Simulator::softwareInterrupt(SimInstruction* instr) {
-+  uint32_t instrBits = instr->instructionBits();
-+
-+  if (instrBits == kCallRedirInstr) {
-+    Redirection* redirection = Redirection::FromSwiInstruction(instr);
-+    uintptr_t nativeFn =
-+        reinterpret_cast<uintptr_t>(redirection->nativeFunction());
-+
-+    // Get the SP for reading stack arguments.
-+    int64_t* sp_ = reinterpret_cast<int64_t*>(getRegister(sp));
-+    // Skip past the PPC64 ELFv2 link area (4 doublewords = 32 bytes).
-+    sp_ = reinterpret_cast<int64_t*>(reinterpret_cast<uintptr_t>(sp_) + 32);
-+
-+    // PPC64 ELFv2: integer args in r3-r10, FP args in f1-f13.
-+    int64_t a0_ = getRegister(r3);
-+    int64_t a1_ = getRegister(r4);
-+    int64_t a2_ = getRegister(r5);
-+    int64_t a3_ = getRegister(r6);
-+    int64_t a4_ = getRegister(r7);
-+    int64_t a5_ = getRegister(r8);
-+    int64_t a6_ = getRegister(r9);
-+    int64_t a7_ = getRegister(r10);
-+    // PPC64 ELFv2: FP args in f1-f13, mapped to f0_s..f12_s and f0_d..f12_d.
-+    float f0_s = getFpuRegisterFloat(Simulator::f1);
-+    float f1_s = getFpuRegisterFloat(Simulator::f2);
-+    float f2_s = getFpuRegisterFloat(Simulator::f3);
-+    float f3_s = getFpuRegisterFloat(Simulator::f4);
-+    float f4_s = getFpuRegisterFloat(Simulator::f5);
-+    float f5_s = getFpuRegisterFloat(Simulator::f6);
-+    float f6_s = getFpuRegisterFloat(Simulator::f7);
-+    float f7_s = getFpuRegisterFloat(Simulator::f8);
-+    float f8_s = getFpuRegisterFloat(Simulator::f9);
-+    float f9_s = getFpuRegisterFloat(Simulator::f10);
-+    float f10_s = getFpuRegisterFloat(Simulator::f11);
-+    float f11_s = getFpuRegisterFloat(Simulator::f12);
-+    float f12_s = getFpuRegisterFloat(Simulator::f13);
-+    double f0_d = getFpuRegisterDouble(Simulator::f1);
-+    double f1_d = getFpuRegisterDouble(Simulator::f2);
-+    double f2_d = getFpuRegisterDouble(Simulator::f3);
-+    double f3_d = getFpuRegisterDouble(Simulator::f4);
-+    double f4_d = getFpuRegisterDouble(Simulator::f5);
-+    double f5_d = getFpuRegisterDouble(Simulator::f6);
-+    double f6_d = getFpuRegisterDouble(Simulator::f7);
-+    double f7_d = getFpuRegisterDouble(Simulator::f8);
-+    double f8_d = getFpuRegisterDouble(Simulator::f9);
-+    double f9_d = getFpuRegisterDouble(Simulator::f10);
-+    double f10_d = getFpuRegisterDouble(Simulator::f11);
-+    double f11_d = getFpuRegisterDouble(Simulator::f12);
-+    double f12_d = getFpuRegisterDouble(Simulator::f13);
-+
-+    // Suppress unused-variable warnings for higher FP arg registers.
-+    // They exist for ABI completeness but few function types use >5 FP args.
-+    (void)f4_s; (void)f5_s; (void)f6_s; (void)f7_s; (void)f8_s; (void)f9_s;
-+    (void)f10_s; (void)f11_s; (void)f12_s;
-+    (void)f4_d; (void)f5_d; (void)f6_d; (void)f7_d; (void)f8_d; (void)f9_d;
-+    (void)f10_d; (void)f11_d; (void)f12_d;
-+
-+    int64_t saved_lr = getLR();
-+
-+    bool stack_aligned = (getRegister(sp) & (ABIStackAlignment - 1)) == 0;
-+    if (!stack_aligned) {
-+      fprintf(stderr, "Runtime call with unaligned stack!\n");
-+      MOZ_CRASH();
-+    }
-+
-+    if (single_stepping_) {
-+      single_step_callback_(single_step_callback_arg_, this, nullptr);
-+    }
-+
-+    switch (redirection->type()) {
-+      ABI_FUNCTION_TYPE_PPC64_SIM_DISPATCH
-+
-+      default:
-+        MOZ_CRASH("Unknown function type.");
-+    }
-+
-+    if (single_stepping_) {
-+      single_step_callback_(single_step_callback_arg_, this, nullptr);
-+    }
-+
-+    setLR(saved_lr);
-+    set_pc(getLR());
-+  } else if (instrBits == 0x7FE00008) {
-+    // PPC_trap: used for wasm traps.
-+    uint8_t* newPC;
-+    if (wasm::HandleIllegalInstruction(registerState(), &newPC)) {
-+      set_pc(int64_t(newPC));
-+      return;
-+    }
-+    MOZ_CRASH("Unexpected trap instruction");
-+  } else {
-+    // Other trap-like instructions: enter debugger.
-+    ppc64Debugger dbg(this);
-+    dbg.debug();
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// Stop/breakpoint helpers.
-+
-+bool Simulator::isWatchpoint(uint32_t code) {
-+  return (code <= kMaxWatchpointCode);
-+}
-+
-+void Simulator::printWatchpoint(uint32_t code) {
-+  ppc64Debugger dbg(this);
-+  ++break_count_;
-+  printf("\n---- break %d marker: %20" PRIi64 "  (instr count: %20" PRIi64
-+         ") ----\n",
-+         code, break_count_, icount_);
-+  dbg.printAllRegs();
-+}
-+
-+void Simulator::handleStop(uint32_t code, SimInstruction* instr) {
-+  if (isEnabledStop(code)) {
-+    ppc64Debugger dbg(this);
-+    dbg.stop(instr);
-+  } else {
-+    set_pc(get_pc() + SimInstruction::kInstrSize);
-+  }
-+}
-+
-+bool Simulator::isStopInstruction(SimInstruction* instr) {
-+  return instr->instructionBits() == kCallRedirInstr;
-+}
-+
-+bool Simulator::isEnabledStop(uint32_t code) {
-+  MOZ_ASSERT(code <= kMaxStopCode);
-+  MOZ_ASSERT(code > kMaxWatchpointCode);
-+  return !(watchedStops_[code].count_ & kStopDisabledBit);
-+}
-+
-+void Simulator::enableStop(uint32_t code) {
-+  if (!isEnabledStop(code)) {
-+    watchedStops_[code].count_ &= ~kStopDisabledBit;
-+  }
-+}
-+
-+void Simulator::disableStop(uint32_t code) {
-+  if (isEnabledStop(code)) {
-+    watchedStops_[code].count_ |= kStopDisabledBit;
-+  }
-+}
-+
-+void Simulator::increaseStopCounter(uint32_t code) {
-+  MOZ_ASSERT(code <= kMaxStopCode);
-+  if ((watchedStops_[code].count_ & ~(1 << 31)) == 0x7fffffff) {
-+    printf(
-+        "Stop counter for code %i has overflowed.\n"
-+        "Enabling this code and reseting the counter to 0.\n",
-+        code);
-+    watchedStops_[code].count_ = 0;
-+    enableStop(code);
-+  } else {
-+    watchedStops_[code].count_++;
-+  }
-+}
-+
-+void Simulator::printStopInfo(uint32_t code) {
-+  if (code <= kMaxWatchpointCode) {
-+    printf("That is a watchpoint, not a stop.\n");
-+    return;
-+  } else if (code > kMaxStopCode) {
-+    printf("Code too large, only %u stops can be used\n", kMaxStopCode + 1);
-+    return;
-+  }
-+  const char* state = isEnabledStop(code) ? "Enabled" : "Disabled";
-+  int32_t count = watchedStops_[code].count_ & ~kStopDisabledBit;
-+  if (count != 0) {
-+    if (watchedStops_[code].desc_) {
-+      printf("stop %i - 0x%x: \t%s, \tcounter = %i, \t%s\n", code, code,
-+             state, count, watchedStops_[code].desc_);
-+    } else {
-+      printf("stop %i - 0x%x: \t%s, \tcounter = %i\n", code, code, state,
-+             count);
-+    }
-+  }
-+}
-+
-+// =============================================================================
-+// Instruction decoders.
-+// =============================================================================
-+
-+// Compute effective address for D-form instructions.
-+// If RA==0, the base is 0 (not GPR[0]).
-+static inline int64_t DFormEA(Simulator* sim, SimInstruction* instr,
-+                              int16_t offset) {
-+  uint32_t ra = instr->raValue();
-+  int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
-+  return base + offset;
-+}
-+
-+// Compute effective address for DS-form instructions.
-+static inline int64_t DSFormEA(Simulator* sim, SimInstruction* instr,
-+                               int16_t offset) {
-+  uint32_t ra = instr->raValue();
-+  int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
-+  return base + offset;
-+}
-+
-+// Compute effective address for X-form indexed instructions.
-+// If RA==0, base is 0 (not GPR[0]).
-+static inline int64_t XFormEA(Simulator* sim, SimInstruction* instr) {
-+  uint32_t ra = instr->raValue();
-+  uint32_t rb = instr->rbValue();
-+  int64_t base = (ra == 0) ? 0 : sim->getRegister(ra);
-+  return base + sim->getRegister(rb);
-+}
-+
-+// Compute effective address for X-form indexed updates (RA != 0 required).
-+static inline int64_t XFormEAUpdate(Simulator* sim, SimInstruction* instr) {
-+  uint32_t ra = instr->raValue();
-+  uint32_t rb = instr->rbValue();
-+  return sim->getRegister(ra) + sim->getRegister(rb);
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeDFormALU: addi, addis, ori, oris, xori, xoris, andi., andis.,
-+//                 cmpi, cmpli, subfic, addic, addic., mulli, twi
-+
-+void Simulator::decodeDFormALU(SimInstruction* instr) {
-+  uint32_t opcode = instr->opcode();
-+  uint32_t rt = instr->rtValue();
-+  uint32_t ra = instr->raValue();
-+  int16_t si = instr->imm16Value();
-+  uint16_t ui = instr->uimm16Value();
-+
-+  switch (opcode) {
-+    case 14: {
-+      // addi: RT = (RA|0) + SI
-+      int64_t base = (ra == 0) ? 0 : getRegister(ra);
-+      setRegister(rt, base + (int64_t)si);
-+      break;
-+    }
-+    case 15: {
-+      // addis: RT = (RA|0) + (SI << 16)
-+      int64_t base = (ra == 0) ? 0 : getRegister(ra);
-+      setRegister(rt, base + ((int64_t)si << 16));
-+      break;
-+    }
-+    case 24: {
-+      // ori: RA = RS | UI
-+      setRegister(ra, getRegister(rt) | (uint64_t)ui);
-+      break;
-+    }
-+    case 25: {
-+      // oris: RA = RS | (UI << 16)
-+      setRegister(ra, getRegister(rt) | ((uint64_t)ui << 16));
-+      break;
-+    }
-+    case 26: {
-+      // xori: RA = RS ^ UI
-+      setRegister(ra, getRegister(rt) ^ (uint64_t)ui);
-+      break;
-+    }
-+    case 27: {
-+      // xoris: RA = RS ^ (UI << 16)
-+      setRegister(ra, getRegister(rt) ^ ((uint64_t)ui << 16));
-+      break;
-+    }
-+    case 28: {
-+      // andi.: RA = RS & UI, update CR0
-+      int64_t result = getRegister(rt) & (uint64_t)ui;
-+      setRegister(ra, result);
-+      updateCR0(result);
-+      break;
-+    }
-+    case 29: {
-+      // andis.: RA = RS & (UI << 16), update CR0
-+      int64_t result = getRegister(rt) & ((uint64_t)ui << 16);
-+      setRegister(ra, result);
-+      updateCR0(result);
-+      break;
-+    }
-+    case 11: {
-+      // cmpi: compare RA with SI, signed
-+      uint32_t bf = instr->bfValue();
-+      bool l = instr->lBit();
-+      if (l) {
-+        // 64-bit compare
-+        setCRFieldCmp(bf, getRegister(ra), (int64_t)si);
-+      } else {
-+        // 32-bit compare
-+        int32_t ra32 = I32(getRegister(ra));
-+        setCRFieldCmp(bf, (int64_t)ra32, (int64_t)(int32_t)si);
-+      }
-+      break;
-+    }
-+    case 10: {
-+      // cmpli: compare RA with UI, unsigned
-+      uint32_t bf = instr->bfValue();
-+      bool l = instr->lBit();
-+      if (l) {
-+        // 64-bit unsigned compare
-+        setCRFieldCmpU(bf, U64(getRegister(ra)), (uint64_t)ui);
-+      } else {
-+        // 32-bit unsigned compare
-+        uint32_t ra32 = U32(getRegister(ra));
-+        setCRFieldCmpU(bf, (uint64_t)ra32, (uint64_t)ui);
-+      }
-+      break;
-+    }
-+    case 8: {
-+      // subfic: RT = SI - RA, set CA
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t imm = U64((int64_t)si);
-+      uint64_t result = imm + ~ra_val + 1;
-+      setRegister(rt, I64(result));
-+      // CA is set if there is a carry out of the addition (~RA + IMM + 1).
-+      // Equivalently, CA = (IMM >= RA) for unsigned interpretation of the
-+      // full 64-bit subtraction.
-+      bool carry = (imm >= ra_val) || (imm == 0 && ra_val == 0);
-+      // More precise: carry = (~ra_val + imm) would overflow, or adding 1
-+      // overflows.
-+      uint64_t tmp = ~ra_val + imm;
-+      carry = (tmp < ~ra_val) || (tmp < imm) || (result < tmp);
-+      // Simplify: CA if no borrow.
-+      carry = (U64((int64_t)si) >= ra_val);
-+      if (ra_val == 0) carry = true;
-+      // Actually, subfic CA: carry out of ~RA + IMM + 1.
-+      // CA = (IMM > RA - 1) when RA != 0, CA = 1 when RA == 0.
-+      // Or just: the unsigned result of (SI - RA) is valid (no borrow).
-+      // Let's compute it correctly:
-+      {
-+        __uint128_t wide = (__uint128_t)(~ra_val) + (__uint128_t)imm + 1;
-+        carry = (wide >> 64) != 0;
-+      }
-+      setXERCA(carry);
-+      break;
-+    }
-+    case 12: {
-+      // addic: RT = RA + SI, set CA
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t imm = U64((int64_t)si);
-+      uint64_t result = ra_val + imm;
-+      setRegister(rt, I64(result));
-+      setXERCA(result < ra_val);
-+      break;
-+    }
-+    case 13: {
-+      // addic.: RT = RA + SI, set CA, update CR0
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t imm = U64((int64_t)si);
-+      uint64_t result = ra_val + imm;
-+      setRegister(rt, I64(result));
-+      setXERCA(result < ra_val);
-+      updateCR0(I64(result));
-+      break;
-+    }
-+    case 7: {
-+      // mulli: RT = RA * SI (low 64 bits)
-+      int64_t result = getRegister(ra) * (int64_t)si;
-+      setRegister(rt, result);
-+      break;
-+    }
-+    case 3: {
-+      // twi: Trap Word Immediate. We don't implement trapping in the
-+      // simulator; just continue.
-+      break;
-+    }
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF("decodeDFormALU: unhandled opcode %u", opcode);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeDFormLoad: lwz(32), lbz(34), lhz(40), lha(42), lfs(48), lfd(50)
-+//   and update variants
-+
-+void Simulator::decodeDFormLoad(SimInstruction* instr) {
-+  uint32_t opcode = instr->opcode();
-+  uint32_t rt = instr->rtValue();
-+  int16_t si = instr->imm16Value();
-+  uint64_t ea = DFormEA(this, instr, si);
-+
-+  switch (opcode) {
-+    case 32:
-+      // lwz
-+      setRegister(rt, U64(readWU(ea, instr)));
-+      break;
-+    case 33: {
-+      // lwzu: RA != 0, load and update RA
-+      setRegister(rt, U64(readWU(ea, instr)));
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    case 34:
-+      // lbz
-+      setRegister(rt, U64(readBU(ea)));
-+      break;
-+    case 35: {
-+      // lbzu
-+      setRegister(rt, U64(readBU(ea)));
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    case 40:
-+      // lhz
-+      setRegister(rt, U64(readHU(ea, instr)));
-+      break;
-+    case 41: {
-+      // lhzu
-+      setRegister(rt, U64(readHU(ea, instr)));
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    case 42:
-+      // lha (half-word, sign-extended)
-+      setRegister(rt, (int64_t)readH(ea, instr));
-+      break;
-+    case 43: {
-+      // lhau
-+      setRegister(rt, (int64_t)readH(ea, instr));
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    case 48: {
-+      // lfs: load float single, widen to double in FPR (NaN-preserving;
-+      // matches Power ISA `lfs` which uses xscvspdpn semantics)
-+      if (handleWasmSegFault(ea, 4)) break;
-+      float val = *reinterpret_cast<float*>(ea);
-+      setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
-+      break;
-+    }
-+    case 49: {
-+      // lfsu
-+      if (handleWasmSegFault(ea, 4)) break;
-+      float val = *reinterpret_cast<float*>(ea);
-+      setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    case 50: {
-+      // lfd: load float double
-+      double val = readD(ea, instr);
-+      setFpuRegisterDouble(rt, val);
-+      break;
-+    }
-+    case 51: {
-+      // lfdu
-+      double val = readD(ea, instr);
-+      setFpuRegisterDouble(rt, val);
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF("decodeDFormLoad: unhandled opcode %u", opcode);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeDFormStore: stw(36), stwu(37), stb(38), sth(44), stfs(52), stfd(54)
-+//   and update variants
-+
-+void Simulator::decodeDFormStore(SimInstruction* instr) {
-+  uint32_t opcode = instr->opcode();
-+  uint32_t rs = instr->rsValue();
-+  int16_t si = instr->imm16Value();
-+
-+  // For stores, the effective address calculation differs for update forms:
-+  // - Non-update: EA = (RA|0) + D
-+  // - Update: EA = RA + D (RA must not be 0)
-+  bool isUpdate = false;
-+  switch (opcode) {
-+    case 37: case 39: case 45: case 53: case 55:
-+      isUpdate = true;
-+      break;
-+  }
-+
-+  uint64_t ea;
-+  if (isUpdate) {
-+    ea = getRegister(instr->raValue()) + (int64_t)si;
-+  } else {
-+    ea = DFormEA(this, instr, si);
-+  }
-+
-+  switch (opcode) {
-+    case 36:
-+      // stw
-+      writeW(ea, I32(getRegister(rs)), instr);
-+      break;
-+    case 38:
-+      // stb
-+      writeB(ea, (uint8_t)(getRegister(rs) & 0xFF));
-+      break;
-+    case 39:
-+      // stbu
-+      writeB(ea, (uint8_t)(getRegister(rs) & 0xFF));
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    case 44:
-+      // sth
-+      writeH(ea, U16(getRegister(rs)), instr);
-+      break;
-+    case 45:
-+      // sthu
-+      writeH(ea, U16(getRegister(rs)), instr);
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    case 52: {
-+      // stfs: convert double in FPR to single and store (NaN-preserving;
-+      // matches Power ISA `stfs` which uses xscvdpspn semantics)
-+      double dval = getFpuRegisterDouble(rs);
-+      float fval = demoteDoublePreservingNaN(dval);
-+      if (handleWasmSegFault(ea, 4)) break;
-+      *reinterpret_cast<float*>(ea) = fval;
-+      LLBit_ = false;
-+      break;
-+    }
-+    case 53: {
-+      // stfsu
-+      double dval = getFpuRegisterDouble(rs);
-+      float fval = demoteDoublePreservingNaN(dval);
-+      if (handleWasmSegFault(ea, 4)) break;
-+      *reinterpret_cast<float*>(ea) = fval;
-+      LLBit_ = false;
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    }
-+    case 54:
-+      // stfd
-+      writeD(ea, getFpuRegisterDouble(rs), instr);
-+      break;
-+    case 55:
-+      // stfdu
-+      writeD(ea, getFpuRegisterDouble(rs), instr);
-+      setRegister(instr->raValue(), ea);
-+      break;
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF("decodeDFormStore: unhandled opcode %u", opcode);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeDSForm: ld(58/0), lwa(58/2), std(62/0), stdu(62/1)
-+
-+void Simulator::decodeDSForm(SimInstruction* instr) {
-+  uint32_t opcode = instr->opcode();
-+  uint32_t rt = instr->rtValue();
-+  int16_t ds = instr->ds14Value();
-+  uint32_t xo = instr->bits(1, 0);
-+
-+  if (opcode == 58) {
-+    uint64_t ea = DSFormEA(this, instr, ds);
-+    switch (xo) {
-+      case 0:
-+        // ld
-+        setRegister(rt, readDW(ea, instr));
-+        break;
-+      case 1: {
-+        // ldu
-+        setRegister(rt, readDW(ea, instr));
-+        setRegister(instr->raValue(), ea);
-+        break;
-+      }
-+      case 2:
-+        // lwa (load word algebraic, sign-extended to 64)
-+        setRegister(rt, (int64_t)readW(ea, instr));
-+        break;
-+      default:
-+        MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: opcode 58, xo=%u", xo);
-+    }
-+  } else if (opcode == 62) {
-+    // For std/stdu, EA uses RA directly (no RA|0 rule).
-+    uint64_t ea;
-+    if (xo == 1) {
-+      // stdu: update form
-+      ea = getRegister(instr->raValue()) + (int64_t)ds;
-+    } else {
-+      ea = DSFormEA(this, instr, ds);
-+    }
-+    switch (xo) {
-+      case 0:
-+        // std
-+        writeDW(ea, getRegister(rt), instr);
-+        break;
-+      case 1:
-+        // stdu
-+        writeDW(ea, getRegister(rt), instr);
-+        setRegister(instr->raValue(), ea);
-+        break;
-+      default:
-+        MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: opcode 62, xo=%u", xo);
-+    }
-+  } else {
-+    MOZ_CRASH_UNSAFE_PRINTF("decodeDSForm: unhandled opcode %u", opcode);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeXForm: Major opcode 31 (X-form, XO-form, etc.)
-+// This is the largest decoder covering most ALU, indexed load/store, SPR,
-+// and atomic instructions.
-+
-+void Simulator::decodeXForm(SimInstruction* instr) {
-+  uint32_t xo = instr->xoValue();
-+  uint32_t rt = instr->rtValue();
-+  uint32_t ra = instr->raValue();
-+  uint32_t rb = instr->rbValue();
-+  bool rc = instr->rcBit();
-+
-+  // Many instructions share major opcode 31. Switch on extended opcode.
-+  // For XO-form with OE=1, the xoValue() includes bit 10, so
-+  // addo (266 | 512 = 778) etc. are separate cases.
-+
-+  // First check for isel which uses bits 1-5 = 15 (XO = 15 in bits 1..5).
-+  if ((xo & 0x1F) == 15) {
-+    // isel: if CR[BC] then RT=RA else RT=RB
-+    // BC is in bits 6..10 (the rc field position).
-+    uint32_t bc = instr->rcValue();
-+    uint32_t crField = bc / 4;
-+    uint32_t crBit = bc % 4;
-+    uint8_t crFieldVal = getCRField(crField);
-+    // PPC CR field bits: bit3=LT(8), bit2=GT(4), bit1=EQ(2), bit0=SO(1)
-+    // Bit numbering within field: 0=LT, 1=GT, 2=EQ, 3=SO
-+    bool bitSet;
-+    switch (crBit) {
-+      case 0: bitSet = (crFieldVal & kCRFieldLT) != 0; break;
-+      case 1: bitSet = (crFieldVal & kCRFieldGT) != 0; break;
-+      case 2: bitSet = (crFieldVal & kCRFieldEQ) != 0; break;
-+      case 3: bitSet = (crFieldVal & kCRFieldSO) != 0; break;
-+      default: bitSet = false; break;
-+    }
-+    int64_t raVal = (ra == 0) ? 0 : getRegister(ra);
-+    int64_t rbVal = getRegister(rb);
-+    setRegister(rt, bitSet ? raVal : rbVal);
-+    return;
-+  }
-+
-+  switch (xo) {
-+    // --- Arithmetic ---
-+    case 266: {
-+      // add
-+      int64_t result = getRegister(ra) + getRegister(rb);
-+      setRegister(rt, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 778: {
-+      // addo
-+      int64_t ra_val = getRegister(ra);
-+      int64_t rb_val = getRegister(rb);
-+      int64_t result = ra_val + rb_val;
-+      setRegister(rt, result);
-+      // Overflow if signs of inputs are same but result sign differs.
-+      bool ov = ((ra_val ^ result) & (rb_val ^ result)) < 0;
-+      setXEROV(ov);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 10: {
-+      // addc
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      uint64_t result = ra_val + rb_val;
-+      setRegister(rt, I64(result));
-+      setXERCA(result < ra_val);
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 138: {
-+      // adde: RT = RA + RB + CA
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
-+      uint64_t result = ra_val + rb_val + ca;
-+      setRegister(rt, I64(result));
-+      // Carry-out: when ca==0, only the ra+rb wrap matters; when ca==1,
-+      // an additional wrap occurs iff result <= ra_val.
-+      bool newCA = ca ? (result <= ra_val) : (result < ra_val);
-+      setXERCA(newCA);
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 234: {
-+      // addme: RT = RA + CA - 1
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
-+      uint64_t result = ra_val + ca + ~0ULL;  // + CA + (-1)
-+      setRegister(rt, I64(result));
-+      // CA if carry out of (RA + CA + 0xFFFFFFFFFFFFFFFF)
-+      bool newCA = (ra_val != 0) || (ca != 0);
-+      setXERCA(newCA);
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 202: {
-+      // addze: RT = RA + CA
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
-+      uint64_t result = ra_val + ca;
-+      setRegister(rt, I64(result));
-+      setXERCA(result < ra_val);
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 40: {
-+      // subf: RT = RB - RA
-+      int64_t result = getRegister(rb) - getRegister(ra);
-+      setRegister(rt, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 552: {
-+      // subfo: RT = RB - RA, set OV
-+      int64_t ra_val = getRegister(ra);
-+      int64_t rb_val = getRegister(rb);
-+      int64_t result = rb_val - ra_val;
-+      setRegister(rt, result);
-+      bool ov = ((rb_val ^ ra_val) & (rb_val ^ result)) < 0;
-+      setXEROV(ov);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 8: {
-+      // subfc: RT = ~RA + RB + 1
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      uint64_t result = ~ra_val + rb_val + 1;
-+      setRegister(rt, I64(result));
-+      // CA = no borrow = (RB >= RA unsigned)
-+      setXERCA(rb_val >= ra_val);
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 136: {
-+      // subfe: RT = ~RA + RB + CA
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
-+      uint64_t result = ~ra_val + rb_val + ca;
-+      setRegister(rt, I64(result));
-+      __uint128_t wide = (__uint128_t)(~ra_val) + (__uint128_t)rb_val + ca;
-+      setXERCA((wide >> 64) != 0);
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 232: {
-+      // subfze: RT = ~RA + CA
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t ca = getXERCA() ? 1ULL : 0ULL;
-+      uint64_t result = ~ra_val + ca;
-+      setRegister(rt, I64(result));
-+      setXERCA(ca > ra_val);  // CA if ~RA + CA overflows
-+      if (rc) updateCR0(I64(result));
-+      break;
-+    }
-+    case 104: {
-+      // neg: RT = -RA
-+      int64_t result = -getRegister(ra);
-+      setRegister(rt, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+
-+    // --- Multiply ---
-+    case 233: {
-+      // mulld: RT = RA * RB (low 64 bits)
-+      int64_t result = getRegister(ra) * getRegister(rb);
-+      setRegister(rt, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 745: {
-+      // mulldo: RT = RA * RB, set OV
-+      int64_t ra_val = getRegister(ra);
-+      int64_t rb_val = getRegister(rb);
-+      int64_t result = ra_val * rb_val;
-+      setRegister(rt, result);
-+      // OV if high part of full 128-bit product is not all-sign.
-+      int64_t hi = MultiplyHighSigned(ra_val, rb_val);
-+      bool ov = (hi != (result >> 63));
-+      setXEROV(ov);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 235: {
-+      // mullw: RT = sign_ext(RA[32:63] * RB[32:63])
-+      int64_t result = (int64_t)I32(getRegister(ra)) *
-+                       (int64_t)I32(getRegister(rb));
-+      setRegister(rt, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 747: {
-+      // mullwo
-+      int64_t ra_val = I32(getRegister(ra));
-+      int64_t rb_val = I32(getRegister(rb));
-+      int64_t result = ra_val * rb_val;
-+      setRegister(rt, result);
-+      bool ov = (result != (int64_t)I32(result));
-+      setXEROV(ov);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 73: {
-+      // mulhd: RT = high 64 bits of RA * RB (signed)
-+      setRegister(rt, MultiplyHighSigned(getRegister(ra), getRegister(rb)));
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 9: {
-+      // mulhdu: RT = high 64 bits of RA * RB (unsigned)
-+      setRegister(rt, I64(MultiplyHighUnsigned(U64(getRegister(ra)),
-+                                               U64(getRegister(rb)))));
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 75: {
-+      // mulhw: RT = high 32 bits of (RA[32:63] * RB[32:63]), signed
-+      int64_t result =
-+          (int64_t)I32(getRegister(ra)) * (int64_t)I32(getRegister(rb));
-+      setRegister(rt, result >> 32);
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 11: {
-+      // mulhwu: RT = high 32 bits, unsigned
-+      uint64_t result =
-+          (uint64_t)U32(getRegister(ra)) * (uint64_t)U32(getRegister(rb));
-+      setRegister(rt, I64(result >> 32));
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+
-+    // --- Divide ---
-+    case 489: {
-+      // divd: RT = RA / RB (signed, 64-bit)
-+      int64_t ra_val = getRegister(ra);
-+      int64_t rb_val = getRegister(rb);
-+      if (rb_val == 0 || (ra_val == INT64_MIN && rb_val == -1)) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, ra_val / rb_val);
-+      }
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 1001: {
-+      // divdo
-+      int64_t ra_val = getRegister(ra);
-+      int64_t rb_val = getRegister(rb);
-+      bool ov = (rb_val == 0) || (ra_val == INT64_MIN && rb_val == -1);
-+      if (ov) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, ra_val / rb_val);
-+      }
-+      setXEROV(ov);
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 457: {
-+      // divdu: unsigned 64-bit divide
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      if (rb_val == 0) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, I64(ra_val / rb_val));
-+      }
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 969: {
-+      // divduo
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      bool ov = (rb_val == 0);
-+      if (ov) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, I64(ra_val / rb_val));
-+      }
-+      setXEROV(ov);
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 491: {
-+      // divw: signed 32-bit divide
-+      int32_t ra_val = I32(getRegister(ra));
-+      int32_t rb_val = I32(getRegister(rb));
-+      if (rb_val == 0 || (ra_val == INT32_MIN && rb_val == -1)) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, (int64_t)(ra_val / rb_val));
-+      }
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 1003: {
-+      // divwo
-+      int32_t ra_val = I32(getRegister(ra));
-+      int32_t rb_val = I32(getRegister(rb));
-+      bool ov = (rb_val == 0) || (ra_val == INT32_MIN && rb_val == -1);
-+      if (ov) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, (int64_t)(ra_val / rb_val));
-+      }
-+      setXEROV(ov);
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 459: {
-+      // divwu: unsigned 32-bit divide
-+      uint32_t ra_val = U32(getRegister(ra));
-+      uint32_t rb_val = U32(getRegister(rb));
-+      if (rb_val == 0) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, (int64_t)(ra_val / rb_val));
-+      }
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+    case 971: {
-+      // divwuo
-+      uint32_t ra_val = U32(getRegister(ra));
-+      uint32_t rb_val = U32(getRegister(rb));
-+      bool ov = (rb_val == 0);
-+      if (ov) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, (int64_t)(ra_val / rb_val));
-+      }
-+      setXEROV(ov);
-+      if (rc) updateCR0(getRegister(rt));
-+      break;
-+    }
-+
-+    // --- POWER9 modulo (ISA 3.0) ---
-+    // Result of "undefined" division (rb_val == 0, or signed INT_MIN / -1)
-+    // is implementation-defined per Power ISA; matching the divX behaviour
-+    // above, we yield 0 in those cases. Rc has no encoding for these ops.
-+    case 779: {
-+      // modsw: RT = RA % RB (signed, 32-bit)
-+      int32_t ra_val = I32(getRegister(ra));
-+      int32_t rb_val = I32(getRegister(rb));
-+      if (rb_val == 0 || (ra_val == INT32_MIN && rb_val == -1)) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, (int64_t)(ra_val % rb_val));
-+      }
-+      break;
-+    }
-+    case 267: {
-+      // moduw: RT = RA % RB (unsigned, 32-bit)
-+      uint32_t ra_val = U32(getRegister(ra));
-+      uint32_t rb_val = U32(getRegister(rb));
-+      if (rb_val == 0) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, (int64_t)(ra_val % rb_val));
-+      }
-+      break;
-+    }
-+    case 777: {
-+      // modsd: RT = RA % RB (signed, 64-bit)
-+      int64_t ra_val = getRegister(ra);
-+      int64_t rb_val = getRegister(rb);
-+      if (rb_val == 0 || (ra_val == INT64_MIN && rb_val == -1)) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, ra_val % rb_val);
-+      }
-+      break;
-+    }
-+    case 265: {
-+      // modud: RT = RA % RB (unsigned, 64-bit)
-+      uint64_t ra_val = U64(getRegister(ra));
-+      uint64_t rb_val = U64(getRegister(rb));
-+      if (rb_val == 0) {
-+        setRegister(rt, 0);
-+      } else {
-+        setRegister(rt, I64(ra_val % rb_val));
-+      }
-+      break;
-+    }
-+
-+    // --- Logical ---
-+    case 28: {
-+      // and: RA = RS & RB
-+      int64_t result = getRegister(rt) & getRegister(rb);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 60: {
-+      // andc: RA = RS & ~RB
-+      int64_t result = getRegister(rt) & ~getRegister(rb);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 444: {
-+      // or: RA = RS | RB
-+      int64_t result = getRegister(rt) | getRegister(rb);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 412: {
-+      // orc: RA = RS | ~RB
-+      int64_t result = getRegister(rt) | ~getRegister(rb);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 316: {
-+      // xor: RA = RS ^ RB
-+      int64_t result = getRegister(rt) ^ getRegister(rb);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 476: {
-+      // nand: RA = ~(RS & RB)
-+      int64_t result = ~(getRegister(rt) & getRegister(rb));
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 124: {
-+      // nor: RA = ~(RS | RB)
-+      int64_t result = ~(getRegister(rt) | getRegister(rb));
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 284: {
-+      // eqv: RA = ~(RS ^ RB)
-+      int64_t result = ~(getRegister(rt) ^ getRegister(rb));
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+
-+    // --- Shifts ---
-+    case 27: {
-+      // sld: RA = RS << RB[58:63] if RB[57]==0, else RA=0
-+      uint64_t shift = U64(getRegister(rb));
-+      uint64_t rs_val = U64(getRegister(rt));
-+      int64_t result;
-+      if (shift & 0x40) {
-+        result = 0;
-+      } else {
-+        result = I64(rs_val << (shift & 0x3F));
-+      }
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 24: {
-+      // slw: RA = RS[32:63] << RB[59:63] if RB[58]==0, else RA=0 (32-bit)
-+      uint32_t shift = U32(getRegister(rb));
-+      uint32_t rs_val = U32(getRegister(rt));
-+      uint32_t result;
-+      if (shift & 0x20) {
-+        result = 0;
-+      } else {
-+        result = rs_val << (shift & 0x1F);
-+      }
-+      setRegister(ra, (int64_t)(int32_t)result);
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 539: {
-+      // srd: RA = RS >> RB[58:63] if RB[57]==0, else RA=0 (logical)
-+      uint64_t shift = U64(getRegister(rb));
-+      uint64_t rs_val = U64(getRegister(rt));
-+      int64_t result;
-+      if (shift & 0x40) {
-+        result = 0;
-+      } else {
-+        result = I64(rs_val >> (shift & 0x3F));
-+      }
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 536: {
-+      // srw: RA = RS[32:63] >> RB[59:63] logical (32-bit)
-+      uint32_t shift = U32(getRegister(rb));
-+      uint32_t rs_val = U32(getRegister(rt));
-+      uint32_t result;
-+      if (shift & 0x20) {
-+        result = 0;
-+      } else {
-+        result = rs_val >> (shift & 0x1F);
-+      }
-+      setRegister(ra, (int64_t)(int32_t)result);
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 794: {
-+      // srad: RA = RS >> RB[58:63] arithmetic (64-bit), set CA
-+      uint64_t shift = U64(getRegister(rb));
-+      int64_t rs_val = getRegister(rt);
-+      int64_t result;
-+      bool carry;
-+      if (shift & 0x40) {
-+        result = rs_val >> 63;  // all sign bits
-+        carry = (rs_val < 0);
-+      } else {
-+        uint32_t sh = shift & 0x3F;
-+        result = rs_val >> sh;
-+        // CA = 1 if RS is negative and any 1-bits were shifted out.
-+        carry = (rs_val < 0) && ((rs_val & ((1ULL << sh) - 1)) != 0);
-+      }
-+      setRegister(ra, result);
-+      setXERCA(carry);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 792: {
-+      // sraw: RA = RS[32:63] >> RB[59:63] arithmetic (32-bit), set CA
-+      uint32_t shift = U32(getRegister(rb));
-+      int32_t rs_val = I32(getRegister(rt));
-+      int32_t result;
-+      bool carry;
-+      if (shift & 0x20) {
-+        result = rs_val >> 31;
-+        carry = (rs_val < 0);
-+      } else {
-+        uint32_t sh = shift & 0x1F;
-+        result = rs_val >> sh;
-+        carry = (rs_val < 0) && ((rs_val & ((1U << sh) - 1)) != 0);
-+      }
-+      setRegister(ra, (int64_t)result);
-+      setXERCA(carry);
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 826:
-+    case 827: {
-+      // sradi RA, RS, SH: RA = EXTS(RS) >> sh arithmetic (64-bit), set CA.
-+      // XS-form, XO=413 (9-bit, bits 21-29), sh[5] at bit 30, Rc at bit 31.
-+      // Our xoValue() extracts bits 10:1 (10 bits)
-+      // which yields 413*2 + sh[5] = 826 (sh[5]=0) or 827 (sh[5]=1).
-+      // sh[0:4] at instruction bits 15:11 (= raValue field position, but
-+      // for this XS-form they're the SH[0:4] subfield).
-+      uint32_t sh = instr->bits(15, 11) | (instr->bit(1) << 5);
-+      int64_t rs_val = getRegister(rt);
-+      int64_t result = (sh == 0) ? rs_val : (rs_val >> sh);
-+      // CA := rs_val < 0 && any bits shifted out are 1.
-+      bool carry = (rs_val < 0) && sh > 0 &&
-+                   ((U64(rs_val) & ((1ULL << sh) - 1)) != 0);
-+      setRegister(ra, result);
-+      setXERCA(carry);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 824: {
-+      // srawi: RA = RS[32:63] >> SH arithmetic (32-bit), set CA
-+      uint32_t sh = instr->bits(15, 11);
-+      int32_t rs_val = I32(getRegister(rt));
-+      int32_t result = rs_val >> sh;
-+      bool carry = (rs_val < 0) && sh > 0 &&
-+                   ((U32(rs_val) & ((1U << sh) - 1)) != 0);
-+      setRegister(ra, (int64_t)result);
-+      setXERCA(carry);
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+
-+    // --- Extend / count ---
-+    case 954: {
-+      // extsb: RA = sign_ext(RS[56:63])
-+      int64_t result = (int64_t)(int8_t)(getRegister(rt) & 0xFF);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 922: {
-+      // extsh: RA = sign_ext(RS[48:63])
-+      int64_t result = (int64_t)(int16_t)(getRegister(rt) & 0xFFFF);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 986: {
-+      // extsw: RA = sign_ext(RS[32:63])
-+      int64_t result = (int64_t)(int32_t)(getRegister(rt) & 0xFFFFFFFF);
-+      setRegister(ra, result);
-+      if (rc) updateCR0(result);
-+      break;
-+    }
-+    case 58: {
-+      // cntlzd: RA = count leading zeros of RS (64-bit)
-+      setRegister(ra, CountLeadingZeros64(U64(getRegister(rt))));
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 26: {
-+      // cntlzw: RA = count leading zeros of RS[32:63] (32-bit)
-+      setRegister(ra, CountLeadingZeros32(U32(getRegister(rt))));
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 570: {
-+      // cnttzd
-+      setRegister(ra, CountTrailingZeros64(U64(getRegister(rt))));
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 538: {
-+      // cnttzw
-+      setRegister(ra, CountTrailingZeros32(U32(getRegister(rt))));
-+      if (rc) updateCR0(getRegister(ra));
-+      break;
-+    }
-+    case 506: {
-+      // popcntd
-+      setRegister(ra, PopCount64(U64(getRegister(rt))));
-+      break;
-+    }
-+    case 378: {
-+      // popcntw: popcount each 32-bit half independently, sum in each half
-+      uint64_t val = U64(getRegister(rt));
-+      uint32_t lo = PopCount32(U32(val));
-+      uint32_t hi = PopCount32(U32(val >> 32));
-+      setRegister(ra, I64(((uint64_t)hi << 32) | lo));
-+      break;
-+    }
-+    case 122: {
-+      // popcntb: popcount each byte independently
-+      setRegister(ra, I64(PopCountPerByte(U64(getRegister(rt)))));
-+      break;
-+    }
-+    case 187: {
-+      // brd (POWER10): RA = byte-reverse(RS) full 64-bit doubleword.
-+      setRegister(ra, I64(__builtin_bswap64(U64(getRegister(rt)))));
-+      break;
-+    }
-+    case 219: {
-+      // brh (POWER10): byte-reverse each of the 4 halfwords in RS.
-+      uint64_t v = U64(getRegister(rt));
-+      uint64_t out = ((v & 0xFF00FF00FF00FF00ULL) >> 8) |
-+                     ((v & 0x00FF00FF00FF00FFULL) << 8);
-+      setRegister(ra, I64(out));
-+      break;
-+    }
-+    case 155: {
-+      // brw (POWER10): byte-reverse each of the 2 words in RS.
-+      uint64_t v = U64(getRegister(rt));
-+      uint64_t out = ((uint64_t)__builtin_bswap32((uint32_t)(v >> 32)) << 32) |
-+                     (uint64_t)__builtin_bswap32((uint32_t)v);
-+      setRegister(ra, I64(out));
-+      break;
-+    }
-+
-+    // --- Compare (X-form) ---
-+    case 0: {
-+      // cmp (cmpw/cmpd): signed compare
-+      uint32_t bf = instr->bfValue();
-+      bool l = instr->lBit();
-+      if (l) {
-+        setCRFieldCmp(bf, getRegister(ra), getRegister(rb));
-+      } else {
-+        setCRFieldCmp(bf, (int64_t)I32(getRegister(ra)),
-+                      (int64_t)I32(getRegister(rb)));
-+      }
-+      break;
-+    }
-+    case 32: {
-+      // cmpl (cmplw/cmpld): unsigned compare
-+      uint32_t bf = instr->bfValue();
-+      bool l = instr->lBit();
-+      if (l) {
-+        setCRFieldCmpU(bf, U64(getRegister(ra)), U64(getRegister(rb)));
-+      } else {
-+        setCRFieldCmpU(bf, (uint64_t)U32(getRegister(ra)),
-+                       (uint64_t)U32(getRegister(rb)));
-+      }
-+      break;
-+    }
-+
-+    // --- Trap ---
-+    case 4: {
-+      // tw: Trap Word. The JIT uses this for debugging / tagging.
-+      // In the simulator we just treat it as a NOP (the JIT uses tagged
-+      // trap words that are never actually reached during normal execution,
-+      // they serve as metadata for the patcher).
-+      break;
-+    }
-+
-+    // --- SPR ---
-+    case 339: {
-+      // mfspr: RT = SPR
-+      // SPR encoding: spr[4:0] at bits 16..20, spr[9:5] at bits 11..15
-+      uint32_t spr_lo = instr->raValue();  // bits 16..20
-+      uint32_t spr_hi = instr->rbValue();  // bits 11..15
-+      uint32_t spr = (spr_lo) | (spr_hi << 5);
-+      switch (spr) {
-+        case 8:  // LR
-+          setRegister(rt, getLR());
-+          break;
-+        case 9:  // CTR
-+          setRegister(rt, getCTR());
-+          break;
-+        case 1:  // XER
-+          setRegister(rt, I64(getXER()));
-+          break;
-+        default:
-+          MOZ_CRASH_UNSAFE_PRINTF("mfspr: unhandled SPR %u", spr);
-+      }
-+      break;
-+    }
-+    case 467: {
-+      // mtspr: SPR = RS
-+      uint32_t spr_lo = instr->raValue();
-+      uint32_t spr_hi = instr->rbValue();
-+      uint32_t spr = (spr_lo) | (spr_hi << 5);
-+      int64_t val = getRegister(rt);
-+      switch (spr) {
-+        case 8:  // LR
-+          setLR(val);
-+          break;
-+        case 9:  // CTR
-+          setCTR(val);
-+          break;
-+        case 1:  // XER
-+          setXER(U64(val));
-+          break;
-+        default:
-+          MOZ_CRASH_UNSAFE_PRINTF("mtspr: unhandled SPR %u", spr);
-+      }
-+      break;
-+    }
-+    case 19: {
-+      // mfocrf: read one CR field selected by the FXM bitmask into RT.
-+      // (Plain mfcr shares this XO with FXM=0; we model both by reading
-+      // the full CR — the JIT only emits mfocrf and the bits outside the
-+      // selected field are spec'd "undefined", so reading the full CR is
-+      // a valid implementation.)
-+      setRegister(rt, (int64_t)getCR());
-+      break;
-+    }
-+    case 144: {
-+      // mtcrf: move to CR fields
-+      // FXM field is in bits 12..19.
-+      uint32_t fxm = instr->bits(19, 12);
-+      uint32_t rs_val = U32(getRegister(rt));
-+      uint32_t cr = getCR();
-+      for (int i = 0; i < 8; i++) {
-+        if (fxm & (0x80 >> i)) {
-+          uint32_t shift = 4 * (7 - i);
-+          cr = (cr & ~(0xFu << shift)) | (rs_val & (0xFu << shift));
-+        }
-+      }
-+      setCR(cr);
-+      break;
-+    }
-+    case 576: {
-+      // mcrxrx: move XER[OV,OV32,CA,CA32] to CR field BF
-+      uint32_t bf = instr->bfValue();
-+      uint8_t field = 0;
-+      if (getXEROV()) field |= 0x8;
-+      // OV32 at bit 19 of XER
-+      if ((getXER() >> kXEROV32Bit) & 1) field |= 0x4;
-+      if (getXERCA()) field |= 0x2;
-+      if ((getXER() >> kXERCA32Bit) & 1) field |= 0x1;
-+      setCRField(bf, field);
-+      break;
-+    }
-+    case 384:
-+    case 416: {
-+      // POWER10 setbc/setbcr: RT = (CR[BI]==N) ? 1 : 0
-+      // BI at bits 11..15; xo=384 (setbc, N=1), xo=416 (setbcr, N=0).
-+      uint32_t bi = instr->raValue();
-+      uint32_t crField = bi / 4;
-+      uint32_t crBit = bi % 4;
-+      uint8_t crFieldVal = getCRField(crField);
-+      bool bitSet;
-+      switch (crBit) {
-+        case 0: bitSet = (crFieldVal & kCRFieldLT) != 0; break;
-+        case 1: bitSet = (crFieldVal & kCRFieldGT) != 0; break;
-+        case 2: bitSet = (crFieldVal & kCRFieldEQ) != 0; break;
-+        case 3: bitSet = (crFieldVal & kCRFieldSO) != 0; break;
-+        default: bitSet = false; break;
-+      }
-+      bool want = (xo == 384) ? bitSet : !bitSet;
-+      setRegister(rt, want ? 1 : 0);
-+      break;
-+    }
-+
-+    // --- Indexed loads ---
-+    case 21: {
-+      // ldx: RT = [RA|0 + RB], 8 bytes
-+      uint64_t ea = XFormEA(this, instr);
-+      setRegister(rt, readDW(ea, instr));
-+      break;
-+    }
-+    case 53: {
-+      // ldux: RT = [RA + RB], update RA
-+      uint64_t ea = XFormEAUpdate(this, instr);
-+      setRegister(rt, readDW(ea, instr));
-+      setRegister(ra, ea);
-+      break;
-+    }
-+    case 23: {
-+      // lwzx: RT = zero_ext([RA|0 + RB], 4 bytes)
-+      uint64_t ea = XFormEA(this, instr);
-+      setRegister(rt, U64(readWU(ea, instr)));
-+      break;
-+    }
-+    case 341: {
-+      // lwax: RT = sign_ext([RA|0 + RB], 4 bytes)
-+      uint64_t ea = XFormEA(this, instr);
-+      setRegister(rt, (int64_t)readW(ea, instr));
-+      break;
-+    }
-+    case 87: {
-+      // lbzx
-+      uint64_t ea = XFormEA(this, instr);
-+      setRegister(rt, U64(readBU(ea)));
-+      break;
-+    }
-+    case 279: {
-+      // lhzx
-+      uint64_t ea = XFormEA(this, instr);
-+      setRegister(rt, U64(readHU(ea, instr)));
-+      break;
-+    }
-+    case 343: {
-+      // lhax
-+      uint64_t ea = XFormEA(this, instr);
-+      setRegister(rt, (int64_t)readH(ea, instr));
-+      break;
-+    }
-+    case 535: {
-+      // lfsx: load float single indexed, widen to double (NaN-preserving)
-+      uint64_t ea = XFormEA(this, instr);
-+      if (!handleWasmSegFault(ea, 4)) {
-+        float val = *reinterpret_cast<float*>(ea);
-+        setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
-+      }
-+      break;
-+    }
-+    case 599: {
-+      // lfdx: load float double indexed
-+      uint64_t ea = XFormEA(this, instr);
-+      setFpuRegisterDouble(rt, readD(ea, instr));
-+      break;
-+    }
-+    case 855: {
-+      // lfiwax: load float as integer word algebraic
-+      uint64_t ea = XFormEA(this, instr);
-+      int32_t val = readW(ea, instr);
-+      setFpuRegister(rt, (int64_t)val);
-+      break;
-+    }
-+    case 887: {
-+      // lfiwzx: load float as integer word zero
-+      uint64_t ea = XFormEA(this, instr);
-+      uint32_t val = readWU(ea, instr);
-+      setFpuRegister(rt, (int64_t)(uint64_t)val);
-+      break;
-+    }
-+
-+    // --- Indexed stores ---
-+    case 149: {
-+      // stdx
-+      uint64_t ea = XFormEA(this, instr);
-+      writeDW(ea, getRegister(rt), instr);
-+      break;
-+    }
-+    case 151: {
-+      // stwx
-+      uint64_t ea = XFormEA(this, instr);
-+      writeW(ea, I32(getRegister(rt)), instr);
-+      break;
-+    }
-+    case 215: {
-+      // stbx
-+      uint64_t ea = XFormEA(this, instr);
-+      writeB(ea, (uint8_t)(getRegister(rt) & 0xFF));
-+      break;
-+    }
-+    case 407: {
-+      // sthx
-+      uint64_t ea = XFormEA(this, instr);
-+      writeH(ea, U16(getRegister(rt)), instr);
-+      break;
-+    }
-+    case 663: {
-+      // stfsx: store float single indexed (NaN-preserving)
-+      uint64_t ea = XFormEA(this, instr);
-+      if (!handleWasmSegFault(ea, 4)) {
-+        float fval = demoteDoublePreservingNaN(getFpuRegisterDouble(rt));
-+        *reinterpret_cast<float*>(ea) = fval;
-+        LLBit_ = false;
-+      }
-+      break;
-+    }
-+    case 727: {
-+      // stfdx: store float double indexed
-+      uint64_t ea = XFormEA(this, instr);
-+      writeD(ea, getFpuRegisterDouble(rt), instr);
-+      break;
-+    }
-+
-+    // --- Byte-reversed stores ---
-+    case 662: {
-+      // stwbrx
-+      uint64_t ea = XFormEA(this, instr);
-+      uint32_t val = U32(getRegister(rt));
-+      writeW(ea, (int32_t)__builtin_bswap32(val), instr);
-+      break;
-+    }
-+
-+    // --- Atomic load/store ---
-+    //
-+    // Load-reserve and store-conditional. Sub-word variants
-+    // (lbarx/lharx/stbcx./sthcx.) were added in ISA v2.06 (POWER7+).
-+    // Word/doubleword variants (lwarx/stwcx./ldarx/stdcx.) go back
-+    // to the base ISA.
-+    case 52: {
-+      // lbarx RT, RA, RB, EH
-+      uint64_t ea = XFormEA(this, instr);
-+      uint8_t val = loadLinkedB(ea, instr);
-+      setRegister(rt, (int64_t)val);
-+      break;
-+    }
-+    case 116: {
-+      // lharx RT, RA, RB, EH
-+      uint64_t ea = XFormEA(this, instr);
-+      uint16_t val = loadLinkedH(ea, instr);
-+      setRegister(rt, (int64_t)val);
-+      break;
-+    }
-+    case 694: {
-+      // stbcx. RS, RA, RB: always Rc=1.
-+      uint64_t ea = XFormEA(this, instr);
-+      uint8_t val = uint8_t(getRegister(rt));
-+      int result = storeConditionalB(ea, val, instr);
-+      if (result) {
-+        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
-+      } else {
-+        setCRField(0, kCRFieldSO * getXERSO());
-+      }
-+      break;
-+    }
-+    case 726: {
-+      // sthcx. RS, RA, RB: always Rc=1.
-+      uint64_t ea = XFormEA(this, instr);
-+      uint16_t val = uint16_t(getRegister(rt));
-+      int result = storeConditionalH(ea, val, instr);
-+      if (result) {
-+        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
-+      } else {
-+        setCRField(0, kCRFieldSO * getXERSO());
-+      }
-+      break;
-+    }
-+    case 20: {
-+      // lwarx
-+      uint64_t ea = XFormEA(this, instr);
-+      int32_t val = loadLinkedW(ea, instr);
-+      setRegister(rt, (int64_t)val);
-+      break;
-+    }
-+    case 150: {
-+      // stwcx.
-+      uint64_t ea = XFormEA(this, instr);
-+      int32_t val = I32(getRegister(rt));
-+      int result = storeConditionalW(ea, val, instr);
-+      // stwcx. always updates CR0: EQ if store succeeded, else clear.
-+      if (result) {
-+        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
-+      } else {
-+        setCRField(0, kCRFieldSO * getXERSO());
-+      }
-+      break;
-+    }
-+    case 84: {
-+      // ldarx
-+      uint64_t ea = XFormEA(this, instr);
-+      int64_t val = loadLinkedD(ea, instr);
-+      setRegister(rt, val);
-+      break;
-+    }
-+    case 214: {
-+      // stdcx.
-+      uint64_t ea = XFormEA(this, instr);
-+      int64_t val = getRegister(rt);
-+      int result = storeConditionalD(ea, val, instr);
-+      if (result) {
-+        setCRField(0, kCRFieldEQ | (kCRFieldSO * getXERSO()));
-+      } else {
-+        setCRField(0, kCRFieldSO * getXERSO());
-+      }
-+      break;
-+    }
-+
-+    // --- Synchronization ---
-+    case 598:
-+      // sync / lwsync / ptesync: no-op in simulator
-+      break;
-+    case 854:
-+      // eieio: no-op in simulator
-+      break;
-+
-+    // --- GPR <-> VSR move (major opcode 31, XX1-form) ---
-+    //
-+    // Two sub-encodings:
-+    //   mtvsr* XT,RA{,RB}: XX1Form — XT at bits 25:21 (5) + TX at bit 0 (1);
-+    //                      RA at bits 20:16; RB (if any) at bits 15:11.
-+    //   mfvsr* RA,XS:      XX1FormMfvsr — XS at bits 25:21 (5) + SX at bit 0 (1);
-+    //                      RA (GPR dest) at bits 20:16.
-+    //
-+    // The original decoder treated "rsValue()" (bits 25:21 = VSR field) as a
-+    // GPR index — doubly wrong: the GPR side lives at bits 20:16 (= raValue())
-+    // and the VSR side is 6 bits (5-bit field + extension bit at bit 0). Fixed
-+    // here and extended for the full VSR namespace (0-63).
-+    // The ISA names each field in BE. "XT.DW0" is the BE doubleword which on
-+    // PPC64LE register storage lives at LE bytes 8-15 (our bytes[] is LE-natural:
-+    // bytes[0] = lowest address). With `mtvsrd / mfvsrd / mtvsrdd / mfvsrld
-+    // / stxvx`: mtvsrd of 0x1122334455667788 produces `00 00 00 00 00 00 00 00
-+    // 88 77 66 55 44 33 22 11` in memory (LE bytes 8-15 hold the GPR bits with
-+    // LSB at byte 8). Matching semantics here means the sim respects
-+    // the full Power ISA, not a self-consistent LE-reversed
-+    // convention.
-+    case 51: {
-+      // mfvsrd RA, XS: GPR[RA] = XS.DW0 = LE bytes 8..15.
-+      int xs = int(instr->rtValue() | (instr->bit(0) << 5));  // T + SX(TX)
-+      uint8_t bytes[16];
-+      getVSR128(xs, bytes);
-+      int64_t val;
-+      memcpy(&val, bytes + 8, 8);
-+      setRegister(instr->raValue(), val);
-+      break;
-+    }
-+    case 211: {
-+      // mtvsrwa XT, RA: XT.DW0 = sign_ext_64(RA[32:63]); XT.DW1 = 0.
-+      // POWER8+ (ISA 2.07). Combines extsw + mtvsrd. LE layout: bytes
-+      // 8-15 ← sign-extended low 32 of RA; bytes 0-7 ← 0.
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t bytes[16];
-+      int64_t val = (int64_t)(int32_t)getRegister(instr->raValue());
-+      memset(bytes, 0, 8);
-+      memcpy(bytes + 8, &val, 8);
-+      setVSR128(xt, bytes);
-+      break;
-+    }
-+    case 179: {
-+      // mtvsrd XT, RA: XT.DW0 = RA; XT.DW1 = 0.
-+      // LE layout: bytes 8-15 ← RA, bytes 0-7 ← 0.
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t bytes[16];
-+      int64_t val = getRegister(instr->raValue());
-+      memset(bytes, 0, 8);
-+      memcpy(bytes + 8, &val, 8);
-+      setVSR128(xt, bytes);
-+      break;
-+    }
-+    case 243: {
-+      // mtvsrwz XT, RA: XT.DW0 = zero_ext(RA[32:63]); XT.DW1 = 0.
-+      // The 32-bit value lives in the low 32 bits of DW0 = BE word 1,
-+      // which on LE storage is LE bytes 8..11 (LE word 2); LE bytes
-+      // 12..15 = 0 (upper half of DW0 = BE word 0 = zero-extended).
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t bytes[16];
-+      uint32_t lo = U32(getRegister(instr->raValue()));
-+      memset(bytes, 0, 16);
-+      bytes[8]  = (uint8_t)(lo);
-+      bytes[9]  = (uint8_t)(lo >> 8);
-+      bytes[10] = (uint8_t)(lo >> 16);
-+      bytes[11] = (uint8_t)(lo >> 24);
-+      setVSR128(xt, bytes);
-+      break;
-+    }
-+    case 307: {
-+      // mfvsrld RA, XS: GPR[RA] = XS.DW1 = LE bytes 0..7.
-+      // POWER9.
-+      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t bytes[16];
-+      getVSR128(xs, bytes);
-+      int64_t val;
-+      memcpy(&val, bytes, 8);
-+      setRegister(instr->raValue(), val);
-+      break;
-+    }
-+    case 403: {
-+      // mtvsrws XT, RA (POWER9): splat low 32 bits of RA into all four
-+      // word elements of XT. The same 32-bit value appears in lanes 0..3,
-+      // so the byte layout is identical in LE and BE —
-+      // bytes 0..15 = lo | lo | lo | lo.
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t bytes[16];
-+      uint32_t lo = U32(getRegister(instr->raValue()));
-+      uint64_t val = ((uint64_t)lo << 32) | lo;
-+      memcpy(bytes, &val, 8);
-+      memcpy(bytes + 8, &val, 8);
-+      setVSR128(xt, bytes);
-+      break;
-+    }
-+    case 435: {
-+      // mtvsrdd XT, RA, RB: XT.DW0 = RA; XT.DW1 = RB. POWER9.
-+      // LE: bytes 8-15 ← RA, bytes 0-7 ← RB.
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t bytes[16];
-+      int64_t dw0 = getRegister(instr->raValue());
-+      int64_t dw1 = getRegister(instr->rbValue());
-+      memcpy(bytes,     &dw1, 8);
-+      memcpy(bytes + 8, &dw0, 8);
-+      setVSR128(xt, bytes);
-+      break;
-+    }
-+
-+    // --- VMX vector memory (major opcode 31) ---
-+    //
-+    // lvx / stvx / lvxl / stvxl.
-+    //   EA = (RA|0) + RB; EA = EA & ~0xF (alignment)
-+    //   lvx:  VRT[0:127] <- MEM(EA, 16)       bytes[0] = *(EA+0)
-+    //   stvx: MEM(EA, 16) <- VRS[0:127]       *(EA+0) = bytes[0]
-+    // lvxl / stvxl are identical in effect to lvx / stvx (the "l" form
-+    // hints "least recently used"; semantically indistinguishable).
-+    case 103: {
-+      // lvx: VRT = MEM(EA & ~0xF, 16 bytes)
-+      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      memcpy(VRregisters_[rt], reinterpret_cast<const void*>(ea), 16);
-+      break;
-+    }
-+    case 231: {
-+      // stvx: MEM(EA & ~0xF, 16 bytes) = VRS
-+      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      memcpy(reinterpret_cast<void*>(ea), VRregisters_[rt], 16);
-+      break;
-+    }
-+    case 359: {
-+      // lvxl: semantically identical to lvx
-+      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      memcpy(VRregisters_[rt], reinterpret_cast<const void*>(ea), 16);
-+      break;
-+    }
-+    case 487: {
-+      // stvxl: semantically identical to stvx
-+      uint64_t ea = XFormEA(this, instr) & ~uint64_t(0xF);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      memcpy(reinterpret_cast<void*>(ea), VRregisters_[rt], 16);
-+      break;
-+    }
-+
-+    // --- VSX vector memory indexed (major opcode 31) ---
-+    //
-+    // These ops take a 6-bit VSR register,
-+    // encoded as 5-bit T/S + 1-bit TX/SX extension at instruction LSB
-+    // bit 0 (= our instr->bit(0)). EA = (RA|0) + RB. 16-byte access,
-+    // not forced-aligned (hardware may handle misaligned via sub-access
-+    // or alignment interrupt per impl).
-+    //
-+    // Byte-order note: lxvx/stxvx perform a natural 16-byte LE
-+    // memcpy. lxvd2x/stxvd2x on real PPC64 LE hardware load/store
-+    // doublewords in BE-pair order — i.e. lxvd2x places memory bytes
-+    // 0-7 in the register's BE-DW0 (= LE bytes 8-15) and bytes 8-15
-+    // in BE-DW1 (= LE bytes 0-7). The JIT brackets every wasm SIMD
-+    // load/store with a compensating xxpermdi DM=2 so the net effect
-+    // is a natural LE byte order. The constant pool emits the same
-+    // lxvd2x + xxpermdi sequence (per PatchConstantPoolLoad) but
-+    // assumes the hardware semantics, not a plain memcpy. So the sim
-+    // must match real-hardware lxvd2x/stxvd2x semantics including the
-+    // BE-DW byte order — otherwise the post-load xxpermdi unswaps
-+    // bytes that were never swapped, and constant-pool Simd128 loads
-+    // (e.g. shuffle masks) come out with halves transposed.
-+    case 268: {
-+      // lxvx: XT = MEM((RA|0)+RB, 16)
-+      uint64_t ea = XFormEA(this, instr);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t buf[16];
-+      memcpy(buf, reinterpret_cast<const void*>(ea), 16);
-+      setVSR128(xt, buf);
-+      break;
-+    }
-+    case 396: {
-+      // stxvx: MEM((RA|0)+RB, 16) = XS
-+      uint64_t ea = XFormEA(this, instr);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t buf[16];
-+      getVSR128(xs, buf);
-+      memcpy(reinterpret_cast<void*>(ea), buf, 16);
-+      break;
-+    }
-+    case 813: {
-+      // lxsihzx XT, RA, RB: P9 (ISA 3.0). Load halfword to VSR & zero,
-+      // indexed. MEM(EA, 2) (LE-natural halfword) is placed in dw[0]
-+      // low 16 bits; the rest of the VSR is zeroed. In sim LE-byte
-+      // storage, that is bytes[8..9] (low byte at bytes[8]).
-+      uint64_t ea = XFormEA(this, instr);
-+      if (handleWasmSegFault(ea, 2)) break;
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint16_t halfword = readH(ea, instr);
-+      uint8_t buf[16];
-+      memset(buf, 0, 16);
-+      buf[8] = (uint8_t)(halfword & 0xFF);
-+      buf[9] = (uint8_t)((halfword >> 8) & 0xFF);
-+      setVSR128(xt, buf);
-+      break;
-+    }
-+    case 941: {
-+      // stxsihx XS, RA, RB: P9 (ISA 3.0). Store halfword from VSR,
-+      // indexed. dw[0] low 16 bits (sim bytes[8..9] in host-LE order)
-+      // are written as a halfword at MEM(EA, 2).
-+      uint64_t ea = XFormEA(this, instr);
-+      if (handleWasmSegFault(ea, 2)) break;
-+      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t buf[16];
-+      getVSR128(xs, buf);
-+      uint16_t halfword =
-+          (uint16_t)buf[8] | ((uint16_t)buf[9] << 8);
-+      writeH(ea, halfword, instr);
-+      break;
-+    }
-+    case 844: {
-+      // lxvd2x: XT = MEM((RA|0)+RB, 16) with BE-DW byte ordering.
-+      // Memory bytes 0-7 land in BE-DW0 (= LE bytes 8-15); memory
-+      // bytes 8-15 land in BE-DW1 (= LE bytes 0-7).
-+      uint64_t ea = XFormEA(this, instr);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t mem[16], buf[16];
-+      memcpy(mem, reinterpret_cast<const void*>(ea), 16);
-+      memcpy(buf, mem + 8, 8);
-+      memcpy(buf + 8, mem, 8);
-+      setVSR128(xt, buf);
-+      break;
-+    }
-+    case 972: {
-+      // stxvd2x: MEM((RA|0)+RB, 16) = XS with BE-DW byte ordering.
-+      // Inverse of lxvd2x: register LE bytes 0-7 → memory bytes 8-15;
-+      // LE bytes 8-15 → memory bytes 0-7.
-+      uint64_t ea = XFormEA(this, instr);
-+      if (handleWasmSegFault(ea, 16)) break;
-+      int xs = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t buf[16], mem[16];
-+      getVSR128(xs, buf);
-+      memcpy(mem, buf + 8, 8);
-+      memcpy(mem + 8, buf, 8);
-+      memcpy(reinterpret_cast<void*>(ea), mem, 16);
-+      break;
-+    }
-+
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF(
-+          "decodeXForm: unimplemented XO=%u (instruction 0x%08x)", xo,
-+          instr->instructionBits());
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeRotateMask: rlwinm(21), rlwnm(23), rlwimi(20),
-+//   rldicl(30), rldicr(30), rldic(30), rldimi(30), rldcl(30), rldcr(30)
-+
-+// Executes the rotate-and-mask family: the 32-bit M-form ops rlwimi (20),
-+// rlwinm (21) and rlwnm (23), and the 64-bit MD/MDS-form ops that share
-+// primary opcode 30 (rldicl, rldicr, rldic, rldimi, rldcl, rldcr). Every op
-+// writes RA and, when the Rc bit is set, updates CR0 from the 64-bit value
-+// written. Any other primary opcode crashes.
-+//
-+// instr: the instruction to execute; its primary opcode selects the op.
-+void Simulator::decodeRotateMask(SimInstruction* instr) {
-+  const uint32_t opcode = instr->opcode();
-+  const uint32_t ra_reg = instr->raValue();
-+
-+  // Shared tail: store the value in RA and update CR0 for the Rc=1 forms.
-+  auto writeBack = [this, instr, ra_reg](uint64_t value) {
-+    int64_t result = I64(value);
-+    setRegister(ra_reg, result);
-+    if (instr->rcBit()) updateCR0(result);
-+  };
-+
-+  if (opcode == 20 || opcode == 21 || opcode == 23) {
-+    // rlwimi / rlwinm / rlwnm. In 64-bit mode the Power ISA defines these as
-+    //   r = ROTL32(RS[32:63], n)  -- the rotated word replicated in both
-+    //                                halves of a doubleword
-+    //   m = MASK(MB+32, ME+32)    -- wraps through bits 0..31 when MB > ME
-+    // so the high word of RA is not unconditionally zero:
-+    //   * MB <= ME: the mask does not reach the high word. rlwinm/rlwnm
-+    //     produce 0 there; rlwimi keeps RA's existing high word.
-+    //   * MB >  ME: the mask covers the whole high word, which therefore
-+    //     receives the rotated word.
-+    // The low word is computed with MASK32 exactly as before.
-+    uint32_t rs_val = U32(getRegister(instr->rsValue()));
-+    // rlwnm takes the rotate count from the low five bits of RB; the other
-+    // two ops take it from the SH field.
-+    uint32_t sh = (opcode == 23)
-+                      ? (U32(getRegister(instr->rbValue())) & 0x1F)
-+                      : instr->mSHValue();
-+    uint32_t mb = instr->mMBValue();
-+    uint32_t me = instr->mMEValue();
-+    uint32_t rotated = RotateLeft32(rs_val, sh);
-+    uint32_t mask = MASK32(mb, me);
-+    bool wraps = mb > me;
-+
-+    uint64_t lo = rotated & mask;
-+    uint64_t hi = wraps ? rotated : 0;
-+    if (opcode == 20) {
-+      // rlwimi: bits outside the mask come from the old RA value.
-+      uint64_t ra_val = U64(getRegister(ra_reg));
-+      lo |= static_cast<uint32_t>(ra_val) & ~mask;
-+      if (!wraps) hi = ra_val >> 32;
-+    }
-+    writeBack((hi << 32) | lo);
-+  } else if (opcode == 30) {
-+    // 64-bit rotate/mask. instr->bit(n) counts from the least-significant
-+    // instruction bit. Bit 4 separates the two encodings:
-+    //   bit 4 == 1: MDS-form, rotate count in RB; bit 1 picks rldcl (0)
-+    //               or rldcr (1).
-+    //   bit 4 == 0: MD-form, immediate rotate count; bits 3..2 pick
-+    //               rldicl (0), rldicr (1), rldic (2), rldimi (3).
-+    // NOTE(review): in MD-form the ISA puts the high bit of SH at bit 1;
-+    // mdSHValue() presumably folds it in -- confirm against its definition.
-+    uint64_t rs_val = U64(getRegister(instr->rsValue()));
-+
-+    if (instr->bit(4)) {
-+      uint32_t sh = U32(getRegister(instr->rbValue())) & 0x3F;
-+      uint64_t rotated = RotateLeft64(rs_val, sh);
-+      // The same instruction field holds MB for rldcl and ME for rldcr.
-+      uint32_t mbe = instr->mdsMBValue();
-+      // rldcl keeps bits mb..63; rldcr keeps bits 0..me.
-+      uint64_t mask = instr->bit(1) ? MASK64(0, mbe) : MASK64(mbe, 63);
-+      writeBack(rotated & mask);
-+    } else {
-+      uint32_t sh = instr->mdSHValue();
-+      uint64_t rotated = RotateLeft64(rs_val, sh);
-+      uint32_t xo_md = instr->bits(3, 2);
-+
-+      switch (xo_md) {
-+        case 0: {
-+          // rldicl: RA = ROTL64(RS, SH) & MASK(mb, 63)
-+          uint32_t mb = instr->mdMBValue();
-+          writeBack(rotated & MASK64(mb, 63));
-+          break;
-+        }
-+        case 1: {
-+          // rldicr: RA = ROTL64(RS, SH) & MASK(0, me)
-+          uint32_t me = instr->mdMEValue();
-+          writeBack(rotated & MASK64(0, me));
-+          break;
-+        }
-+        case 2: {
-+          // rldic: RA = ROTL64(RS, SH) & MASK(mb, 63 - SH)
-+          uint32_t mb = instr->mdMBValue();
-+          writeBack(rotated & MASK64(mb, 63 - sh));
-+          break;
-+        }
-+        case 3: {
-+          // rldimi: RA = (ROTL64(RS, SH) & m) | (RA & ~m),
-+          //         m = MASK(mb, 63 - SH)
-+          uint32_t mb = instr->mdMBValue();
-+          uint64_t mask = MASK64(mb, 63 - sh);
-+          uint64_t ra_val = U64(getRegister(ra_reg));
-+          writeBack((rotated & mask) | (ra_val & ~mask));
-+          break;
-+        }
-+        default:
-+          // A two-bit field cannot take any other value; kept as a guard.
-+          MOZ_CRASH_UNSAFE_PRINTF("decodeRotateMask: MD xo=%u", xo_md);
-+      }
-+    }
-+  } else {
-+    MOZ_CRASH_UNSAFE_PRINTF("decodeRotateMask: opcode=%u", opcode);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// CR-bit accessors used by the XL-form CR-logic ops (crand, crandc, cror,
-+// crorc, crxor, creqv). Bit index is in BIF*4+x form: field=b/4, bit=b%4
-+// where 0=LT, 1=GT, 2=EQ, 3=SO.
-+// Maps a bit position inside a 4-bit CR field (0=LT, 1=GT, 2=EQ, 3=SO) to
-+// the matching kCRField* mask. Any other position yields 0.
-+static inline uint8_t CRBitMask(uint32_t bitInField) {
-+  if (bitInField == 0) {
-+    return kCRFieldLT;
-+  }
-+  if (bitInField == 1) {
-+    return kCRFieldGT;
-+  }
-+  if (bitInField == 2) {
-+    return kCRFieldEQ;
-+  }
-+  if (bitInField == 3) {
-+    return kCRFieldSO;
-+  }
-+  return 0;
-+}
-+
-+// Reports whether CR bit `b` is set. `b` is a flat index: b / 4 selects the
-+// CR field and b % 4 selects the bit within it.
-+static inline bool GetCRBit(Simulator& s, uint32_t b) {
-+  const uint8_t fieldBits = s.getCRField(b / 4);
-+  const uint8_t wanted = CRBitMask(b % 4);
-+  return (fieldBits & wanted) != 0;
-+}
-+
-+// Sets (val == true) or clears (val == false) CR bit `b`, leaving the other
-+// bits of the same CR field as they were. `b` is a flat index: b / 4 selects
-+// the CR field and b % 4 selects the bit within it.
-+static inline void SetCRBit(Simulator& s, uint32_t b, bool val) {
-+  const uint32_t fieldIndex = b / 4;
-+  const uint8_t current = s.getCRField(fieldIndex);
-+  const uint8_t bitMask = CRBitMask(b % 4);
-+  if (val) {
-+    s.setCRField(fieldIndex, current | bitMask);
-+  } else {
-+    s.setCRField(fieldIndex, current & ~bitMask);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeBranch: b(18), bc(16), XL-form(19)
-+
-+void Simulator::decodeBranch(SimInstruction* instr) {
-+  uint32_t opcode = instr->opcode();
-+
-+  if (opcode == 18) {
-+    // b / bl: I-form unconditional branch
-+    int32_t offset = instr->li26Value();
-+    bool lk = instr->lkBit();
-+    bool aa = instr->aaBit();
-+
-+    int64_t target;
-+    if (aa) {
-+      target = (int64_t)offset;
-+    } else {
-+      target = get_pc() + (int64_t)offset;
-+    }
-+
-+    if (lk) {
-+      setLR(get_pc() + SimInstruction::kInstrSize);
-+    }
-+
-+    set_pc(target);
-+    return;
-+  }
-+
-+  if (opcode == 16) {
-+    // bc / bcl: B-form conditional branch
-+    uint32_t bo = instr->boValue();
-+    uint32_t bi = instr->biValue();
-+    int32_t bd = instr->bd16Value();
-+    bool lk = instr->lkBit();
-+    bool aa = instr->aaBit();
-+
-+    // Decrement CTR if BO[2] (bit 2 of BO, which is bo & 0x04) is clear.
-+    if (!(bo & 0x04)) {
-+      setCTR(getCTR() - 1);
-+    }
-+
-+    // Evaluate CTR condition.
-+    bool ctr_ok = (bo & 0x04) ||
-+                  ((getCTR() != 0) ^ ((bo & 0x02) != 0));
-+
-+    // Evaluate CR condition.
-+    uint32_t crField = bi / 4;
-+    uint32_t crBit = bi % 4;
-+    uint8_t crFieldVal = getCRField(crField);
-+    bool crBitSet;
-+    switch (crBit) {
-+      case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
-+      case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
-+      case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
-+      case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
-+      default: crBitSet = false; break;
-+    }
-+    bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
-+
-+    if (ctr_ok && cond_ok) {
-+      int64_t target;
-+      if (aa) {
-+        target = (int64_t)bd;
-+      } else {
-+        target = get_pc() + (int64_t)bd;
-+      }
-+      if (lk) {
-+        setLR(get_pc() + SimInstruction::kInstrSize);
-+      }
-+      set_pc(target);
-+    } else {
-+      // Branch not taken.
-+      set_pc(get_pc() + SimInstruction::kInstrSize);
-+    }
-+    return;
-+  }
-+
-+  if (opcode == 19) {
-+    // XL-form: bclr, bcctr, crand, crandc, cror, crorc, crxor, creqv,
-+    //          mcrf, isync
-+    uint32_t xl = instr->xlValue();
-+
-+    switch (xl) {
-+      case 16: {
-+        // bclr: conditional branch to LR
-+        uint32_t bo = instr->boValue();
-+        uint32_t bi = instr->biValue();
-+        bool lk = instr->lkBit();
-+
-+        if (!(bo & 0x04)) {
-+          setCTR(getCTR() - 1);
-+        }
-+
-+        bool ctr_ok = (bo & 0x04) ||
-+                      ((getCTR() != 0) ^ ((bo & 0x02) != 0));
-+
-+        uint32_t crField = bi / 4;
-+        uint32_t crBit = bi % 4;
-+        uint8_t crFieldVal = getCRField(crField);
-+        bool crBitSet;
-+        switch (crBit) {
-+          case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
-+          case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
-+          case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
-+          case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
-+          default: crBitSet = false; break;
-+        }
-+        bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
-+
-+        if (ctr_ok && cond_ok) {
-+          int64_t target = getLR() & ~3LL;
-+          if (lk) {
-+            setLR(get_pc() + SimInstruction::kInstrSize);
-+          }
-+          set_pc(target);
-+        } else {
-+          set_pc(get_pc() + SimInstruction::kInstrSize);
-+        }
-+        break;
-+      }
-+      case 528: {
-+        // bcctr: conditional branch to CTR
-+        uint32_t bo = instr->boValue();
-+        uint32_t bi = instr->biValue();
-+        bool lk = instr->lkBit();
-+
-+        // CTR is not decremented for bcctr.
-+        uint32_t crField = bi / 4;
-+        uint32_t crBit = bi % 4;
-+        uint8_t crFieldVal = getCRField(crField);
-+        bool crBitSet;
-+        switch (crBit) {
-+          case 0: crBitSet = (crFieldVal & kCRFieldLT) != 0; break;
-+          case 1: crBitSet = (crFieldVal & kCRFieldGT) != 0; break;
-+          case 2: crBitSet = (crFieldVal & kCRFieldEQ) != 0; break;
-+          case 3: crBitSet = (crFieldVal & kCRFieldSO) != 0; break;
-+          default: crBitSet = false; break;
-+        }
-+        bool cond_ok = (bo & 0x10) || (crBitSet == ((bo & 0x08) != 0));
-+
-+        if (cond_ok) {
-+          int64_t target = getCTR() & ~3LL;
-+          if (lk) {
-+            setLR(get_pc() + SimInstruction::kInstrSize);
-+          }
-+          set_pc(target);
-+        } else {
-+          set_pc(get_pc() + SimInstruction::kInstrSize);
-+        }
-+        break;
-+      }
-+      case 257: {
-+        // crand: CR[BT] = CR[BA] & CR[BB]
-+        uint32_t bt = instr->rtValue();
-+        uint32_t ba = instr->raValue();
-+        uint32_t bb = instr->rbValue();
-+        SetCRBit(*this, bt, GetCRBit(*this, ba) && GetCRBit(*this, bb));
-+        break;
-+      }
-+      case 129: {
-+        // crandc: CR[BT] = CR[BA] & ~CR[BB]
-+        uint32_t bt = instr->rtValue();
-+        uint32_t ba = instr->raValue();
-+        uint32_t bb = instr->rbValue();
-+        SetCRBit(*this, bt, GetCRBit(*this, ba) && !GetCRBit(*this, bb));
-+        break;
-+      }
-+      case 449: {
-+        // cror: CR[BT] = CR[BA] | CR[BB]
-+        uint32_t bt = instr->rtValue();
-+        uint32_t ba = instr->raValue();
-+        uint32_t bb = instr->rbValue();
-+        SetCRBit(*this, bt, GetCRBit(*this, ba) || GetCRBit(*this, bb));
-+        break;
-+      }
-+      case 417: {
-+        // crorc: CR[BT] = CR[BA] | ~CR[BB]
-+        uint32_t bt = instr->rtValue();
-+        uint32_t ba = instr->raValue();
-+        uint32_t bb = instr->rbValue();
-+        SetCRBit(*this, bt, GetCRBit(*this, ba) || !GetCRBit(*this, bb));
-+        break;
-+      }
-+      case 193: {
-+        // crxor: CR[BT] = CR[BA] ^ CR[BB]
-+        uint32_t bt = instr->rtValue();
-+        uint32_t ba = instr->raValue();
-+        uint32_t bb = instr->rbValue();
-+        SetCRBit(*this, bt, GetCRBit(*this, ba) ^ GetCRBit(*this, bb));
-+        break;
-+      }
-+      case 289: {
-+        // creqv: CR[BT] = ~(CR[BA] ^ CR[BB])
-+        uint32_t bt = instr->rtValue();
-+        uint32_t ba = instr->raValue();
-+        uint32_t bb = instr->rbValue();
-+        SetCRBit(*this, bt, !(GetCRBit(*this, ba) ^ GetCRBit(*this, bb)));
-+        break;
-+      }
-+      case 150: {
-+        // isync: no-op in simulator
-+        break;
-+      }
-+      case 370: {
-+        // PPC_stop (0x4C0002E4) decoded as XL-form opcode 19, XL=370.
-+        // This is our kCallRedirInstr. Handle via softwareInterrupt.
-+        softwareInterrupt(instr);
-+        break;
-+      }
-+      case 2: {
-+        // POWER9 addpcis rT, D (DX-form). Computes rT = (CIA + 4) +
-+        // (sext16(D) << 16). The 16-bit signed displacement D is split
-+        // across three sub-fields:
-+        //   d0 = bits LE 6..15 (10 bits) — D[15:6]
-+        //   d1 = bits LE 16..20 (5 bits)  — D[5:1]
-+        //   d2 = bit  LE 0      (1 bit)   — D[0]
-+        // (Mirrors the encoder in Assembler-ppc64.cpp:as_addpcis.)
-+        uint32_t rt = instr->rtValue();
-+        uint32_t d0 = instr->bits(15, 6);
-+        uint32_t d1 = instr->bits(20, 16);
-+        uint32_t d2 = instr->bit(0);
-+        int16_t D = (int16_t)((d0 << 6) | (d1 << 1) | d2);
-+        int64_t cia = reinterpret_cast<int64_t>(instr);
-+        setRegister(rt, cia + SimInstruction::kInstrSize +
-+                            (static_cast<int64_t>(D) << 16));
-+        break;
-+      }
-+      default:
-+        MOZ_CRASH_UNSAFE_PRINTF("decodeBranch: XL opcode 19, xl=%u", xl);
-+    }
-+    return;
-+  }
-+
-+  MOZ_CRASH_UNSAFE_PRINTF("decodeBranch: opcode=%u", opcode);
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeFP: Major opcodes 59 (A-form single) and 63 (X-form / A-form double)
-+
-+void Simulator::decodeFP(SimInstruction* instr) {
-+  uint32_t opcode = instr->opcode();
-+  uint32_t rt = instr->rtValue();  // FRT
-+  uint32_t ra = instr->raValue();  // FRA
-+  uint32_t rb = instr->rbValue();  // FRB
-+  uint32_t rc_reg = instr->rcValue();  // FRC (A-form)
-+
-+  if (opcode == 63) {
-+    // X-form and A-form double-precision instructions.
-+    // For A-form, the sub-opcode is in bits 1..5.
-+    // For X-form, the sub-opcode is in bits 1..10.
-+    uint32_t xo_a = instr->bits(5, 1);  // A-form sub-opcode
-+    uint32_t xo_x = instr->bits(10, 1); // X-form sub-opcode
-+
-+    // Try A-form first (5-bit sub-opcode in bits 1..5).
-+    switch (xo_a) {
-+      case 21: {
-+        // fadd
-+        double result = getFpuRegisterDouble(ra) + getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 20: {
-+        // fsub
-+        double result = getFpuRegisterDouble(ra) - getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 25: {
-+        // fmul: FRT = FRA * FRC (note: FRC, not FRB!)
-+        double result = getFpuRegisterDouble(ra) * getFpuRegisterDouble(rc_reg);
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 18: {
-+        // fdiv
-+        double result = getFpuRegisterDouble(ra) / getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 22: {
-+        // fsqrt
-+        double result = sqrt(getFpuRegisterDouble(rb));
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 29: {
-+        // fmadd: FRT = FRA * FRC + FRB
-+        double result = std::fma(getFpuRegisterDouble(ra),
-+                                 getFpuRegisterDouble(rc_reg),
-+                                 getFpuRegisterDouble(rb));
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 30: {
-+        // fnmsub: FRT = -(FRA * FRC - FRB)
-+        double result = -(std::fma(getFpuRegisterDouble(ra),
-+                                   getFpuRegisterDouble(rc_reg),
-+                                   -getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 28: {
-+        // fmsub: FRT = FRA * FRC - FRB
-+        double result = std::fma(getFpuRegisterDouble(ra),
-+                                 getFpuRegisterDouble(rc_reg),
-+                                 -getFpuRegisterDouble(rb));
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 31: {
-+        // fnmadd: FRT = -(FRA * FRC + FRB)
-+        double result = -(std::fma(getFpuRegisterDouble(ra),
-+                                   getFpuRegisterDouble(rc_reg),
-+                                   getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+      case 23: {
-+        // fsel: FRT = (FRA >= 0) ? FRC : FRB
-+        double fra = getFpuRegisterDouble(ra);
-+        setFpuRegisterDouble(rt, (fra >= 0.0) ? getFpuRegisterDouble(rc_reg)
-+                                               : getFpuRegisterDouble(rb));
-+        return;
-+      }
-+      case 26: {
-+        // frsqrte: FRT = 1.0 / sqrt(FRB) (estimate)
-+        double result = 1.0 / sqrt(getFpuRegisterDouble(rb));
-+        setFpuRegisterDouble(rt, result);
-+        return;
-+      }
-+    }
-+
-+    // X-form (10-bit sub-opcode).
-+    switch (xo_x) {
-+      case 72: {
-+        // fmr: FRT = FRB
-+        setFpuRegisterDouble(rt, getFpuRegisterDouble(rb));
-+        break;
-+      }
-+      case 40: {
-+        // fneg: FRT = -FRB
-+        setFpuRegisterDouble(rt, -getFpuRegisterDouble(rb));
-+        break;
-+      }
-+      case 264: {
-+        // fabs: FRT = |FRB|
-+        setFpuRegisterDouble(rt, fabs(getFpuRegisterDouble(rb)));
-+        break;
-+      }
-+      case 136: {
-+        // fnabs: FRT = -|FRB|
-+        setFpuRegisterDouble(rt, -fabs(getFpuRegisterDouble(rb)));
-+        break;
-+      }
-+      case 8: {
-+        // fcpsgn: FRT = sign(FRA) || magnitude(FRB)
-+        double fra = getFpuRegisterDouble(ra);
-+        double frb = getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, std::copysign(frb, fra));
-+        break;
-+      }
-+      case 0: {
-+        // fcmpu: compare FRA, FRB unordered
-+        uint32_t bf = instr->bfValue();
-+        double fra = getFpuRegisterDouble(ra);
-+        double frb = getFpuRegisterDouble(rb);
-+        uint8_t field = 0;
-+        if (std::isnan(fra) || std::isnan(frb)) {
-+          field = kCRFieldSO;
-+        } else if (fra < frb) {
-+          field = kCRFieldLT;
-+        } else if (fra > frb) {
-+          field = kCRFieldGT;
-+        } else {
-+          field = kCRFieldEQ;
-+        }
-+        setCRField(bf, field);
-+        break;
-+      }
-+      case 32: {
-+        // fcmpo: compare FRA, FRB ordered
-+        uint32_t bf = instr->bfValue();
-+        double fra = getFpuRegisterDouble(ra);
-+        double frb = getFpuRegisterDouble(rb);
-+        uint8_t field = 0;
-+        if (std::isnan(fra) || std::isnan(frb)) {
-+          field = kCRFieldSO;
-+        } else if (fra < frb) {
-+          field = kCRFieldLT;
-+        } else if (fra > frb) {
-+          field = kCRFieldGT;
-+        } else {
-+          field = kCRFieldEQ;
-+        }
-+        setCRField(bf, field);
-+        break;
-+      }
-+      // For fctid* and fctiw* the ISA specifies that bit 23 of FPSCR (VXCVI,
-+      // "invalid op for integer convert") is set when the source is NaN, +Inf,
-+      // -Inf, or out of the destination's range. Wasm's out-of-range trap
-+      // sequence is `mtfsb0 23; fctidz; mfvsrd; mcrfs cr0,5; bt SOBit,trap`,
-+      // so the simulator MUST update VXCVI here for the trap to fire. With
-+      // FPSCR_ in the low-half PPC layout (PPC bit N → int64 bit (31-N)),
-+      // VXCVI lives at int64 bit (31-23) = 8.
-+      case 814: {
-+        // fctid: convert double to int64 (current rounding)
-+        double frb = getFpuRegisterDouble(rb);
-+        int64_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = INT64_MIN;
-+          invalid = true;
-+        } else if (frb >= -(double)INT64_MIN || frb < (double)INT64_MIN) {
-+          result = (frb < 0) ? INT64_MIN : INT64_MAX;
-+          invalid = true;
-+        } else {
-+          switch (FPSCR_ & kFPSCRRNMask) {
-+            case RN: result = (int64_t)llrint(frb); break;
-+            case RZ: result = (int64_t)frb; break;
-+            case RP: result = (int64_t)ceil(frb); break;
-+            case RM: result = (int64_t)floor(frb); break;
-+            default: result = (int64_t)frb; break;
-+          }
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, result);
-+        break;
-+      }
-+      case 815: {
-+        // fctidz: convert double to int64 (round toward zero)
-+        double frb = getFpuRegisterDouble(rb);
-+        int64_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = INT64_MIN;
-+          invalid = true;
-+        } else if (frb >= -(double)INT64_MIN) {
-+          result = INT64_MAX;
-+          invalid = true;
-+        } else if (frb < (double)INT64_MIN) {
-+          result = INT64_MIN;
-+          invalid = true;
-+        } else {
-+          result = (int64_t)frb;
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, result);
-+        break;
-+      }
-+      case 942: {
-+        // fctidu: convert double to uint64 (current rounding).
-+        // VXCVI is signaled when source is NaN, ±Inf, or the rounded value
-+        // is outside [0, 2^64-1]. Notably, a negative source whose rounded
-+        // value is 0 (e.g. -0.4 in RN, or any value in (-1, 0) in RZ) is
-+        // NOT invalid.
-+        double frb = getFpuRegisterDouble(rb);
-+        uint64_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = 0;
-+          invalid = true;
-+        } else if (frb >= -2.0 * (double)INT64_MIN /* 2^64 */) {
-+          result = UINT64_MAX;
-+          invalid = true;
-+        } else {
-+          double rounded;
-+          switch (FPSCR_ & kFPSCRRNMask) {
-+            case RN: rounded = nearbyint(frb); break;
-+            case RZ: rounded = trunc(frb); break;
-+            case RP: rounded = ceil(frb); break;
-+            case RM: rounded = floor(frb); break;
-+            default: rounded = trunc(frb); break;
-+          }
-+          if (rounded < 0.0) {
-+            result = 0;
-+            invalid = true;
-+          } else {
-+            result = (uint64_t)rounded;
-+          }
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, I64(result));
-+        break;
-+      }
-+      case 943: {
-+        // fctiduz: convert double to uint64 (round toward zero).
-+        // Same VXCVI rule as fctidu but rounding is fixed to truncate
-+        // toward zero. Source in (-1, 0) truncates to 0 — VALID.
-+        double frb = getFpuRegisterDouble(rb);
-+        uint64_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = 0;
-+          invalid = true;
-+        } else if (frb >= -2.0 * (double)INT64_MIN /* 2^64 */) {
-+          result = UINT64_MAX;
-+          invalid = true;
-+        } else if (frb <= -1.0) {
-+          // Truncated value is negative — invalid for unsigned.
-+          result = 0;
-+          invalid = true;
-+        } else {
-+          // Source is in (-1, 2^64); truncation toward zero yields a value
-+          // in [0, 2^64).
-+          result = (uint64_t)trunc(frb);
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, I64(result));
-+        break;
-+      }
-+      case 14: {
-+        // fctiw: convert double to int32 (current rounding).
-+        // Invalid range: rounded value < INT32_MIN or > INT32_MAX. The
-+        // check is made on the ROUNDED value, not the source: a source in
-+        // (-2^31-1, -2^31) is valid only when the active rounding mode takes
-+        // it to -2^31 (e.g. -2147483648.4 in RN/RZ/RP, but not in RM).
-+        double frb = getFpuRegisterDouble(rb);
-+        int32_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = INT32_MIN;
-+          invalid = true;
-+        } else {
-+          double rounded;
-+          switch (FPSCR_ & kFPSCRRNMask) {
-+            case RN: rounded = nearbyint(frb); break;
-+            case RZ: rounded = trunc(frb); break;
-+            case RP: rounded = ceil(frb); break;
-+            case RM: rounded = floor(frb); break;
-+            default: rounded = trunc(frb); break;
-+          }
-+          if (rounded > (double)INT32_MAX) {
-+            result = INT32_MAX;
-+            invalid = true;
-+          } else if (rounded < (double)INT32_MIN) {
-+            result = INT32_MIN;
-+            invalid = true;
-+          } else {
-+            result = (int32_t)rounded;
-+          }
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, (int64_t)result);
-+        break;
-+      }
-+      case 15: {
-+        // fctiwz: convert double to int32 (round toward zero).
-+        // Truncation of a value in (-2^31-1, INT32_MIN) toward zero gives
-+        // INT32_MIN — valid. Only a source <= -2^31-1 (or >= 2^31) is out
-+        // of range, so the simplest correct test is to truncate first and
-+        // then range-check the truncated value.
-+        double frb = getFpuRegisterDouble(rb);
-+        int32_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = INT32_MIN;
-+          invalid = true;
-+        } else {
-+          double truncated = trunc(frb);
-+          if (truncated > (double)INT32_MAX) {
-+            result = INT32_MAX;
-+            invalid = true;
-+          } else if (truncated < (double)INT32_MIN) {
-+            result = INT32_MIN;
-+            invalid = true;
-+          } else {
-+            result = (int32_t)truncated;
-+          }
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, (int64_t)result);
-+        break;
-+      }
-+      case 142: {
-+        // fctiwu: convert double to uint32 (current rounding). The check is
-+        // on the ROUNDED value: VXCVI iff rounded < 0 or rounded > UINT32_MAX.
-+        double frb = getFpuRegisterDouble(rb);
-+        uint32_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = 0;
-+          invalid = true;
-+        } else {
-+          double rounded;
-+          switch (FPSCR_ & kFPSCRRNMask) {
-+            case RN: rounded = nearbyint(frb); break;
-+            case RZ: rounded = trunc(frb); break;
-+            case RP: rounded = ceil(frb); break;
-+            case RM: rounded = floor(frb); break;
-+            default: rounded = trunc(frb); break;
-+          }
-+          if (rounded < 0.0) {
-+            result = 0;
-+            invalid = true;
-+          } else if (rounded > (double)UINT32_MAX) {
-+            result = UINT32_MAX;
-+            invalid = true;
-+          } else {
-+            result = (uint32_t)rounded;
-+          }
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, (int64_t)(uint64_t)result);
-+        break;
-+      }
-+      case 143: {
-+        // fctiwuz: convert double to uint32 (round toward zero).
-+        // Source in (-1, 0) truncates to 0 — VALID.
-+        double frb = getFpuRegisterDouble(rb);
-+        uint32_t result;
-+        bool invalid = false;
-+        if (std::isnan(frb)) {
-+          result = 0;
-+          invalid = true;
-+        } else {
-+          double truncated = trunc(frb);
-+          if (truncated > (double)UINT32_MAX) {
-+            result = UINT32_MAX;
-+            invalid = true;
-+          } else if (truncated < 0.0) {
-+            result = 0;
-+            invalid = true;
-+          } else {
-+            result = (uint32_t)truncated;
-+          }
-+        }
-+        if (invalid) FPSCR_ |= (1ULL << 8);  /* VXCVI: PPC bit 23 in low-half layout */
-+        setFpuRegister(rt, (int64_t)(uint64_t)result);
-+        break;
-+      }
-+      case 846: {
-+        // fcfid: convert int64 in FPR to double
-+        int64_t val = getFpuRegister(rb);
-+        setFpuRegisterDouble(rt, (double)val);
-+        break;
-+      }
-+      case 974: {
-+        // fcfidu: convert uint64 in FPR to double
-+        uint64_t val = U64(getFpuRegister(rb));
-+        setFpuRegisterDouble(rt, (double)val);
-+        break;
-+      }
-+      case 12: {
-+        // frsp: round double to single precision (then re-extend in FPR).
-+        // sNaN inputs are quieted (the result payload MSB is set).
-+        // wasm f32.demote_f64 lowers to this op when
-+        // not using xscvdpsp directly.
-+        double frb = getFpuRegisterDouble(rb);
-+        float result = demoteDoublePreservingNaN(frb);
-+        uint32_t fbits;
-+        memcpy(&fbits, &result, sizeof(fbits));
-+        if ((fbits & 0x7F800000u) == 0x7F800000u &&
-+            (fbits & 0x007FFFFFu) != 0) {
-+          fbits |= 0x00400000u;
-+          memcpy(&result, &fbits, sizeof(result));
-+        }
-+        setFpuRegisterDouble(rt, promoteFloatPreservingNaN(result));
-+        break;
-+      }
-+      case 392: {
-+        // frin: round to nearest integer (ties away from zero)
-+        double frb = getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, round(frb));
-+        break;
-+      }
-+      case 424: {
-+        // friz: round toward zero
-+        double frb = getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, trunc(frb));
-+        break;
-+      }
-+      case 456: {
-+        // frip: round toward +infinity (ceil). XO=456.
-+        double frb = getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, ceil(frb));
-+        break;
-+      }
-+      case 488: {
-+        // frim: round toward -infinity (floor). XO=488.
-+        double frb = getFpuRegisterDouble(rb);
-+        setFpuRegisterDouble(rt, floor(frb));
-+        break;
-+      }
-+      case 583: {
-+        // mffs: FRT = FPSCR (as double bit pattern)
-+        setFpuRegister(rt, I64(FPSCR_));
-+        break;
-+      }
-+      // FPSCR is treated as a 32-bit register stored in the low 32 bits of
-+      // FPSCR_ (uint64_t), with PPC bit numbering: PPC bit N (where bit 0 is
-+      // the MSB) lives at int64 bit (31-N). Field F (4 bits) covers PPC bits
-+      // 4F..4F+3 → int64 bit-LSB (28-4F) to bit-MSB (31-4F). This matches
-+      // mcrfs, mtfsfi, kFPSCRRNMask (which checks bits 30-31 PPC = int64 bits
-+      // 0-1), and mffs (which copies FPSCR into FPR bits 32..63 PPC = int64
-+      // bits 0..31). Earlier mtfsb0/mtfsb1 used (63-bt) which placed bits in
-+      // the high half of FPSCR_ where mcrfs etc. would never see them — so
-+      // the wasm trap sequence `mtfsb0 23; fctidz; mcrfs cr0,5; bt SO,oolEntry`
-+      // could not detect VXCVI.
-+      case 70: {
-+        // mtfsb0: clear FPSCR bit. XO=70.
-+        // (An earlier revision had the labels for XO=38 (mtfsb1) and XO=70
-+        // swapped, so wasm's `mtfsb0 23; fctidz; mcrfs cr0,5; bt SO,trap`
-+        // SET VXCVI before the convert ran, causing every fctid* to trap.)
-+        uint32_t bt = instr->rtValue();
-+        FPSCR_ &= ~(1ULL << (31 - bt));
-+        break;
-+      }
-+      case 64: {
-+        // mcrfs: copy FPSCR field to CR field
-+        uint32_t bf = instr->bfValue();
-+        uint32_t bfa = instr->bits(20, 18);
-+        uint32_t shift = 4 * (7 - bfa);
-+        uint8_t val = (FPSCR_ >> shift) & 0xF;
-+        setCRField(bf, val);
-+        break;
-+      }
-+      default:
-+        MOZ_CRASH_UNSAFE_PRINTF(
-+            "decodeFP: opcode 63, xo_x=%u (instruction 0x%08x)", xo_x,
-+            instr->instructionBits());
-+    }
-+  } else if (opcode == 59) {
-+    // A-form single-precision instructions.
-+    uint32_t xo_a = instr->bits(5, 1);
-+
-+    switch (xo_a) {
-+      case 21: {
-+        // fadds
-+        double result = (double)((float)(getFpuRegisterDouble(ra) +
-+                                         getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 20: {
-+        // fsubs
-+        double result = (double)((float)(getFpuRegisterDouble(ra) -
-+                                         getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 25: {
-+        // fmuls: FRT = (float)(FRA * FRC)
-+        double result = (double)((float)(getFpuRegisterDouble(ra) *
-+                                         getFpuRegisterDouble(rc_reg)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 18: {
-+        // fdivs
-+        double result = (double)((float)(getFpuRegisterDouble(ra) /
-+                                         getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 22: {
-+        // fsqrts
-+        double result = (double)sqrtf((float)getFpuRegisterDouble(rb));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 29: {
-+        // fmadds
-+        double result = (double)((float)std::fma(getFpuRegisterDouble(ra),
-+                                                 getFpuRegisterDouble(rc_reg),
-+                                                 getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 30: {
-+        // fnmsubs
-+        double result = (double)(-(float)std::fma(getFpuRegisterDouble(ra),
-+                                                  getFpuRegisterDouble(rc_reg),
-+                                                  -getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 28: {
-+        // fmsubs
-+        double result = (double)((float)std::fma(getFpuRegisterDouble(ra),
-+                                                 getFpuRegisterDouble(rc_reg),
-+                                                 -getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      case 31: {
-+        // fnmadds
-+        double result = (double)(-(float)std::fma(getFpuRegisterDouble(ra),
-+                                                  getFpuRegisterDouble(rc_reg),
-+                                                  getFpuRegisterDouble(rb)));
-+        setFpuRegisterDouble(rt, result);
-+        break;
-+      }
-+      default: {
-+        // Try X-form sub-opcodes for opcode 59 (e.g., fcfids, fcfidus).
-+        uint32_t xo_x = instr->bits(10, 1);
-+        switch (xo_x) {
-+          case 846: {
-+            // fcfids: convert int64 to float single (result stored as double)
-+            int64_t val = getFpuRegister(rb);
-+            setFpuRegisterDouble(rt, (double)(float)val);
-+            break;
-+          }
-+          case 974: {
-+            // fcfidus: convert uint64 to float single
-+            uint64_t val = U64(getFpuRegister(rb));
-+            setFpuRegisterDouble(rt, (double)(float)val);
-+            break;
-+          }
-+          default:
-+            MOZ_CRASH_UNSAFE_PRINTF(
-+                "decodeFP: opcode 59, xo_a=%u xo_x=%u", xo_a, xo_x);
-+        }
-+        break;
-+      }
-+    }
-+  } else {
-+    MOZ_CRASH_UNSAFE_PRINTF("decodeFP: opcode=%u", opcode);
-+  }
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeVMX: Major opcode 4 (AltiVec/VMX vector ops on VR0-VR31).
-+//
-+// VR-form (VX-form): bits 0-5 = primary opcode (4), bits 6-10 = VRT,
-+// bits 11-15 = VRA, bits 16-20 = VRB, bits 21-31 = XO (11 bits).
-+// XO extracted via `instructionBits() & 0x7FF`.
-+//
-+// Helpers below pack/unpack each VR via the VRregisters_ byte storage
-+// (16 bytes, big-endian PPC numbering: bytes[0] is the most-significant
-+// byte of the architectural register, but on PPC64 LE wasm the lane
-+// ordering is what the JIT expects). All ops here use byte-level
-+// accessors for consistency with the existing VMX memory ops.
-+
-+void Simulator::decodeVMX(SimInstruction* instr) {
-+  uint32_t xo = instr->instructionBits() & 0x7FFu;
-+  uint32_t vrt = instr->rtValue();   // bits 6..10
-+  uint32_t vra = instr->raValue();   // bits 11..15
-+  uint32_t vrb = instr->rbValue();   // bits 16..20
-+  uint32_t uimm = instr->raValue();  // VA-form: 5-bit immediate at bits 11..15
-+
-+  uint8_t a[16], b[16], r[16];
-+  getVRBytes(vra, a);
-+  getVRBytes(vrb, b);
-+
-+  // Helpers for treating the byte storage as typed lane arrays.
-+  // The PPC64LE wasm SIMD lowering stores each lane's bytes in
-+  // little-endian order, so lane i of an N-byte element occupies bytes
-+  // (i*N) .. (i*N + N - 1) with the LSB at byte (i*N). For example,
-+  // a v128.const i32x4 0x12345678 has bytes [78 56 34 12 …].
-+  #define LANE_U8(buf, i)  ((uint8_t)(buf)[(i)])
-+  #define LANE_S8(buf, i)  ((int8_t)(buf)[(i)])
-+  #define LANE_U16(buf, i)                                     \
-+    ((uint16_t)((uint16_t)(buf)[(i) * 2] |                    \
-+                ((uint16_t)(buf)[(i) * 2 + 1] << 8)))
-+  #define LANE_S16(buf, i) ((int16_t)LANE_U16(buf, i))
-+  #define LANE_U32(buf, i)                                     \
-+    ((uint32_t)((uint32_t)(buf)[(i) * 4] |                    \
-+                ((uint32_t)(buf)[(i) * 4 + 1] << 8) |         \
-+                ((uint32_t)(buf)[(i) * 4 + 2] << 16) |        \
-+                ((uint32_t)(buf)[(i) * 4 + 3] << 24)))
-+  #define LANE_S32(buf, i) ((int32_t)LANE_U32(buf, i))
-+  #define LANE_U64(buf, i)                                     \
-+    ((uint64_t)((uint64_t)(buf)[(i) * 8] |                    \
-+                ((uint64_t)(buf)[(i) * 8 + 1] << 8) |         \
-+                ((uint64_t)(buf)[(i) * 8 + 2] << 16) |        \
-+                ((uint64_t)(buf)[(i) * 8 + 3] << 24) |        \
-+                ((uint64_t)(buf)[(i) * 8 + 4] << 32) |        \
-+                ((uint64_t)(buf)[(i) * 8 + 5] << 40) |        \
-+                ((uint64_t)(buf)[(i) * 8 + 6] << 48) |        \
-+                ((uint64_t)(buf)[(i) * 8 + 7] << 56)))
-+  #define LANE_S64(buf, i) ((int64_t)LANE_U64(buf, i))
-+  #define SET_LANE_U8(buf, i, v)  do { (buf)[(i)] = (uint8_t)(v); } while (0)
-+  #define SET_LANE_U16(buf, i, v) do {                                       \
-+      (buf)[(i) * 2]     = (uint8_t)((uint16_t)(v) & 0xFF);                 \
-+      (buf)[(i) * 2 + 1] = (uint8_t)(((uint16_t)(v) >> 8) & 0xFF);          \
-+    } while (0)
-+  #define SET_LANE_U32(buf, i, v) do {                                       \
-+      (buf)[(i) * 4]     = (uint8_t)((uint32_t)(v) & 0xFF);                 \
-+      (buf)[(i) * 4 + 1] = (uint8_t)(((uint32_t)(v) >> 8) & 0xFF);          \
-+      (buf)[(i) * 4 + 2] = (uint8_t)(((uint32_t)(v) >> 16) & 0xFF);         \
-+      (buf)[(i) * 4 + 3] = (uint8_t)(((uint32_t)(v) >> 24) & 0xFF);         \
-+    } while (0)
-+  #define SET_LANE_U64(buf, i, v) do {                                       \
-+      (buf)[(i) * 8]     = (uint8_t)((uint64_t)(v) & 0xFF);                 \
-+      (buf)[(i) * 8 + 1] = (uint8_t)(((uint64_t)(v) >> 8) & 0xFF);          \
-+      (buf)[(i) * 8 + 2] = (uint8_t)(((uint64_t)(v) >> 16) & 0xFF);         \
-+      (buf)[(i) * 8 + 3] = (uint8_t)(((uint64_t)(v) >> 24) & 0xFF);         \
-+      (buf)[(i) * 8 + 4] = (uint8_t)(((uint64_t)(v) >> 32) & 0xFF);         \
-+      (buf)[(i) * 8 + 5] = (uint8_t)(((uint64_t)(v) >> 40) & 0xFF);         \
-+      (buf)[(i) * 8 + 6] = (uint8_t)(((uint64_t)(v) >> 48) & 0xFF);         \
-+      (buf)[(i) * 8 + 7] = (uint8_t)(((uint64_t)(v) >> 56) & 0xFF);         \
-+    } while (0)
-+
-+  // --- VA-form pre-dispatch ---
-+  //
-+  // VA-form has a 6-bit XO at bits 26-31 and a 5-bit VRC at bits 21-25.
-+  // decodeVMX's 11-bit XO mask conflates VRC with XO, so a plain
-+  // `switch (xo)` over 11-bit values only matches when VRC == 0. Peel
-+  // off the VA-form ops actually used by the JIT (vmhaddshs, vmhraddshs,
-+  // vmladduhm, vmsumuhm, vmsumshm, vsel, vperm) before the main switch
-+  // so any VRC value works. vsldoi (XO=44) is VX-form with SH at bits
-+  // 22-25, not VA — handled in the switch below.
-+  {
-+    uint32_t va_xo = xo & 0x3Fu;
-+    if (va_xo == 32 || va_xo == 33 || va_xo == 34 || va_xo == 38 ||
-+        va_xo == 40 || va_xo == 42 || va_xo == 43) {
-+      uint32_t vrc = (instr->instructionBits() >> 6) & 0x1F;
-+      uint8_t cv[16];
-+      getVRBytes(vrc, cv);
-+      if (va_xo == 32) {
-+        // vmhaddshs VT,VA,VB,VC : VT[i] = sat_s16(
-+        //   (((s32)VA.h[i] * (s32)VB.h[i]) >> 15) + (s32)VC.h[i])
-+        // (no rounding term — use vmhraddshs for the rounded form).
-+        for (int i = 0; i < 8; i++) {
-+          int32_t prod = (int32_t)LANE_S16(a, i) * (int32_t)LANE_S16(b, i);
-+          int32_t sum = (prod >> 15) + (int32_t)LANE_S16(cv, i);
-+          if (sum > INT16_MAX) sum = INT16_MAX;
-+          if (sum < INT16_MIN) sum = INT16_MIN;
-+          SET_LANE_U16(r, i, (uint16_t)(int16_t)sum);
-+        }
-+      } else if (va_xo == 33) {
-+        // vmhraddshs VT,VA,VB,VC : rounded Q15 multiply-add-saturate.
-+        //   VT[i] = sat_s16((((s32)VA.h[i] * (s32)VB.h[i] + 0x4000)
-+        //                    >> 15) + (s32)VC.h[i])
-+        // Used by wasm i16x8.q15mulr_sat_s (VC is zero).
-+        for (int i = 0; i < 8; i++) {
-+          int32_t prod = (int32_t)LANE_S16(a, i) * (int32_t)LANE_S16(b, i);
-+          int32_t sum = ((prod + 0x4000) >> 15) + (int32_t)LANE_S16(cv, i);
-+          if (sum > INT16_MAX) sum = INT16_MAX;
-+          if (sum < INT16_MIN) sum = INT16_MIN;
-+          SET_LANE_U16(r, i, (uint16_t)(int16_t)sum);
-+        }
-+      } else if (va_xo == 34) {
-+        // vmladduhm VT,VA,VB,VC : VT = low16(VA*VB + VC)
-+        for (int i = 0; i < 8; i++) {
-+          uint16_t prod = LANE_U16(a, i) * LANE_U16(b, i);
-+          SET_LANE_U16(r, i, prod + LANE_U16(cv, i));
-+        }
-+      } else if (va_xo == 40) {
-+        // vmsumshm VT,VA,VB,VC : pairwise multiply-sum of signed halfwords
-+        // into i32 lanes, modulo i32 wrap.
-+        //   VT.i32[k] = VC.i32[k] + VA.i16[2k]*VB.i16[2k]
-+        //                         + VA.i16[2k+1]*VB.i16[2k+1]
-+        // Used by wasm i32x4.dot_i16x8_s with VC = 0, and by
-+        // i32x4.extadd_pairwise_i16x8_s with VB = splat(1) and VC = 0.
-+        for (int k = 0; k < 4; k++) {
-+          int32_t a0 = (int32_t)LANE_S16(a, 2 * k);
-+          int32_t a1 = (int32_t)LANE_S16(a, 2 * k + 1);
-+          int32_t b0 = (int32_t)LANE_S16(b, 2 * k);
-+          int32_t b1 = (int32_t)LANE_S16(b, 2 * k + 1);
-+          int32_t c  = LANE_S32(cv, k);
-+          int32_t result = (int32_t)((uint32_t)c + (uint32_t)(a0 * b0) +
-+                                     (uint32_t)(a1 * b1));
-+          SET_LANE_U32(r, k, (uint32_t)result);
-+        }
-+      } else if (va_xo == 38) {
-+        // vmsumuhm VT,VA,VB,VC : same as vmsumshm but unsigned halfwords.
-+        //   VT.u32[k] = VC.u32[k] + VA.u16[2k]*VB.u16[2k]
-+        //                         + VA.u16[2k+1]*VB.u16[2k+1]
-+        // Used by wasm i32x4.extadd_pairwise_i16x8_u with VB = splat(1)
-+        // and VC = 0.
-+        for (int k = 0; k < 4; k++) {
-+          uint32_t a0 = (uint32_t)LANE_U16(a, 2 * k);
-+          uint32_t a1 = (uint32_t)LANE_U16(a, 2 * k + 1);
-+          uint32_t b0 = (uint32_t)LANE_U16(b, 2 * k);
-+          uint32_t b1 = (uint32_t)LANE_U16(b, 2 * k + 1);
-+          uint32_t c  = LANE_U32(cv, k);
-+          uint32_t result = c + a0 * b0 + a1 * b1;
-+          SET_LANE_U32(r, k, result);
-+        }
-+      } else if (va_xo == 42) {
-+        // vsel VT,VA,VB,VC : VT[i] = (VC[i] & VB[i]) | (~VC[i] & VA[i])
-+        for (int i = 0; i < 16; i++) {
-+          r[i] = (uint8_t)((cv[i] & b[i]) | (~cv[i] & a[i]));
-+        }
-+      } else {
-+        // vperm VT,VA,VB,VC; empirical LE:
-+        //   idx     = VC[LE_i] & 0x1F
-+        //   r[LE_i] = (idx < 16) ? VA[LE_(15-idx)]
-+        //                        : VB[LE_(31-idx)]
-+        for (int i = 0; i < 16; i++) {
-+          uint8_t idx = cv[i] & 0x1F;
-+          r[i] = (idx < 16) ? a[15 - idx] : b[31 - idx];
-+        }
-+      }
-+      setVRBytes(vrt, r);
-+      goto vmx_done;
-+    }
-+  }
-+
-+  switch (xo) {
-+    // === Integer add (modulo) ===
-+    case 0:    // vaddubm
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) + LANE_U8(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 64:   // vadduhm
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) + LANE_U16(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 128:  // vadduwm
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, LANE_U32(a, i) + LANE_U32(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 192:  // vaddudm
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, LANE_U64(a, i) + LANE_U64(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Integer sub (modulo) ===
-+    case 1024: // vsububm
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) - LANE_U8(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1088: // vsubuhm
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) - LANE_U16(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1152: // vsubuwm
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, LANE_U32(a, i) - LANE_U32(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1216: // vsubudm
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, LANE_U64(a, i) - LANE_U64(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Integer add (saturating, signed) ===
-+    case 768:  // vaddsbs
-+      for (int i = 0; i < 16; i++) {
-+        int s = (int)LANE_S8(a, i) + (int)LANE_S8(b, i);
-+        if (s > INT8_MAX) s = INT8_MAX;
-+        if (s < INT8_MIN) s = INT8_MIN;
-+        SET_LANE_U8(r, i, (uint8_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 832:  // vaddshs
-+      for (int i = 0; i < 8; i++) {
-+        int s = (int)LANE_S16(a, i) + (int)LANE_S16(b, i);
-+        if (s > INT16_MAX) s = INT16_MAX;
-+        if (s < INT16_MIN) s = INT16_MIN;
-+        SET_LANE_U16(r, i, (uint16_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 896:  // vaddsws
-+      for (int i = 0; i < 4; i++) {
-+        int64_t s = (int64_t)LANE_S32(a, i) + (int64_t)LANE_S32(b, i);
-+        if (s > INT32_MAX) s = INT32_MAX;
-+        if (s < INT32_MIN) s = INT32_MIN;
-+        SET_LANE_U32(r, i, (uint32_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Integer add (saturating, unsigned) ===
-+    case 512:  // vaddubs
-+      for (int i = 0; i < 16; i++) {
-+        unsigned s = (unsigned)LANE_U8(a, i) + (unsigned)LANE_U8(b, i);
-+        if (s > UINT8_MAX) s = UINT8_MAX;
-+        SET_LANE_U8(r, i, (uint8_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 576:  // vadduhs
-+      for (int i = 0; i < 8; i++) {
-+        unsigned s = (unsigned)LANE_U16(a, i) + (unsigned)LANE_U16(b, i);
-+        if (s > UINT16_MAX) s = UINT16_MAX;
-+        SET_LANE_U16(r, i, (uint16_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 640:  // vadduws
-+      for (int i = 0; i < 4; i++) {
-+        uint64_t s = (uint64_t)LANE_U32(a, i) + (uint64_t)LANE_U32(b, i);
-+        if (s > UINT32_MAX) s = UINT32_MAX;
-+        SET_LANE_U32(r, i, (uint32_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Integer sub (saturating, signed) ===
-+    case 1792: // vsubsbs
-+      for (int i = 0; i < 16; i++) {
-+        int s = (int)LANE_S8(a, i) - (int)LANE_S8(b, i);
-+        if (s > INT8_MAX) s = INT8_MAX;
-+        if (s < INT8_MIN) s = INT8_MIN;
-+        SET_LANE_U8(r, i, (uint8_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1856: // vsubshs
-+      for (int i = 0; i < 8; i++) {
-+        int s = (int)LANE_S16(a, i) - (int)LANE_S16(b, i);
-+        if (s > INT16_MAX) s = INT16_MAX;
-+        if (s < INT16_MIN) s = INT16_MIN;
-+        SET_LANE_U16(r, i, (uint16_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Integer sub (saturating, unsigned) ===
-+    case 1536: // vsububs
-+      for (int i = 0; i < 16; i++) {
-+        int s = (int)LANE_U8(a, i) - (int)LANE_U8(b, i);
-+        if (s < 0) s = 0;
-+        SET_LANE_U8(r, i, (uint8_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1600: // vsubuhs
-+      for (int i = 0; i < 8; i++) {
-+        int s = (int)LANE_U16(a, i) - (int)LANE_U16(b, i);
-+        if (s < 0) s = 0;
-+        SET_LANE_U16(r, i, (uint16_t)s);
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Average unsigned (rounded: (a+b+1)>>1) ===
-+    case 1026: // vavgub
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i,
-+                    ((unsigned)LANE_U8(a, i) + LANE_U8(b, i) + 1) >> 1);
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1090: // vavguh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i,
-+                     ((unsigned)LANE_U16(a, i) + LANE_U16(b, i) + 1) >> 1);
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Vector multiply per-lane (i32x4.mul) ===
-+    case 137: { // vmuluwm: per-lane i32 multiply (low 32 bits)
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, LANE_U32(a, i) * LANE_U32(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === POWER10 vmulld: per-lane i64 multiply (low 64 bits) ===
-+    case 457: {
-+      for (int i = 0; i < 2; i++) {
-+        uint64_t av = 0, bv = 0;
-+        for (int j = 0; j < 8; j++) {
-+          av |= ((uint64_t)a[i * 8 + j]) << (j * 8);
-+          bv |= ((uint64_t)b[i * 8 + j]) << (j * 8);
-+        }
-+        uint64_t prod = av * bv;  // low 64 bits, modulo wrap
-+        for (int j = 0; j < 8; j++) {
-+          r[i * 8 + j] = (uint8_t)(prod >> (j * 8));
-+        }
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === vmule/vmulo* (multiply even/odd lanes, widening) ===
-+    //
-+    // All XO values below were verified by disassembling the
-+    // PPC_vmule*/PPC_vmulo* constants from Assembler-ppc64.h with
-+    // `as -mppc64 -mlittle` + `objdump -Mpower9 -d`. The previous
-+    // version had all 12 XO labels swapped with each other's semantic
-+    // pair (so the JIT's vmulesb was decoded as vmulosb and vice
-+    // versa), causing i8x16→i16x8 extmul to produce wrong halfwords.
-+    //
-+    //   PPC_vmuloub = 0x10000008 → XO=8     vmuloub (LE even-byte pairs)
-+    //   PPC_vmulouh = 0x10000048 → XO=72    vmulouh
-+    //   PPC_vmulouw = 0x10000088 → XO=136   vmulouw
-+    //   PPC_vmulosb = 0x10000108 → XO=264   vmulosb
-+    //   PPC_vmulosh = 0x10000148 → XO=328   vmulosh
-+    //   PPC_vmulosw = 0x10000188 → XO=392   vmulosw
-+    //   PPC_vmuleub = 0x10000208 → XO=520   vmuleub (LE odd-byte pairs)
-+    //   PPC_vmuleuh = 0x10000248 → XO=584   vmuleuh
-+    //   PPC_vmuleuw = 0x10000288 → XO=648   vmuleuw
-+    //   PPC_vmulesb = 0x10000308 → XO=776   vmulesb
-+    //   PPC_vmulesh = 0x10000348 → XO=840   vmulesh
-+    //   PPC_vmulesw = 0x10000388 → XO=904   vmulesw
-+    //
-+    // Lane indexing on LE storage: "BE-even byte i" is stored at LE
-+    // byte index (15 - 2i); since our LANE_S8 uses LE byte index, the
-+    // "BE-even" = "LE-odd" mapping gives `2*i + 1` for vmule, `2*i`
-+    // for vmulo. The JIT's extmul helpers emit `vmulesb + vmulosb +
-+    // vmrglh` to pack both halves; getting the semantics swapped here
-+    // produces the right result register but with the halves in the
-+    // wrong merge order, breaking extmul.
-+    case 776: { // vmulesb: signed BE-even byte → halfword (8 results)
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i,
-+                     (int16_t)LANE_S8(a, 2 * i + 1) *
-+                     (int16_t)LANE_S8(b, 2 * i + 1));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 520: { // vmuleub: unsigned BE-even byte → halfword
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i,
-+                     (uint16_t)LANE_U8(a, 2 * i + 1) *
-+                     (uint16_t)LANE_U8(b, 2 * i + 1));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 840: { // vmulesh: signed BE-even halfword → word
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     (int32_t)LANE_S16(a, 2 * i + 1) *
-+                     (int32_t)LANE_S16(b, 2 * i + 1));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 584: { // vmuleuh: unsigned BE-even halfword → word
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     (uint32_t)LANE_U16(a, 2 * i + 1) *
-+                     (uint32_t)LANE_U16(b, 2 * i + 1));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 904: { // vmulesw: signed BE-even word → dword (POWER8)
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     (int64_t)LANE_S32(a, 2 * i + 1) *
-+                     (int64_t)LANE_S32(b, 2 * i + 1));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 648: { // vmuleuw: unsigned BE-even word → dword (POWER8)
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     (uint64_t)LANE_U32(a, 2 * i + 1) *
-+                     (uint64_t)LANE_U32(b, 2 * i + 1));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 264: { // vmulosb: signed BE-odd byte → halfword
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i,
-+                     (int16_t)LANE_S8(a, 2 * i) *
-+                     (int16_t)LANE_S8(b, 2 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 8: { // vmuloub: unsigned BE-odd byte
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i,
-+                     (uint16_t)LANE_U8(a, 2 * i) *
-+                     (uint16_t)LANE_U8(b, 2 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 328: { // vmulosh: signed BE-odd halfword → word
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     (int32_t)LANE_S16(a, 2 * i) *
-+                     (int32_t)LANE_S16(b, 2 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 72: { // vmulouh: unsigned BE-odd halfword → word
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     (uint32_t)LANE_U16(a, 2 * i) *
-+                     (uint32_t)LANE_U16(b, 2 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 392: { // vmulosw: signed BE-odd word
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     (int64_t)LANE_S32(a, 2 * i) *
-+                     (int64_t)LANE_S32(b, 2 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 136: { // vmulouw: unsigned BE-odd word
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     (uint64_t)LANE_U32(a, 2 * i) *
-+                     (uint64_t)LANE_U32(b, 2 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === Per-lane rotate left (vrl{b,h,w,d}) ===
-+    case 4:    // vrlb
-+      for (int i = 0; i < 16; i++) {
-+        uint8_t v = LANE_U8(a, i);
-+        uint32_t s = LANE_U8(b, i) & 7;
-+        SET_LANE_U8(r, i, (uint8_t)((v << s) | (v >> ((8 - s) & 7))));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 68:   // vrlh
-+      for (int i = 0; i < 8; i++) {
-+        uint16_t v = LANE_U16(a, i);
-+        uint32_t s = LANE_U16(b, i) & 15;
-+        SET_LANE_U16(r, i, (uint16_t)((v << s) | (v >> ((16 - s) & 15))));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 132:  // vrlw
-+      for (int i = 0; i < 4; i++) {
-+        uint32_t v = LANE_U32(a, i);
-+        uint32_t s = LANE_U32(b, i) & 31;
-+        SET_LANE_U32(r, i, (v << s) | (v >> ((32 - s) & 31)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 196:  // vrld
-+      for (int i = 0; i < 2; i++) {
-+        uint64_t v = LANE_U64(a, i);
-+        uint32_t s = LANE_U64(b, i) & 63;
-+        SET_LANE_U64(r, i, (v << s) | (v >> ((64 - s) & 63)));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Min / Max signed ===
-+    case 258:  // vmaxsb
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, std::max(LANE_S8(a, i), LANE_S8(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 322:  // vmaxsh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, std::max(LANE_S16(a, i), LANE_S16(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 386:  // vmaxsw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, std::max(LANE_S32(a, i), LANE_S32(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 450:  // vmaxsd
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, std::max(LANE_S64(a, i), LANE_S64(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 770:  // vminsb
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, std::min(LANE_S8(a, i), LANE_S8(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 834:  // vminsh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, std::min(LANE_S16(a, i), LANE_S16(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 898:  // vminsw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, std::min(LANE_S32(a, i), LANE_S32(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 962:  // vminsd
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, std::min(LANE_S64(a, i), LANE_S64(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Min / Max unsigned ===
-+    case 2:    // vmaxub
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, std::max(LANE_U8(a, i), LANE_U8(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 66:   // vmaxuh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, std::max(LANE_U16(a, i), LANE_U16(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 130:  // vmaxuw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, std::max(LANE_U32(a, i), LANE_U32(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 194:  // vmaxud
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, std::max(LANE_U64(a, i), LANE_U64(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 514:  // vminub
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, std::min(LANE_U8(a, i), LANE_U8(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 578:  // vminuh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, std::min(LANE_U16(a, i), LANE_U16(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 642:  // vminuw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, std::min(LANE_U32(a, i), LANE_U32(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 706:  // vminud
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, std::min(LANE_U64(a, i), LANE_U64(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Vector compare (eq, gt signed, gt unsigned, ne POWER9) ===
-+    //
-+    // All vcmp* ops set per-lane all-1s on true, all-0s on false. The
-+    // record form (Rc=1, XO MSB bit set; XO_rec = XO_base + 1024) must
-+    // additionally write CR6:
-+    //   CR6.LT = 1 iff ALL lanes are true;
-+    //   CR6.GT = 0 (always);
-+    //   CR6.EQ = 1 iff NO lane is true;
-+    //   CR6.SO = 0 (always).
-+    // `i8x16.all_true` etc. in wasm rely on CR6.EQ via `mfocrf cr6`; the
-+    // previous simulator implementation left CR6 untouched, so the
-+    // predicate was always wrong.
-+    //
-+    // Helper: count true lanes by looking at byte 0 of each lane (all
-+    // bytes within a "true" lane are 0xFF so byte 0 is a sound proxy).
-+    #define VCMP_DONE(lanes_, lane_bytes_)                                \
-+      do {                                                                \
-+        setVRBytes(vrt, r);                                                \
-+        if (xo >= 1024) {                                                  \
-+          int numTrue_ = 0;                                                \
-+          for (int i_ = 0; i_ < (lanes_); i_++) {                          \
-+            if (r[i_ * (lane_bytes_)] == 0xFF) numTrue_++;                 \
-+          }                                                                \
-+          uint8_t field_ = 0;                                              \
-+          if (numTrue_ == (lanes_)) field_ |= kCRFieldLT;                  \
-+          if (numTrue_ == 0) field_ |= kCRFieldEQ;                         \
-+          setCRField(6, field_);                                           \
-+        }                                                                  \
-+      } while (0)
-+
-+    case 6:    // vcmpequb (Rc=0)
-+    case 1030: // vcmpequb. (record, CR6 updated)
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) == LANE_U8(b, i) ? 0xFF : 0);
-+      }
-+      VCMP_DONE(16, 1); break;
-+    case 70:   // vcmpequh
-+    case 1094: // vcmpequh.
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) == LANE_U16(b, i) ? 0xFFFF : 0);
-+      }
-+      VCMP_DONE(8, 2); break;
-+    case 134:  // vcmpequw
-+    case 1158: // vcmpequw.
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     LANE_U32(a, i) == LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
-+      }
-+      VCMP_DONE(4, 4); break;
-+    case 199:  // vcmpequd
-+    case 1223: // vcmpequd.
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     LANE_U64(a, i) == LANE_U64(b, i)
-+                         ? UINT64_MAX
-+                         : 0);
-+      }
-+      VCMP_DONE(2, 8); break;
-+
-+    // === Compare greater-than signed ===
-+    case 774:  // vcmpgtsb
-+    case 1798: // vcmpgtsb.
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_S8(a, i) > LANE_S8(b, i) ? 0xFF : 0);
-+      }
-+      VCMP_DONE(16, 1); break;
-+    case 838:  // vcmpgtsh
-+    case 1862: // vcmpgtsh.
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_S16(a, i) > LANE_S16(b, i) ? 0xFFFF : 0);
-+      }
-+      VCMP_DONE(8, 2); break;
-+    case 902:  // vcmpgtsw
-+    case 1926: // vcmpgtsw.
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     LANE_S32(a, i) > LANE_S32(b, i) ? 0xFFFFFFFFu : 0);
-+      }
-+      VCMP_DONE(4, 4); break;
-+    case 967:  // vcmpgtsd
-+    case 1991: // vcmpgtsd.
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     LANE_S64(a, i) > LANE_S64(b, i) ? UINT64_MAX : 0);
-+      }
-+      VCMP_DONE(2, 8); break;
-+
-+    // === Compare greater-than unsigned ===
-+    case 518:  // vcmpgtub
-+    case 1542: // vcmpgtub.
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) > LANE_U8(b, i) ? 0xFF : 0);
-+      }
-+      VCMP_DONE(16, 1); break;
-+    case 582:  // vcmpgtuh
-+    case 1606: // vcmpgtuh.
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) > LANE_U16(b, i) ? 0xFFFF : 0);
-+      }
-+      VCMP_DONE(8, 2); break;
-+    case 646:  // vcmpgtuw
-+    case 1670: // vcmpgtuw.
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     LANE_U32(a, i) > LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
-+      }
-+      VCMP_DONE(4, 4); break;
-+    case 711:  // vcmpgtud
-+    case 1735: // vcmpgtud.
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     LANE_U64(a, i) > LANE_U64(b, i) ? UINT64_MAX : 0);
-+      }
-+      VCMP_DONE(2, 8); break;
-+
-+    // === Splat one element of VRB (selected by UIM) into all lanes ===
-+    // ISA defines UIM in BE element numbering. For LE storage, BE element i = LE element (N-1-i).
-+    case 524:  // vspltb: VRT[*] = VRB[BE-byte-UIM]; uimm from VRA field (bits 11..15)
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(b, 15 - (uimm & 0xF)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 588:  // vsplth
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(b, 7 - (uimm & 0x7)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 652:  // vspltw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, LANE_U32(b, 3 - (uimm & 0x3)));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Splat 5-bit signed immediate to all byte lanes ===
-+    case 780: {  // vspltisb VRT, SIMM5
-+      int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
-+      if (simm5 & 0x10) simm5 |= ~0x1F;
-+      uint8_t b = (uint8_t)(int8_t)simm5;
-+      memset(r, b, 16);
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === Splat 5-bit signed immediate to all halfword lanes ===
-+    case 844: {  // vspltish VRT, SIMM5
-+      // SIMM5 occupies bits 11..15 of the instruction (VRA field). It
-+      // is sign-extended to 16 bits and replicated across all 8 halfword
-+      // lanes of VRT. Range: [-16, 15].
-+      int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
-+      if (simm5 & 0x10) simm5 |= ~0x1F;  // sign-extend bit 4
-+      int16_t hw = (int16_t)simm5;
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, (uint16_t)hw);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === Splat 5-bit signed immediate to all word lanes ===
-+    case 908: {  // vspltisw VRT, SIMM5
-+      int32_t simm5 = (int32_t)((instr->instructionBits() >> 16) & 0x1F);
-+      if (simm5 & 0x10) simm5 |= ~0x1F;
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, (uint32_t)simm5);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === Merge (interleave) ===
-+    //
-+    // The ISA defines vmrgh* / vmrgl* in BE numbering; the
-+    // empirical LE storage behaviour is:
-+    //   vmrgh* VT,VA,VB: for i in 0..N/2-1,
-+    //     VT.lane_LE[2i]   = VB.lane_LE[(N/2) + i]
-+    //     VT.lane_LE[2i+1] = VA.lane_LE[(N/2) + i]
-+    //   vmrgl* VT,VA,VB: for i in 0..N/2-1,
-+    //     VT.lane_LE[2i]   = VB.lane_LE[i]
-+    //     VT.lane_LE[2i+1] = VA.lane_LE[i]
-+    // i.e. the VB operand goes to the even result positions (reversed
-+    // from what a naïve BE reading would suggest) and the "high" form
-+    // selects the upper-half of LE storage.
-+    //
-+    // Previous implementation had both the operand order swapped AND
-+    // the high/low halves swapped (consistent with each other, so
-+    // JIT-only-visible ops that round-tripped through vmrg* happened
-+    // to produce the right answer, but wasm-visible extmul exposed
-+    // the bug).
-+    case 12:   // vmrghb
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U8(r, 2 * i, LANE_U8(b, 8 + i));
-+        SET_LANE_U8(r, 2 * i + 1, LANE_U8(a, 8 + i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 76:   // vmrghh
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U16(r, 2 * i, LANE_U16(b, 4 + i));
-+        SET_LANE_U16(r, 2 * i + 1, LANE_U16(a, 4 + i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 140:  // vmrghw
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U32(r, 2 * i, LANE_U32(b, 2 + i));
-+        SET_LANE_U32(r, 2 * i + 1, LANE_U32(a, 2 + i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 268:  // vmrglb
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U8(r, 2 * i, LANE_U8(b, i));
-+        SET_LANE_U8(r, 2 * i + 1, LANE_U8(a, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 332:  // vmrglh
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U16(r, 2 * i, LANE_U16(b, i));
-+        SET_LANE_U16(r, 2 * i + 1, LANE_U16(a, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 396:  // vmrglw
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U32(r, 2 * i, LANE_U32(b, i));
-+        SET_LANE_U32(r, 2 * i + 1, LANE_U32(a, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Per-lane shift left (count from VRB, low N bits per element) ===
-+    case 260:  // vslb
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) << (LANE_U8(b, i) & 7));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 324:  // vslh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) << (LANE_U16(b, i) & 15));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 388:  // vslw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, LANE_U32(a, i) << (LANE_U32(b, i) & 31));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1476: // vsld
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, LANE_U64(a, i) << (LANE_U64(b, i) & 63));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Per-lane shift right unsigned ===
-+    case 516:  // vsrb
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) >> (LANE_U8(b, i) & 7));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 580:  // vsrh
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) >> (LANE_U16(b, i) & 15));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 644:  // vsrw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, LANE_U32(a, i) >> (LANE_U32(b, i) & 31));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1732: // vsrd
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, LANE_U64(a, i) >> (LANE_U64(b, i) & 63));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Per-lane shift right algebraic (signed) ===
-+    case 772:  // vsrab
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i,
-+                    (uint8_t)(LANE_S8(a, i) >> (LANE_U8(b, i) & 7)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 836:  // vsrah
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i,
-+                     (uint16_t)(LANE_S16(a, i) >> (LANE_U16(b, i) & 15)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 900:  // vsraw
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     (uint32_t)(LANE_S32(a, i) >> (LANE_U32(b, i) & 31)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 964:  // vsrad
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i,
-+                     (uint64_t)(LANE_S64(a, i) >> (LANE_U64(b, i) & 63)));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === POWER9 per-lane integer negate (subop in VRA field) ===
-+    // PPC_vnegw = 0x10060602 → XO=0x602=1538, VRA=6
-+    // PPC_vnegd = 0x10070602 → XO=0x602=1538, VRA=7
-+    case 1538:
-+      if (vra == 6) {  // vnegw
-+        for (int i = 0; i < 4; i++) {
-+          SET_LANE_U32(r, i, (uint32_t)(-LANE_S32(b, i)));
-+        }
-+      } else if (vra == 7) {  // vnegd
-+        for (int i = 0; i < 2; i++) {
-+          SET_LANE_U64(r, i, (uint64_t)(-LANE_S64(b, i)));
-+        }
-+      } else {
-+        MOZ_CRASH_UNSAFE_PRINTF("decodeVMX XO=1538: unknown subop %u", vra);
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === POWER10 vextract{b,h,w,d}m (XO=1602=0x642) ===
-+    // RT (GPR) gets the wasm-spec bitmask in low 16/8/4/2 bits. UIM at
-+    // bits 11..15 (= sim `vra`) selects lane width: 8=byte, 9=halfword,
-+    // 10=word, 11=doubleword.
-+    case 1602: {
-+      uint64_t result = 0;
-+      switch (vra) {
-+        case 8:  // vextractbm: 16 byte lanes
-+          for (int i = 0; i < 16; i++) {
-+            if (b[i] & 0x80) result |= (1ULL << i);
-+          }
-+          break;
-+        case 9:  // vextracthm: 8 halfword lanes; MSB lives at byte 2i+1
-+          for (int i = 0; i < 8; i++) {
-+            if (b[2 * i + 1] & 0x80) result |= (1ULL << i);
-+          }
-+          break;
-+        case 10:  // vextractwm: 4 word lanes; MSB at byte 4i+3
-+          for (int i = 0; i < 4; i++) {
-+            if (b[4 * i + 3] & 0x80) result |= (1ULL << i);
-+          }
-+          break;
-+        case 11:  // vextractdm: 2 dword lanes; MSB at byte 8i+7
-+          for (int i = 0; i < 2; i++) {
-+            if (b[8 * i + 7] & 0x80) result |= (1ULL << i);
-+          }
-+          break;
-+        default:
-+          MOZ_CRASH_UNSAFE_PRINTF("decodeVMX XO=1602: unknown UIM %u", vra);
-+      }
-+      // vrt is the GPR target (RT field at bits 6..10).
-+      setRegister(int(vrt), int64_t(result));
-+      goto vmx_done;  // Skip the trailing setVRBytes used by VR-targeting ops.
-+    }
-+
-+    // === POWER9 vinsertb (XO=781) / vinserth (XO=845) ===
-+    // Insert byte/halfword from a VR (NOT a GPR) at an immediate byte
-+    // position UIM (BE).
-+    //   vinsertb: VRT.byte[UIM]   (BE) ← VRB.byte[7] (BE)
-+    //   vinserth: VRT.byte[UIM]   (BE) ← VRB.byte[6] (BE)
-+    //             VRT.byte[UIM+1] (BE) ← VRB.byte[7] (BE)
-+    // BE byte i ↔ LE byte (15-i). So VRB.byte[6] (BE) = LE byte 9 of
-+    // VRB, VRB.byte[7] (BE) = LE byte 8. (Byte-pair order matters.)
-+    case 781:    // vinsertb
-+    case 845: {  // vinserth
-+      getVRBytes(vrt, r);  // start from current VRT
-+      if (xo == 845) {
-+        // vinserth: copy 2-byte halfword (BE bytes 6..7 of VRB).
-+        r[15 - uimm]     = b[9];  // BE byte UIM   ← VRB BE byte 6
-+        r[14 - uimm]     = b[8];  // BE byte UIM+1 ← VRB BE byte 7
-+      } else {
-+        // vinsertb: copy a single byte (BE byte 7 of VRB).
-+        r[15 - uimm]     = b[8];  // BE byte UIM   ← VRB BE byte 7
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === POWER9 vextractub (XO=525) / vextractuh (XO=589) ===
-+    // Extract one byte/halfword from VRB at immediate BE position UIM
-+    // and place it at BE byte 7 of VRT, with all other bytes of VRT
-+    // zeroed. Companion to vinsertb/h; chooses an immediate BE position
-+    // and lands the result at the low byte of VRT (= low byte of mfvsrd).
-+    //   vextractub: VRT.byte[7] (BE) ← VRB.byte[UIM] (BE), rest = 0
-+    //   vextractuh: VRT.byte[6] (BE) ← VRB.byte[UIM]   (BE)
-+    //               VRT.byte[7] (BE) ← VRB.byte[UIM+1] (BE), rest = 0
-+    case 525:    // vextractub
-+    case 589: {  // vextractuh
-+      memset(r, 0, sizeof(r));
-+      if (xo == 589) {
-+        r[9] = b[15 - uimm];  // VRT BE byte 6 ← VRB BE byte UIM
-+        r[8] = b[14 - uimm];  // VRT BE byte 7 ← VRB BE byte UIM+1
-+      } else {
-+        r[8] = b[15 - uimm];  // VRT BE byte 7 ← VRB BE byte UIM
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === POWER10 vinsbrx (XO=783) / vinshrx (XO=847) ===
-+    // Right-indexed (LE-natural) byte/halfword insert from GPR. RA's
-+    // low 4 bits supply the byte position (mod 16); for vinshrx the
-+    // position is also masked to even (& 0xE) so the halfword is
-+    // 2-byte aligned. RB's low 8 / 16 bits are inserted; other bytes
-+    // of VRT are unchanged. RA and RB are GPRs (NOT VRs) — sim's
-+    // pre-fetched `a` and `b` from getVRBytes are unused here.
-+    case 783:    // vinsbrx
-+    case 847: {  // vinshrx
-+      uint64_t ra_val = U64(getRegister(int(vra)));
-+      uint64_t rb_val = U64(getRegister(int(vrb)));
-+      getVRBytes(vrt, r);  // start from current VRT
-+      const bool isHalf = (xo == 847);
-+      const uint32_t pos = isHalf ? uint32_t(ra_val & 0xEULL)
-+                                  : uint32_t(ra_val & 0xFULL);
-+      r[pos] = (uint8_t)(rb_val & 0xFFULL);
-+      if (isHalf) {
-+        r[pos + 1] = (uint8_t)((rb_val >> 8) & 0xFFULL);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === POWER10 vinsw (XO=207) / vinsd (XO=463) ===
-+    // VRT[UIM*8:UIM*8+N-1] (BE bits) ← RB low N bits, where N = 32 or 64.
-+    // RB is a GPR (the `vrb` field at sim bits 15..11). UIM is at sim
-+    // bits 20..16 (= the `uimm` / `vra` decode). Other bytes of VRT are
-+    // unchanged, so we read VRT first then patch UIM..UIM+(N/8-1).
-+    case 207:    // vinsw
-+    case 463: {  // vinsd
-+      uint64_t rb_val = U64(getRegister(int(vrb)));
-+      getVRBytes(vrt, r);  // start from current VRT
-+      const int width = (xo == 463) ? 8 : 4;  // bytes
-+      // BE byte UIM+i of VRT = LE byte (15 - UIM - i).
-+      // For vinsd, RB.dword[0] (BE) = bits 56..63 of rb_val (host LSB end
-+      // of the GPR — recall U64() puts the canonical 64-bit value in a
-+      // host uint64_t with bit 63 = MSB).
-+      // For vinsw, source is RB[32:63] = low 32 bits of rb_val.
-+      uint64_t src = (width == 8) ? rb_val : (rb_val & 0xFFFFFFFFULL);
-+      const int srcMsbShift = (width * 8) - 8;  // 56 or 24
-+      for (int i = 0; i < width; i++) {
-+        r[15 - uimm - i] = (uint8_t)(src >> (srcMsbShift - 8 * i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === POWER8+ vbpermq (XO=1356=0x54C): per-byte bit permute ===
-+    // For each i in 0..15, take VRB BE-byte i (= sim b[15-i]); if its
-+    // high bit is set, perm[i]=0; else perm[i] = bit at BE position
-+    // (low 7 bits) of VRA. ISA says perm[0..15] go into VRT.dw[1] low
-+    // 16 bits, but on real LE silicon the bitmap is observable in dw[0]
-+    // low 16 bits — i.e., recoverable via mfvsrd. Match that observable
-+    // behaviour: write the bitmap into sim bytes[8..9] (where mfvsrd
-+    // reads dw[0] from), zero the rest.
-+    case 1356: {
-+      uint8_t perm[16];
-+      for (int k = 0; k < 16; k++) {
-+        uint8_t ctl = b[15 - k];
-+        if (ctl & 0x80) {
-+          perm[k] = 0;
-+        } else {
-+          int p = ctl & 0x7F;
-+          int le_idx = 15 - (p / 8);
-+          int bit_in_byte = 7 - (p % 8);
-+          perm[k] = (a[le_idx] >> bit_in_byte) & 1;
-+        }
-+      }
-+      uint8_t lo = 0, hi = 0;
-+      for (int k = 0; k < 8; k++) hi = (hi << 1) | perm[k];
-+      for (int k = 8; k < 16; k++) lo = (lo << 1) | perm[k];
-+      for (int i = 0; i < 16; i++) r[i] = 0;
-+      r[8] = lo;
-+      r[9] = hi;
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // VA-form ops vmladduhm (XO=34), vsel (XO=42), vperm (XO=43) are
-+    // peeled off in the pre-dispatch above (see "VA-form pre-dispatch"
-+    // comment near the top of this function), since the 11-bit XO
-+    // mask conflates VRC into the case label.
-+
-+    // === Unpack high signed (BE-numbering = LE indices 8..15) ===
-+    // vupkhsb: VRT[i] = sign_extend_to_16(VRA[i+0..7]). On LE storage with
-+    // BE-named "high" being the low-indexed bytes, vupkhsb sign-extends the
-+    // low 8 bytes of VRA into 8 halfwords. PPC64LE wasm calls these the
-+    // "high" lanes per PPC convention; the JIT compensates internally via
-+    // the vupklsb/vupkhsb swap documented in MacroAssembler-ppc64-inl.h.
-+    case 526:  // vupkhsb (high signed byte → halfword)
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, (uint16_t)(int16_t)LANE_S8(b, 8 + i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 590:  // vupkhsh (high signed halfword → word)
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, (uint32_t)(int32_t)LANE_S16(b, 4 + i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1614: // vupkhsw (high signed word → dword) POWER8+
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, (uint64_t)(int64_t)LANE_S32(b, 2 + i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 654:  // vupklsb (low signed byte → halfword) — PPC LE: takes high lanes
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, (uint16_t)(int16_t)LANE_S8(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 718:  // vupklsh (low signed halfword → word)
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i, (uint32_t)(int32_t)LANE_S16(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+    case 1742: // vupklsw (low signed word → dword)
-+      for (int i = 0; i < 2; i++) {
-+        SET_LANE_U64(r, i, (uint64_t)(int64_t)LANE_S32(b, i));
-+      }
-+      setVRBytes(vrt, r); break;
-+
-+    // === Pack (saturate or modulo) ===
-+    //
-+    // vpk* definitions are BE-specified:
-+    // VT.byte[0..7] = saturate(VA.halfword[0..7]), VT.byte[8..15] =
-+    // saturate(VB.halfword[0..7]) (BE-numbered throughout). On
-+    // PPC64LE register storage that inverts to: LE bytes 0-7 = VB's
-+    // saturated halfwords, LE bytes 8-15 = VA's.
-+    //
-+    //   vpkshus = XO 270   (s16 → u8 sat)
-+    //   vpkshss = XO 398   (s16 → s8 sat)
-+    //   vpkswus = XO 334   (s32 → u16 sat)
-+    //   vpkswss = XO 462   (s32 → s16 sat)
-+    // The sim previously had three of these four labels rotated
-+    // (270=vpkshss, 334=vpkshus, 398=vpkswus) so every i8x16/i16x8
-+    // narrow_* call silently used the wrong saturation kind or
-+    // lane width — vpkshss was completely absent.
-+    case 398: { // vpkshss (signed halfword → signed byte)
-+      for (int i = 0; i < 8; i++) {
-+        int v = LANE_S16(b, i);
-+        if (v > INT8_MAX) v = INT8_MAX;
-+        if (v < INT8_MIN) v = INT8_MIN;
-+        SET_LANE_U8(r, i, (uint8_t)(int8_t)v);
-+      }
-+      for (int i = 0; i < 8; i++) {
-+        int v = LANE_S16(a, i);
-+        if (v > INT8_MAX) v = INT8_MAX;
-+        if (v < INT8_MIN) v = INT8_MIN;
-+        SET_LANE_U8(r, 8 + i, (uint8_t)(int8_t)v);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 462: { // vpkswss (signed word → signed halfword)
-+      for (int i = 0; i < 4; i++) {
-+        int64_t v = LANE_S32(b, i);
-+        if (v > INT16_MAX) v = INT16_MAX;
-+        if (v < INT16_MIN) v = INT16_MIN;
-+        SET_LANE_U16(r, i, (uint16_t)(int16_t)v);
-+      }
-+      for (int i = 0; i < 4; i++) {
-+        int64_t v = LANE_S32(a, i);
-+        if (v > INT16_MAX) v = INT16_MAX;
-+        if (v < INT16_MIN) v = INT16_MIN;
-+        SET_LANE_U16(r, 4 + i, (uint16_t)(int16_t)v);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 270: { // vpkshus (signed halfword → unsigned byte, sat)
-+      for (int i = 0; i < 8; i++) {
-+        int v = LANE_S16(b, i);
-+        if (v > UINT8_MAX) v = UINT8_MAX;
-+        if (v < 0) v = 0;
-+        SET_LANE_U8(r, i, (uint8_t)v);
-+      }
-+      for (int i = 0; i < 8; i++) {
-+        int v = LANE_S16(a, i);
-+        if (v > UINT8_MAX) v = UINT8_MAX;
-+        if (v < 0) v = 0;
-+        SET_LANE_U8(r, 8 + i, (uint8_t)v);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+    case 334: { // vpkswus (signed word → unsigned halfword, sat)
-+      for (int i = 0; i < 4; i++) {
-+        int64_t v = LANE_S32(b, i);
-+        if (v > UINT16_MAX) v = UINT16_MAX;
-+        if (v < 0) v = 0;
-+        SET_LANE_U16(r, i, (uint16_t)v);
-+      }
-+      for (int i = 0; i < 4; i++) {
-+        int64_t v = LANE_S32(a, i);
-+        if (v > UINT16_MAX) v = UINT16_MAX;
-+        if (v < 0) v = 0;
-+        SET_LANE_U16(r, 4 + i, (uint16_t)v);
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === POWER9 compare not-equal (vcmpne{b,h,w}) — Rc=0 and Rc=1 ===
-+    case 7:    // vcmpneb
-+    case 1031: // vcmpneb.
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, LANE_U8(a, i) != LANE_U8(b, i) ? 0xFF : 0);
-+      }
-+      VCMP_DONE(16, 1); break;
-+    case 71:   // vcmpneh
-+    case 1095: // vcmpneh.
-+      for (int i = 0; i < 8; i++) {
-+        SET_LANE_U16(r, i, LANE_U16(a, i) != LANE_U16(b, i) ? 0xFFFF : 0);
-+      }
-+      VCMP_DONE(8, 2); break;
-+    case 135:  // vcmpnew
-+    case 1159: // vcmpnew.
-+      for (int i = 0; i < 4; i++) {
-+        SET_LANE_U32(r, i,
-+                     LANE_U32(a, i) != LANE_U32(b, i) ? 0xFFFFFFFFu : 0);
-+      }
-+      VCMP_DONE(4, 4); break;
-+    #undef VCMP_DONE
-+
-+    // === Population count per byte (POWER8) ===
-+    case 1795: { // vpopcntb (XO 0x703 = 1795). VRA field unused.
-+      for (int i = 0; i < 16; i++) {
-+        SET_LANE_U8(r, i, (uint8_t)__builtin_popcount(LANE_U8(b, i)));
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+    // === vsldoi: VRT = (VRA || VRB) shifted left by SH bytes (SH at bits 22..25) ===
-+    case 44: case 45: case 46: case 47: {
-+      // SH is at bits 22..25 (PPC) → LSB bits 6..9 of the instruction →
-+      // (instructionBits >> 6) & 0xF. Our XO mask already bottoms-out at
-+      // bit 0, so extract from the raw instruction.
-+      uint32_t sh = (instr->instructionBits() >> 6) & 0xF;
-+      uint8_t cat[32];
-+      memcpy(cat, a, 16);
-+      memcpy(cat + 16, b, 16);
-+      for (int i = 0; i < 16; i++) {
-+        r[i] = cat[sh + i];
-+      }
-+      setVRBytes(vrt, r); break;
-+    }
-+
-+
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF(
-+          "decodeVMX: unimplemented XO=%u (instruction 0x%08x)", xo,
-+          instr->instructionBits());
-+  }
-+
-+vmx_done:
-+  #undef LANE_U8
-+  #undef LANE_S8
-+  #undef LANE_U16
-+  #undef LANE_S16
-+  #undef LANE_U32
-+  #undef LANE_S32
-+  #undef LANE_U64
-+  #undef LANE_S64
-+  #undef SET_LANE_U8
-+  #undef SET_LANE_U16
-+  #undef SET_LANE_U32
-+  #undef SET_LANE_U64
-+  ;  // empty stmt for label
-+}
-+
-+// -----------------------------------------------------------------------------
-+// decodeVSX: Major opcode 60 (XX1-, XX2-, XX3- and XX4-form)
-+// mfvsrd, mtvsrd, mtvsrwz, mtvsrws, xscvdpsp, xscvdpspn, xscvspdp,
-+// xscvspdpn, xxbrd, xxsel, xvcv* conversions, xsmaxjdp/xsminjdp, ...
-+
-+void Simulator::decodeVSX(SimInstruction* instr) {
-+  // VSX major opcode 60 covers XX1/XX2/XX3/XX4 forms. We dispatch XX4
-+  // (xxsel) first because its XO is only 2 bits (at ISA 26-27 = sim
-+  // bits 5-4), and the XC register field at ISA 21-25 would otherwise
-+  // produce 32 different 9-bit XO values to enumerate in the switch.
-+  // Peel off any instruction with XX4 XO=3 (xxsel). No XX2/XX3 op currently
-+  // emitted by the JIT has sim bits (5,4) == 3.
-+  if (instr->bits(5, 4) == 3) {
-+    // xxsel XT,XA,XB,XC  (VA-like XX4-form).
-+    //   XT[i] = (XA[i] & ~XC[i]) | (XB[i] & XC[i])
-+    // Register fields: XA/XB/XT per-byte; XC at ISA bits 21-25 (sim
-+    // bits 10-6) with CX extension at ISA bit 28 (sim bit 3).
-+    int xa = int(instr->raValue() | (instr->bit(2) << 5));
-+    int xb = int(instr->rbValue() | (instr->bit(1) << 5));
-+    int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+    int xc = int(instr->bits(10, 6) | (instr->bit(3) << 5));
-+    uint8_t ab[16], bb[16], cb[16], result[16];
-+    getVSR128(xa, ab);
-+    getVSR128(xb, bb);
-+    getVSR128(xc, cb);
-+    for (int i = 0; i < 16; i++) {
-+      result[i] = (uint8_t)((ab[i] & ~cb[i]) | (bb[i] & cb[i]));
-+    }
-+    setVSR128(xt, result);
-+    return;
-+  }
-+
-+  // The remaining forms (XX1/XX2/XX3) share a 9-bit XO at ISA bits
-+  // 21-29 (sim bits 10-2). For XX3 this is (8-bit XO << 1) | AX; for
-+  // XX2 the full 9 bits are the XO (no AX field).
-+  uint32_t xo = instr->bits(10, 2);
-+  uint32_t rt = instr->rtValue();
-+  uint32_t rb = instr->rbValue();
-+
-+  switch (xo) {
-+    // xscvdpsp / xscvdpspn / xscvspdp / xscvspdpn / xxbrd are
-+    // XX2-form: XT/XB are each 6-bit (5-bit field + TX/BX extension at
-+    // sim bits 0/1). Post-Phase-2 the JIT emits these with Simd128
-+    // targets (encoding 32-63), which require the extension bit to
-+    // select VR-space instead of FPR-space. The previous code used
-+    // only the 5-bit field, so any VR-space target silently clobbered
-+    // FPR 0..31 and the post-splat fbits in splatX4 never reached the
-+    // vector lanes.
-+    case 265: {
-+      // xscvdpsp: double→single with sNaN quieting. The ISA says
-+      // result lands at XT[0:31] (BE word 0 = LE bytes 12..15) and
-+      // XT[32:127] is "undefined". Real POWER9 silicon actually
-+      // duplicates the result into BE word 1 as well, so the bytes
-+      // at LE 8..11 hold the same single. The JIT's
-+      // replaceLaneFloat32x4 lowering depends on this: it follows
-+      // xscvdpspn with `xxinsertw …, 12`, which reads XB.word[1]
-+      // (LE bytes 8..11). Zeroing those bytes here would silently
-+      // lose the single under sim. Mirror HW.
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16];
-+      getVSR128(xb, bb);
-+      // Source double at BE DW0 = LE bytes 8..15 of xb.
-+      uint64_t dbits = 0;
-+      for (int i = 0; i < 8; i++) dbits |= ((uint64_t)bb[8 + i]) << (i * 8);
-+      double frb;
-+      memcpy(&frb, &dbits, sizeof(frb));
-+      float result = demoteDoublePreservingNaN(frb);
-+      uint32_t fbits;
-+      memcpy(&fbits, &result, sizeof(fbits));
-+      if ((fbits & 0x7F800000u) == 0x7F800000u && (fbits & 0x007FFFFFu) != 0) {
-+        fbits |= 0x00400000u;
-+      }
-+      uint8_t out[16];
-+      memset(out, 0, 8);
-+      // BE word 1 (LE 8..11) and BE word 0 (LE 12..15) both = fbits.
-+      for (int off : {8, 12}) {
-+        out[off]     = (uint8_t)(fbits);
-+        out[off + 1] = (uint8_t)(fbits >> 8);
-+        out[off + 2] = (uint8_t)(fbits >> 16);
-+        out[off + 3] = (uint8_t)(fbits >> 24);
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 267: {
-+      // xscvdpspn: same as xscvdpsp but non-signaling. Same HW-observed
-+      // word-1 duplication (see xscvdpsp comment above).
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16];
-+      getVSR128(xb, bb);
-+      uint64_t dbits = 0;
-+      for (int i = 0; i < 8; i++) dbits |= ((uint64_t)bb[8 + i]) << (i * 8);
-+      double frb;
-+      memcpy(&frb, &dbits, sizeof(frb));
-+      float result = demoteDoublePreservingNaN(frb);
-+      uint32_t fbits;
-+      memcpy(&fbits, &result, sizeof(fbits));
-+      uint8_t out[16];
-+      memset(out, 0, 8);
-+      for (int off : {8, 12}) {
-+        out[off]     = (uint8_t)(fbits);
-+        out[off + 1] = (uint8_t)(fbits >> 8);
-+        out[off + 2] = (uint8_t)(fbits >> 16);
-+        out[off + 3] = (uint8_t)(fbits >> 24);
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 393: {
-+      // xvcvdpsp: convert two doubles to two singles, replicating each
-+      // result across its dword. BE words = [s(BE_dw0), s(BE_dw0),
-+      // s(BE_dw1), s(BE_dw1)]. SIGNALING form per ISA: sNaN inputs are
-+      // quieted (high-order fraction bit set in result).
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], out[16];
-+      getVSR128(xb, bb);
-+      uint32_t fbits[2];
-+      // BE_dw0 = LE bytes 8..15, BE_dw1 = LE bytes 0..7.
-+      for (int dw = 0; dw < 2; dw++) {
-+        int leOff = (dw == 0) ? 8 : 0;
-+        uint64_t dbits = 0;
-+        for (int i = 0; i < 8; i++) {
-+          dbits |= ((uint64_t)bb[leOff + i]) << (i * 8);
-+        }
-+        double frb;
-+        memcpy(&frb, &dbits, sizeof(frb));
-+        float result = demoteDoublePreservingNaN(frb);
-+        memcpy(&fbits[dw], &result, sizeof(uint32_t));
-+        if ((fbits[dw] & 0x7F800000u) == 0x7F800000u &&
-+            (fbits[dw] & 0x007FFFFFu) != 0) {
-+          fbits[dw] |= 0x00400000u;  // quiet sNaN result
-+        }
-+      }
-+      // LE words: [s(dw1), s(dw1), s(dw0), s(dw0)]
-+      // (LE word 0 = BE word 3 = s(dw1); LE word 3 = BE word 0 = s(dw0)).
-+      uint32_t leWords[4] = {fbits[1], fbits[1], fbits[0], fbits[0]};
-+      for (int w = 0; w < 4; w++) {
-+        out[w * 4]     = (uint8_t)leWords[w];
-+        out[w * 4 + 1] = (uint8_t)(leWords[w] >> 8);
-+        out[w * 4 + 2] = (uint8_t)(leWords[w] >> 16);
-+        out[w * 4 + 3] = (uint8_t)(leWords[w] >> 24);
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 216:    // xvcvdpsxws: double → signed word, saturating, RTZ (vector)
-+    case 200: {  // xvcvdpuxws: double → unsigned word, saturating, RTZ (vector)
-+      //   src1 := XB.dword_BE[0]; src2 := XB.dword_BE[1]
-+      //   r1 := ConvertDPtoSat(src1); r2 := ConvertDPtoSat(src2)
-+      //   XT.word_BE[0] := r1; XT.word_BE[1] := r1 (replicated)
-+      //   XT.word_BE[2] := r2; XT.word_BE[3] := r2 (replicated)
-+      // Saturation: signed clamps to [INT32_MIN, INT32_MAX] with NaN→INT32_MIN;
-+      //             unsigned clamps to [0, UINT32_MAX] with NaN→0 and neg→0.
-+      // BE_dw0 = LE bytes 8..15; BE_dw1 = LE bytes 0..7.
-+      bool isSigned = (xo == 216);
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], out[16];
-+      getVSR128(xb, bb);
-+      const int srcOffsets[2] = {8, 0};   // BE_dw0 (LE 8..15), BE_dw1 (LE 0..7)
-+      uint32_t results[2];
-+      for (int lane = 0; lane < 2; lane++) {
-+        uint64_t dbits = 0;
-+        for (int j = 0; j < 8; j++) {
-+          dbits |= ((uint64_t)bb[srcOffsets[lane] + j]) << (j * 8);
-+        }
-+        double dval;
-+        memcpy(&dval, &dbits, sizeof(dval));
-+        if (std::isnan(dval)) {
-+          results[lane] = isSigned ? 0x80000000u : 0u;
-+        } else if (isSigned) {
-+          if (dval >= 2147483647.0) {
-+            results[lane] = 0x7FFFFFFFu;
-+          } else if (dval <= -2147483648.0) {
-+            results[lane] = 0x80000000u;
-+          } else {
-+            results[lane] = (uint32_t)(int32_t)dval;  // RTZ
-+          }
-+        } else {  // unsigned
-+          if (dval <= 0.0) {
-+            results[lane] = 0u;
-+          } else if (dval >= 4294967295.0) {
-+            results[lane] = 0xFFFFFFFFu;
-+          } else {
-+            results[lane] = (uint32_t)dval;  // RTZ
-+          }
-+        }
-+      }
-+      // Replicated layout: BE words [r1, r1, r2, r2]; in LE bytes
-+      // [r2, r2, r1, r1] (LE word 0 = BE word 3 = r2, LE word 3 = BE word 0 = r1).
-+      uint32_t leWords[4] = {results[1], results[1], results[0], results[0]};
-+      for (int w = 0; w < 4; w++) {
-+        out[w * 4]     = (uint8_t)leWords[w];
-+        out[w * 4 + 1] = (uint8_t)(leWords[w] >> 8);
-+        out[w * 4 + 2] = (uint8_t)(leWords[w] >> 16);
-+        out[w * 4 + 3] = (uint8_t)(leWords[w] >> 24);
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 248:    // xvcvsxwdp: signed word → double (vector)
-+    case 232: {  // xvcvuxwdp: unsigned word → double (vector)
-+      //   src1 := XB.word_BE[0]; src2 := XB.word_BE[2]
-+      //   XT.dword_BE[0] := Convert(src1); XT.dword_BE[1] := Convert(src2)
-+      // BE word 0 = LE bytes 12..15; BE word 2 = LE bytes 4..7.
-+      // Output BE dword 0 = LE bytes 8..15; BE dword 1 = LE bytes 0..7.
-+      // No NaN handling needed (integer source).
-+      bool isSigned = (xo == 248);
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], out[16];
-+      getVSR128(xb, bb);
-+      const int srcOffsets[2] = {12, 4};
-+      const int dstOffsets[2] = {8, 0};
-+      for (int lane = 0; lane < 2; lane++) {
-+        uint32_t bits = (uint32_t)bb[srcOffsets[lane]] |
-+                        ((uint32_t)bb[srcOffsets[lane] + 1] << 8) |
-+                        ((uint32_t)bb[srcOffsets[lane] + 2] << 16) |
-+                        ((uint32_t)bb[srcOffsets[lane] + 3] << 24);
-+        double dval = isSigned ? (double)(int32_t)bits : (double)bits;
-+        uint64_t dbits;
-+        memcpy(&dbits, &dval, sizeof(dbits));
-+        for (int i = 0; i < 8; i++) {
-+          out[dstOffsets[lane] + i] = (uint8_t)(dbits >> (i * 8));
-+        }
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 457: {
-+      // xvcvspdp: convert two singles to two doubles. SIGNALING form
-+      // per ISA: sNaN inputs are quieted in the result (bit 51 set).
-+      //   src1 := XB.word_BE[0]; src2 := XB.word_BE[2]
-+      //   XT.dword_BE[0] := ConvertSPtoDP(src1)
-+      //   XT.dword_BE[1] := ConvertSPtoDP(src2)
-+      // BE word 0 = LE bytes 12..15; BE word 2 = LE bytes 4..7.
-+      // Output BE dword 0 = LE bytes 8..15; BE dword 1 = LE bytes 0..7.
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], out[16];
-+      getVSR128(xb, bb);
-+      // src1 from BE word 0 (LE 12..15), output dword at LE 8..15.
-+      // src2 from BE word 2 (LE 4..7),   output dword at LE 0..7.
-+      const int srcOffsets[2] = {12, 4};   // LE byte offsets of word_BE[0], word_BE[2]
-+      const int dstOffsets[2] = {8, 0};    // LE byte offsets of dword_BE[0], dword_BE[1]
-+      for (int lane = 0; lane < 2; lane++) {
-+        uint32_t fbits = (uint32_t)bb[srcOffsets[lane]] |
-+                         ((uint32_t)bb[srcOffsets[lane] + 1] << 8) |
-+                         ((uint32_t)bb[srcOffsets[lane] + 2] << 16) |
-+                         ((uint32_t)bb[srcOffsets[lane] + 3] << 24);
-+        float fval;
-+        memcpy(&fval, &fbits, sizeof(fval));
-+        double dval = promoteFloatPreservingNaN(fval);
-+        uint64_t dbits;
-+        memcpy(&dbits, &dval, sizeof(dbits));
-+        if ((dbits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
-+            (dbits & 0x000FFFFFFFFFFFFFULL) != 0) {
-+          dbits |= 0x0008000000000000ULL;  // quiet sNaN result
-+        }
-+        for (int i = 0; i < 8; i++) {
-+          out[dstOffsets[lane] + i] = (uint8_t)(dbits >> (i * 8));
-+        }
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 329: {
-+      // xscvspdp: single→double from BE word 0 of XB. SIGNALING form;
-+      // an sNaN input yields a qNaN result with the high-order
-+      // fraction bit (quiet bit) set.
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16];
-+      getVSR128(xb, bb);
-+      // BE word 0 = LE bytes 12..15 of xb.
-+      uint32_t fbits = (uint32_t)bb[12] |
-+                       ((uint32_t)bb[13] << 8) |
-+                       ((uint32_t)bb[14] << 16) |
-+                       ((uint32_t)bb[15] << 24);
-+      float fval;
-+      memcpy(&fval, &fbits, sizeof(fval));
-+      double dval = promoteFloatPreservingNaN(fval);
-+      uint64_t dbits;
-+      memcpy(&dbits, &dval, sizeof(dbits));
-+      // Quiet any NaN result (signaling form): set bit 51 of mantissa.
-+      if ((dbits & 0x7FF0000000000000ULL) == 0x7FF0000000000000ULL &&
-+          (dbits & 0x000FFFFFFFFFFFFFULL) != 0) {
-+        dbits |= 0x0008000000000000ULL;
-+      }
-+      uint8_t out[16];
-+      memset(out, 0, 8);
-+      for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(dbits >> (i * 8));
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 331: {
-+      // xscvspdpn: non-signaling variant of xscvspdp.
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16];
-+      getVSR128(xb, bb);
-+      uint32_t fbits = (uint32_t)bb[12] |
-+                       ((uint32_t)bb[13] << 8) |
-+                       ((uint32_t)bb[14] << 16) |
-+                       ((uint32_t)bb[15] << 24);
-+      float fval;
-+      memcpy(&fval, &fbits, sizeof(fval));
-+      double dval = promoteFloatPreservingNaN(fval);
-+      uint64_t dbits;
-+      memcpy(&dbits, &dval, sizeof(dbits));
-+      uint8_t out[16];
-+      memset(out, 0, 8);
-+      for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(dbits >> (i * 8));
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 347: {
-+      // POWER9 XX2-form ops sharing XO=347; disambiguated by the 5-bit
-+      // A immediate (sim bits 20..16):
-+      //   A=0  -> xsxexpdp  (extract biased exponent into 11 LSBs of XT.dw0)
-+      //   A=16 -> xscvhpdp  (FP16 -> FP64)
-+      //   A=17 -> xscvdphp  (FP64 -> FP16)
-+      // Half placement: the FP16 value lives at LE bytes 8..9 of
-+      // the VSR (= BE bits 48..63 of
-+      // dword[0]), with the rest of dword[0] zeroed. This matches the
-+      // lxsihzx layout already used by the JIT.
-+      uint32_t aImm = (instr->instructionBits() >> 16) & 0x1F;
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], out[16];
-+      getVSR128(xb, bb);
-+      memset(out, 0, 16);
-+      if (aImm == 17) {
-+        // xscvdphp: read FP64 from BE 0..63 of XB (LE bytes 8..15),
-+        // convert to FP16, place at LE bytes 8..9 of XT.
-+        double d;
-+        memcpy(&d, bb + 8, 8);
-+        uint16_t h = js::float16(d).toRawBits();
-+        out[8] = (uint8_t)(h & 0xFF);
-+        out[9] = (uint8_t)((h >> 8) & 0xFF);
-+      } else if (aImm == 16) {
-+        // xscvhpdp: read FP16 from LE bytes 8..9 of XB, convert to FP64,
-+        // place at LE bytes 8..15 of XT.
-+        uint16_t h = (uint16_t)bb[8] | ((uint16_t)bb[9] << 8);
-+        double d = static_cast<double>(js::float16::fromRawBits(h));
-+        memcpy(out + 8, &d, 8);
-+      } else if (aImm == 0) {
-+        // xsxexpdp: read FP64 from LE bytes 8..15 of XB, extract biased
-+        // exponent (bits 1..11 of the IEEE-754 double = bits 52..62 of
-+        // the 64-bit pattern), place into XT.dw0 with rest zeroed.
-+        uint64_t bits = 0;
-+        for (int i = 0; i < 8; i++) bits |= uint64_t(bb[8 + i]) << (i * 8);
-+        uint64_t exp = (bits >> 52) & 0x7FF;
-+        for (int i = 0; i < 8; i++) out[8 + i] = (uint8_t)(exp >> (i * 8));
-+      } else {
-+        MOZ_CRASH_UNSAFE_PRINTF(
-+            "decodeVSX XO=347 with unexpected A=%u (instr 0x%08x)",
-+            aImm, instr->instructionBits());
-+      }
-+      setVSR128(xt, out);
-+      break;
-+    }
-+    case 475: {
-+      // xxbrd: byte-reverse each doubleword.
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], out[16];
-+      getVSR128(xb, bb);
-+      for (int i = 0; i < 8; i++) out[i] = bb[7 - i];
-+      for (int i = 0; i < 8; i++) out[8 + i] = bb[15 - i];
-+      setVSR128(xt, out);
-+      break;
-+    }
-+
-+    // === XX3-form scalar: xsmaxjdp / xsminjdp (POWER9) ===
-+    //
-+    // xs{max,min}jdp XT, XA, XB. Scalar inputs at BE bits 0..63 of
-+    // XA / XB (= LE bytes 8..15); result lands at BE 0..63 of XT
-+    // (upper bits "undefined" per ISA).
-+    //
-+    // Semantics match ECMA-262 Math.{max,min} / wasm f64.{max,min}:
-+    //   - NaN: if A is NaN return A; else if B is NaN return B. sNaN
-+    //     payload preserved bit-for-bit (NOT quieted).
-+    //   - ±0 tie: signed-zero ordering. xsmaxjdp returns +0 for any
-+    //     mix of (-0, +0); xsminjdp returns -0.
-+    //   - Otherwise: standard IEEE max / min.
-+    case 288: case 289:    // xsmaxjdp  (XO8=144 → 9-bit 288/289)
-+    case 304: case 305: {  // xsminjdp  (XO8=152 → 9-bit 304/305)
-+      int xa = int(instr->raValue() | (instr->bit(2) << 5));
-+      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t ab[16], bb[16], out[16];
-+      getVSR128(xa, ab);
-+      getVSR128(xb, bb);
-+      double a, b;
-+      memcpy(&a, ab + 8, 8);
-+      memcpy(&b, bb + 8, 8);
-+      bool isMax = (xo >> 1) == 144;
-+      double r;
-+      if (std::isnan(a)) {
-+        r = a;
-+      } else if (std::isnan(b)) {
-+        r = b;
-+      } else if (a == 0.0 && b == 0.0) {
-+        // Signed-zero ordering: max picks +0, min picks -0.
-+        if (isMax) {
-+          r = std::signbit(a) ? b : a;
-+        } else {
-+          r = std::signbit(a) ? a : b;
-+        }
-+      } else {
-+        r = isMax ? std::max(a, b) : std::min(a, b);
-+      }
-+      memset(out, 0, 8);
-+      memcpy(out + 8, &r, 8);
-+      setVSR128(xt, out);
-+      break;
-+    }
-+
-+    // --- VSX XX3-form: xxpermdi ---
-+    //
-+    // xxpermdi XT, XA, XB, DM:
-+    //   XT.DW0 = XA.DW(DM[0])
-+    //   XT.DW1 = XB.DW(DM[1])
-+    // In BE, DW0 is MSB-side, DW1 is LSB-side. On PPC64LE register
-+    // storage, DW0 = LE bytes 8-15 and DW1 = LE bytes 0-7. The sim's
-+    // previous implementation used the reversed "DW0 = LE 0-7"
-+    // convention which cancelled for self-swap round-trips but
-+    // produced wrong halves when chained with ISA-correct ops
-+    // (mtvsrd, xxspltw, mfvsrd).
-+    case 20: case 21:       // xxpermdi DM=0
-+    case 84: case 85:       // xxpermdi DM=1
-+    case 148: case 149:     // xxpermdi DM=2 (= xxswapd when XA==XB)
-+    case 212: case 213: {   // xxpermdi DM=3
-+      uint8_t dm_hi = (xo >> 7) & 1;  // DM[0]
-+      uint8_t dm_lo = (xo >> 6) & 1;  // DM[1]
-+      int xa = int(instr->raValue() | (instr->bit(2) << 5));
-+      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t xa_bytes[16], xb_bytes[16], result[16];
-+      getVSR128(xa, xa_bytes);
-+      getVSR128(xb, xb_bytes);
-+      // DW0 in LE storage is bytes 8-15; DW1 is bytes 0-7.
-+      //   XT.DW0 (result[8..15]) = XA.DW(dm_hi)
-+      //   XT.DW1 (result[0..7])  = XB.DW(dm_lo)
-+      // DW(0) is at LE 8, DW(1) is at LE 0.
-+      memcpy(result + 8, xa_bytes + (dm_hi ? 0 : 8), 8);
-+      memcpy(result,     xb_bytes + (dm_lo ? 0 : 8), 8);
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    // --- VSX logical (XX3-form, primary opcode 60) ---
-+    //
-+    // Each takes two 6-bit VSR sources XA/XB and writes 6-bit VSR
-+    // destination XT. 8-bit ISA XO at bits 21-28; our
-+    // 9-bit XO extraction (bits 10:2) includes the AX bit at position 0,
-+    // so each op appears as two consecutive values (AX=0 and AX=1).
-+    //
-+    //   xxland XT,XA,XB     XO=130  (9-bit: 260, 261)  XT = XA & XB
-+    //   xxlandc XT,XA,XB    XO=138  (276, 277)         XT = XA & ~XB
-+    //   xxlor XT,XA,XB      XO=146  (292, 293)         XT = XA | XB
-+    //   xxlxor XT,XA,XB     XO=154  (308, 309)         XT = XA ^ XB
-+    //   xxlnor XT,XA,XB     XO=162  (324, 325)         XT = ~(XA | XB)
-+    //   xxlorc XT,XA,XB     XO=170  (340, 341)         XT = XA | ~XB
-+    //   xxlnand XT,XA,XB    XO=178  (356, 357)         XT = ~(XA & XB)
-+    //   xxleqv XT,XA,XB     XO=186  (372, 373)         XT = ~(XA ^ XB)
-+    //
-+    // The encoding constants in Assembler-ppc64.h match: PPC_xxlor=0xF0000490
-+    // has bits 4,7,10 set in its base (XO=146 in the 8-bit field), which
-+    // under the simulator's 9-bit extraction gives 2*146=292 (AX=0 default).
-+    case 260: case 261:  // xxland
-+    case 276: case 277:  // xxlandc
-+    case 292: case 293:  // xxlor
-+    case 308: case 309:  // xxlxor
-+    case 324: case 325:  // xxlnor
-+    case 340: case 341:  // xxlorc
-+    case 356: case 357:  // xxlnand
-+    case 372: case 373:  // xxleqv
-+    {
-+      int xa = int(instr->raValue() | (instr->bit(2) << 5));
-+      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t a_bytes[16], b_bytes[16], result[16];
-+      getVSR128(xa, a_bytes);
-+      getVSR128(xb, b_bytes);
-+      // Dispatch on the 8-bit ISA XO (ignoring AX bit at position 0).
-+      uint32_t xo8 = xo >> 1;
-+      for (int i = 0; i < 16; i++) {
-+        uint8_t a = a_bytes[i], b = b_bytes[i];
-+        switch (xo8) {
-+          case 130: result[i] = a & b;        break;  // xxland
-+          case 138: result[i] = a & ~b;       break;  // xxlandc
-+          case 146: result[i] = a | b;        break;  // xxlor
-+          case 154: result[i] = a ^ b;        break;  // xxlxor
-+          case 162: result[i] = (uint8_t)~(a | b);  break;  // xxlnor
-+          case 170: result[i] = a | (uint8_t)~b;    break;  // xxlorc
-+          case 178: result[i] = (uint8_t)~(a & b);  break;  // xxlnand
-+          case 186: result[i] = (uint8_t)~(a ^ b);  break;  // xxleqv
-+        }
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    // === XX2-form: xxspltw (splat word from VRB[UIM] to all 4 lanes) ===
-+    //
-+    // xxspltw: UIM selects one of four words in BE numbering. UIM=0
-+    // → BE word 0 (MSB side of the 128 bits). On PPC64LE register
-+    // storage that maps to LE word (3 - UIM). With the input
-+    // {0x11111111, 0x22222222, 0x33333333, 0x44444444}: UIM=0
-+    // splats 0x44444444 (= LE word 3), UIM=3 splats 0x11111111
-+    // (= LE word 0). The JIT emits xxspltw UIM=1 after mtvsrd on the
-+    // POWER8 splatX4 path — mtvsrd puts the GPR's low 32 bits in BE
-+    // word 1 (= LE word 2 on HW), so xxspltw UIM=1 picks up exactly
-+    // that word and splats it to every lane.
-+    case 164: {  // xxspltw
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint32_t uim = (instr->instructionBits() >> 16) & 0x3;
-+      uint32_t leIdx = 3 - uim;  // BE word UIM → LE word (3-UIM)
-+      uint8_t bb[16], result[16];
-+      getVSR128(xb, bb);
-+      uint32_t word = (uint32_t)bb[leIdx * 4] |
-+                      ((uint32_t)bb[leIdx * 4 + 1] << 8) |
-+                      ((uint32_t)bb[leIdx * 4 + 2] << 16) |
-+                      ((uint32_t)bb[leIdx * 4 + 3] << 24);
-+      for (int i = 0; i < 4; i++) {
-+        result[i * 4]     = (uint8_t)(word & 0xFF);
-+        result[i * 4 + 1] = (uint8_t)((word >> 8) & 0xFF);
-+        result[i * 4 + 2] = (uint8_t)((word >> 16) & 0xFF);
-+        result[i * 4 + 3] = (uint8_t)((word >> 24) & 0xFF);
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    // === XX2-form: xxextractuw (extract word at BE byte UIM, place at BE word 1) ===
-+    //
-+    // xxextractuw XT, XB, UIM:
-+    //   Bytes [4:7] of XT receive bytes [UIM:UIM+3] of XB. Bytes [0:3]
-+    //   and [8:15] of XT are set to zero.
-+    // UIM ∈ {0, 4, 8, 12} (caller responsible for alignment).
-+    // BE byte i ↔ LE byte (15-i), so the word at XB BE bytes UIM..UIM+3
-+    // sits at XB LE bytes (12-UIM)..(15-UIM), and lands at XT LE bytes
-+    // 8..11 (= XT BE word 1).
-+    case 165: {  // xxextractuw
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint32_t uim = (instr->instructionBits() >> 16) & 0xF;
-+      uint8_t bb[16], result[16];
-+      getVSR128(xb, bb);
-+      memset(result, 0, sizeof(result));
-+      // result.LE[8..11] = XB.LE[(12-UIM)..(15-UIM)] (preserves byte order).
-+      memcpy(result + 8, bb + (12 - uim), 4);
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    case 180: {
-+      // xxspltib XT, IMM8 (POWER9, ISA 3.0): splat 8-bit immediate to
-+      // all 16 bytes of XT. The encoder writes `imm8 << 11`, so IMM8
-+      // occupies LE bits 11..18; TX bit at LE bit 0 selects upper VSR.
-+      uint32_t imm8 = instr->bits(18, 11);
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      uint8_t xt_bytes[16];
-+      memset(xt_bytes, (uint8_t)imm8, 16);
-+      setVSR128(xt, xt_bytes);
-+      break;
-+    }
-+    case 181: {
-+      // xxinsertw XT, XB, UIM (POWER9, ISA 3.0): copy XB[32..63] (the
-+      // low 32 bits of XB's BE doubleword 0, which lives at LE bytes
-+      // 8-11 of XB) into XT at BE byte position UIM. UIM ∈ {0,4,8,12};
-+      // dest occupies XT LE bytes (12-UIM)..(15-UIM). Other bytes of
-+      // XT are preserved. UIM at PPC bits 11-15 = LE bits 16-20; TX/BX
-+      // at LE bits 0/1.
-+      uint32_t uim = instr->bits(20, 16);
-+      int xt = int(instr->rtValue() | (instr->bit(0) << 5));
-+      int xb = int(instr->rbValue() | (instr->bit(1) << 5));
-+      uint8_t xb_bytes[16], xt_bytes[16];
-+      getVSR128(xb, xb_bytes);
-+      getVSR128(xt, xt_bytes);
-+      memcpy(xt_bytes + (12 - uim), xb_bytes + 8, 4);
-+      setVSR128(xt, xt_bytes);
-+      break;
-+    }
-+
-+    // === XX2-form: xvabssp / xvabsdp (vector absolute value) ===
-+    case 408: case 409: case 410: case 411: {  // xvabssp + AX/BX bits
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], result[16];
-+      getVSR128(xb, bb);
-+      for (int i = 0; i < 4; i++) {
-+        uint32_t bits = (uint32_t)bb[i * 4] |
-+                        ((uint32_t)bb[i * 4 + 1] << 8) |
-+                        ((uint32_t)bb[i * 4 + 2] << 16) |
-+                        ((uint32_t)bb[i * 4 + 3] << 24);
-+        bits &= 0x7FFFFFFFu;  // clear sign bit
-+        result[i * 4]     = (uint8_t)(bits & 0xFF);
-+        result[i * 4 + 1] = (uint8_t)((bits >> 8) & 0xFF);
-+        result[i * 4 + 2] = (uint8_t)((bits >> 16) & 0xFF);
-+        result[i * 4 + 3] = (uint8_t)((bits >> 24) & 0xFF);
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+    case 472: case 473: case 474: {            // xvabsdp (475 used by xxbrd)
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], result[16];
-+      getVSR128(xb, bb);
-+      for (int i = 0; i < 2; i++) {
-+        uint64_t bits = 0;
-+        for (int k = 0; k < 8; k++) bits |= ((uint64_t)bb[i * 8 + k]) << (k * 8);
-+        bits &= 0x7FFFFFFFFFFFFFFFULL;
-+        for (int k = 0; k < 8; k++) result[i * 8 + k] = (uint8_t)((bits >> (k * 8)) & 0xFF);
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    // === XX2-form unary vector float ops (single XB operand, no AX) ===
-+    //
-+    // Encoding: opcode 60, bits 6-10=XT, 11-15 reserved, 16-20=XB,
-+    // 21-29 = 9-bit XO (full field), 30=BX, 31=TX. Extraction gives us
-+    // xo = XO9 directly (no AX bit). Every op below has a unique XO9.
-+    //
-+    //   xvsqrtsp  XO9=139  PPC_xvsqrtsp=0xF000022C
-+    //   xvsqrtdp  XO9=203  PPC_xvsqrtdp=0xF000032C
-+    //   xvnegsp   XO9=441  PPC_xvnegsp=0xF00006E4
-+    //   xvnegdp   XO9=505  PPC_xvnegdp=0xF00007E4
-+    //   xvrspip   XO9=169  PPC_xvrspip=0xF00002A4   (round +inf = ceil)
-+    //   xvrspiz   XO9=153  PPC_xvrspiz=0xF0000264   (round toward 0 = trunc)
-+    //   xvrspim   XO9=185  PPC_xvrspim=0xF00002E4   (round -inf = floor)
-+    //   xvrspic   XO9=171  PPC_xvrspic=0xF00002AC   (round per FPSCR)
-+    //   xvrdpip   XO9=233  PPC_xvrdpip=0xF00003A4
-+    //   xvrdpiz   XO9=217  PPC_xvrdpiz=0xF0000364
-+    //   xvrdpim   XO9=249  PPC_xvrdpim=0xF00003E4
-+    //   xvrdpic   XO9=235  PPC_xvrdpic=0xF00003AC
-+    //   xvcvspsxws XO9=152 PPC_xvcvspsxws=0xF0000260  (f32 → s32, sat)
-+    //   xvcvspuxws XO9=136 PPC_xvcvspuxws=0xF0000220  (f32 → u32, sat)
-+    //   xvcvsxwsp XO9=184  PPC_xvcvsxwsp=0xF00002E0   (s32 → f32)
-+    //   xvcvuxwsp XO9=168  PPC_xvcvuxwsp=0xF00002A0   (u32 → f32)
-+    case 139: case 203:     // xvsqrtsp / xvsqrtdp
-+    case 441: case 505:     // xvnegsp / xvnegdp
-+    case 169: case 233:     // xvrspip / xvrdpip (ceil)
-+    case 153: case 217:     // xvrspiz / xvrdpiz (trunc)
-+    case 185: case 249:     // xvrspim / xvrdpim (floor)
-+    case 171: case 235:     // xvrspic / xvrdpic (round-to-nearest)
-+    case 136: case 152:     // xvcvspuxws / xvcvspsxws
-+    case 168: case 184: {   // xvcvuxwsp / xvcvsxwsp
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t bb[16], result[16];
-+      getVSR128(xb, bb);
-+      bool isSp = (xo == 139 || xo == 441 || xo == 169 || xo == 153 ||
-+                   xo == 185 || xo == 171 || xo == 136 || xo == 152 ||
-+                   xo == 168 || xo == 184);
-+      auto getF32 = [](uint8_t* buf, int i) -> float {
-+        uint32_t b = (uint32_t)buf[i * 4] |
-+                     ((uint32_t)buf[i * 4 + 1] << 8) |
-+                     ((uint32_t)buf[i * 4 + 2] << 16) |
-+                     ((uint32_t)buf[i * 4 + 3] << 24);
-+        float f; memcpy(&f, &b, sizeof(f)); return f;
-+      };
-+      auto setF32 = [](uint8_t* buf, int i, float f) {
-+        uint32_t b; memcpy(&b, &f, sizeof(b));
-+        buf[i*4]=(uint8_t)b; buf[i*4+1]=(uint8_t)(b>>8);
-+        buf[i*4+2]=(uint8_t)(b>>16); buf[i*4+3]=(uint8_t)(b>>24);
-+      };
-+      auto getF64 = [](uint8_t* buf, int i) -> double {
-+        uint64_t b = 0;
-+        for (int k=0;k<8;k++) b |= ((uint64_t)buf[i*8+k])<<(k*8);
-+        double d; memcpy(&d, &b, sizeof(d)); return d;
-+      };
-+      auto setF64 = [](uint8_t* buf, int i, double d) {
-+        uint64_t b; memcpy(&b, &d, sizeof(b));
-+        for (int k=0;k<8;k++) buf[i*8+k]=(uint8_t)(b>>(k*8));
-+      };
-+      // Integer lane read/write (used by conversion ops).
-+      auto setU32 = [](uint8_t* buf, int i, uint32_t v) {
-+        buf[i*4]=(uint8_t)v; buf[i*4+1]=(uint8_t)(v>>8);
-+        buf[i*4+2]=(uint8_t)(v>>16); buf[i*4+3]=(uint8_t)(v>>24);
-+      };
-+      // Saturated float→int conversion per Power ISA v3.0B: input NaN maps
-+      // to 0; out-of-range saturates to the extreme of the destination type.
-+      auto fp2sxw = [](double f) -> uint32_t {
-+        if (std::isnan(f)) return 0;
-+        if (f >= (double)INT32_MAX) return (uint32_t)INT32_MAX;
-+        if (f <= (double)INT32_MIN) return (uint32_t)INT32_MIN;
-+        return (uint32_t)(int32_t)std::trunc(f);
-+      };
-+      auto fp2uxw = [](double f) -> uint32_t {
-+        if (std::isnan(f)) return 0;
-+        if (f >= (double)UINT32_MAX) return UINT32_MAX;
-+        if (f <= 0.0) return 0;
-+        return (uint32_t)std::trunc(f);
-+      };
-+
-+      if (isSp) {
-+        for (int i = 0; i < 4; i++) {
-+          float v = getF32(bb, i);
-+          float out = 0.0f;
-+          uint32_t iout = 0;
-+          bool isInt = false;
-+          switch (xo) {
-+            case 139: out = std::sqrt(v); break;                // xvsqrtsp
-+            case 441: out = -v; break;                          // xvnegsp
-+            case 169: out = std::ceil(v); break;                // xvrspip
-+            case 153: out = std::trunc(v); break;               // xvrspiz
-+            case 185: out = std::floor(v); break;               // xvrspim
-+            case 171: out = std::nearbyint(v); break;           // xvrspic
-+            case 152: iout = fp2sxw(v); isInt = true; break;    // xvcvspsxws
-+            case 136: iout = fp2uxw(v); isInt = true; break;    // xvcvspuxws
-+            case 184: {                                          // xvcvsxwsp
-+              uint32_t bits = (uint32_t)bb[i*4] |
-+                              ((uint32_t)bb[i*4+1]<<8) |
-+                              ((uint32_t)bb[i*4+2]<<16) |
-+                              ((uint32_t)bb[i*4+3]<<24);
-+              out = (float)(int32_t)bits;
-+              break;
-+            }
-+            case 168: {                                          // xvcvuxwsp
-+              uint32_t bits = (uint32_t)bb[i*4] |
-+                              ((uint32_t)bb[i*4+1]<<8) |
-+                              ((uint32_t)bb[i*4+2]<<16) |
-+                              ((uint32_t)bb[i*4+3]<<24);
-+              out = (float)(uint32_t)bits;
-+              break;
-+            }
-+          }
-+          if (isInt) setU32(result, i, iout);
-+          else setF32(result, i, out);
-+        }
-+      } else {
-+        for (int i = 0; i < 2; i++) {
-+          double v = getF64(bb, i);
-+          double out = 0.0;
-+          switch (xo) {
-+            case 203: out = std::sqrt(v); break;                // xvsqrtdp
-+            case 505: out = -v; break;                          // xvnegdp
-+            case 233: out = std::ceil(v); break;                // xvrdpip
-+            case 217: out = std::trunc(v); break;               // xvrdpiz
-+            case 249: out = std::floor(v); break;               // xvrdpim
-+            case 235: out = std::nearbyint(v); break;           // xvrdpic
-+          }
-+          setF64(result, i, out);
-+        }
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    // === XX3-form vector float compare (eq, gt, ge) ===
-+    // The wasm SIMD compares emit these and use the result as a bitmask.
-+    // Per Power ISA: result is all-1s for true lanes, all-0s for false.
-+    // Only the non-recording form is handled; the Rc bit (ISA bit 21) selects
-+    // the record form, which we don't model — wasm doesn't read CR6 here.
-+    // Encodings:
-+    //   0xF0000218 xvcmpeqsp (XO8=67) → XO9 = 134/135 (+AX).
-+    //   0xF0000258 xvcmpgtsp (XO8=75) → XO9 = 150/151.
-+    //   0xF0000298 xvcmpgesp (XO8=83) → XO9 = 166/167.
-+    //   0xF0000318 xvcmpeqdp (XO8=99) → XO9 = 198/199.
-+    //   0xF0000358 xvcmpgtdp (XO8=107) → XO9 = 214/215.
-+    //   0xF0000398 xvcmpgedp (XO8=115) → XO9 = 230/231.
-+    // Rc=1 record form flips ISA bit 21 (sim bit 10), yielding XO9+256
-+    // (not adjacent to the Rc=0 slot). wasm never emits the record form.
-+    case 134: case 135:    // xvcmpeqsp (XO8=67)
-+    case 198: case 199:    // xvcmpeqdp (XO8=99)
-+    case 150: case 151:    // xvcmpgtsp (XO8=75)
-+    case 214: case 215:    // xvcmpgtdp (XO8=107)
-+    case 166: case 167:    // xvcmpgesp (XO8=83)
-+    case 230: case 231: {  // xvcmpgedp (XO8=115)
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      uint32_t ra = instr->raValue();
-+      int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t ab[16], bb[16], result[16];
-+      getVSR128(xa, ab);
-+      getVSR128(xb, bb);
-+      uint32_t op8 = xo >> 1;  // canonical 8-bit XO
-+      bool isF32 = (op8 == 67 || op8 == 75 || op8 == 83);
-+      bool isEq  = (op8 == 67 || op8 == 99);
-+      bool isGt  = (op8 == 75 || op8 == 107);
-+      bool isGe  = (op8 == 83 || op8 == 115);
-+      (void)isGe;
-+      auto cmpF32 = [&](int i) -> bool {
-+        uint32_t aBits = (uint32_t)ab[i * 4] |
-+                         ((uint32_t)ab[i * 4 + 1] << 8) |
-+                         ((uint32_t)ab[i * 4 + 2] << 16) |
-+                         ((uint32_t)ab[i * 4 + 3] << 24);
-+        uint32_t bBits = (uint32_t)bb[i * 4] |
-+                         ((uint32_t)bb[i * 4 + 1] << 8) |
-+                         ((uint32_t)bb[i * 4 + 2] << 16) |
-+                         ((uint32_t)bb[i * 4 + 3] << 24);
-+        float fa, fb;
-+        memcpy(&fa, &aBits, sizeof(fa));
-+        memcpy(&fb, &bBits, sizeof(fb));
-+        if (isEq) return fa == fb;
-+        if (isGt) return fa > fb;
-+        return fa >= fb;
-+      };
-+      auto cmpF64 = [&](int i) -> bool {
-+        uint64_t aBits = 0, bBits = 0;
-+        for (int k = 0; k < 8; k++) aBits |= ((uint64_t)ab[i * 8 + k]) << (k * 8);
-+        for (int k = 0; k < 8; k++) bBits |= ((uint64_t)bb[i * 8 + k]) << (k * 8);
-+        double fa, fb;
-+        memcpy(&fa, &aBits, sizeof(fa));
-+        memcpy(&fb, &bBits, sizeof(fb));
-+        if (isEq) return fa == fb;
-+        if (isGt) return fa > fb;
-+        return fa >= fb;
-+      };
-+      if (isF32) {
-+        for (int i = 0; i < 4; i++) {
-+          uint32_t mask = cmpF32(i) ? 0xFFFFFFFFu : 0;
-+          for (int k = 0; k < 4; k++) {
-+            result[i * 4 + k] = (uint8_t)((mask >> (k * 8)) & 0xFF);
-+          }
-+        }
-+      } else {
-+        for (int i = 0; i < 2; i++) {
-+          uint64_t mask = cmpF64(i) ? UINT64_MAX : 0;
-+          for (int k = 0; k < 8; k++) {
-+            result[i * 8 + k] = (uint8_t)((mask >> (k * 8)) & 0xFF);
-+          }
-+        }
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    // === XX3-form vector float arithmetic ===
-+    // Encoding: bits 6-10=XT, 11-15=XA, 16-20=XB, 21-28=XO (8 bits), 29=AX,
-+    // 30=BX, 31=TX. We dispatched above using `bits(10, 2)` which is bits
-+    // 21-29 (9 bits) — that includes the AX register-extension bit, which
-+    // differs between XA in {0..31} and XA in {32..63}; BX and TX lie outside
-+    // the extraction. To match both AX values of an XX3 op we list two labels,
-+    // `case base | 0: case base | 1:`, where base = (8-bit XO) << 1 (XO sits in
-+    // bits 1..8 of our 9-bit extraction). Helper macro: expands to both labels.
-+    #define XX3_CASE_BASE(name) \
-+      case ((name) | 0): case ((name) | 1):
-+    case 128:  case 129:  // xvaddsp: 4 × f32 add (XO=64 → bits 1..8 = 128)
-+    case 192:  case 193:  // xvadddp
-+    case 144:  case 145:  // xvsubsp
-+    case 208:  case 209:  // xvsubdp
-+    case 160:  case 161:  // xvmulsp
-+    case 224:  case 225:  // xvmuldp
-+    case 176:  case 177:  // xvdivsp
-+    case 240:  case 241:  // xvdivdp
-+    case 384:  case 385:  // xvmaxsp
-+    case 448:  case 449:  // xvmaxdp
-+    case 400:  case 401:  // xvminsp
-+    case 464:  case 465:  // xvmindp
-+    {
-+      // Re-extract the canonical 8-bit XX3 XO.
-+      uint32_t xo3 = (xo >> 1);
-+      (void)xo3;
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      uint32_t ra = instr->raValue();
-+      int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t ab[16], bb[16], rb_bytes[16];
-+      getVSR128(xa, ab);
-+      getVSR128(xb, bb);
-+
-+      auto getF32 = [](uint8_t* buf, int i) -> float {
-+        uint32_t bits = (uint32_t)buf[i * 4] |
-+                        ((uint32_t)buf[i * 4 + 1] << 8) |
-+                        ((uint32_t)buf[i * 4 + 2] << 16) |
-+                        ((uint32_t)buf[i * 4 + 3] << 24);
-+        float f;
-+        memcpy(&f, &bits, sizeof(f));
-+        return f;
-+      };
-+      auto setF32 = [](uint8_t* buf, int i, float f) {
-+        uint32_t bits;
-+        memcpy(&bits, &f, sizeof(bits));
-+        buf[i * 4]     = (uint8_t)(bits & 0xFF);
-+        buf[i * 4 + 1] = (uint8_t)((bits >> 8) & 0xFF);
-+        buf[i * 4 + 2] = (uint8_t)((bits >> 16) & 0xFF);
-+        buf[i * 4 + 3] = (uint8_t)((bits >> 24) & 0xFF);
-+      };
-+      auto getF64 = [](uint8_t* buf, int i) -> double {
-+        uint64_t bits = 0;
-+        for (int k = 0; k < 8; k++) bits |= ((uint64_t)buf[i * 8 + k]) << (k * 8);
-+        double d;
-+        memcpy(&d, &bits, sizeof(d));
-+        return d;
-+      };
-+      auto setF64 = [](uint8_t* buf, int i, double d) {
-+        uint64_t bits;
-+        memcpy(&bits, &d, sizeof(bits));
-+        for (int k = 0; k < 8; k++) buf[i * 8 + k] = (uint8_t)((bits >> (k * 8)) & 0xFF);
-+      };
-+
-+      // Dispatch on the canonical 8-bit XX3 XO (bits 21..28 PPC = xo>>1).
-+      switch (xo3) {
-+        case 64:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) + getF32(bb, i)); break;  // xvaddsp
-+        case 96:  for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) + getF64(bb, i)); break;  // xvadddp
-+        case 72:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) - getF32(bb, i)); break;  // xvsubsp
-+        case 104: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) - getF64(bb, i)); break;  // xvsubdp
-+        case 80:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) * getF32(bb, i)); break;  // xvmulsp
-+        case 112: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) * getF64(bb, i)); break;  // xvmuldp
-+        case 88:  for (int i = 0; i < 4; i++) setF32(rb_bytes, i, getF32(ab, i) / getF32(bb, i)); break;  // xvdivsp
-+        case 120: for (int i = 0; i < 2; i++) setF64(rb_bytes, i, getF64(ab, i) / getF64(bb, i)); break;  // xvdivdp
-+        // xvmin{sp,dp} / xvmax{sp,dp}:
-+        //   If both operands are NaN, result is the NaN from XA.
-+        //   If exactly one operand is NaN, result is the NON-NaN operand.
-+        //   For 0 / -0, treat -0 < +0 (signed-zero ordering): xvminsp(+0,-0)
-+        //   = -0, xvmaxsp(+0,-0) = +0, in either operand order.
-+        //   Otherwise, result is IEEE min/max(a, b).
-+        // This differs from IEEE 754 (which propagates NaN) and is
-+        // relied upon by wasm relaxed_min/max (bug1946618.js) and by
-+        // wasm f32x4.min(0,-0) returning -0 (simd_f32x4.wast.js).
-+        #define XV_MAX(T, a, b) [](T a_, T b_) -> T {                          \
-+          bool an = std::isnan(a_), bn = std::isnan(b_);                       \
-+          if (an && bn) return a_;                                              \
-+          if (an) return b_;                                                    \
-+          if (bn) return a_;                                                    \
-+          if (a_ == 0.0 && b_ == 0.0) {                                         \
-+            /* -0 is smaller than +0; max picks +0. */                          \
-+            return std::signbit(a_) ? b_ : a_;                                  \
-+          }                                                                     \
-+          return std::max(a_, b_);                                              \
-+        }(a, b)
-+        #define XV_MIN(T, a, b) [](T a_, T b_) -> T {                          \
-+          bool an = std::isnan(a_), bn = std::isnan(b_);                       \
-+          if (an && bn) return a_;                                              \
-+          if (an) return b_;                                                    \
-+          if (bn) return a_;                                                    \
-+          if (a_ == 0.0 && b_ == 0.0) {                                         \
-+            /* -0 is smaller than +0; min picks -0. */                          \
-+            return std::signbit(a_) ? a_ : b_;                                  \
-+          }                                                                     \
-+          return std::min(a_, b_);                                              \
-+        }(a, b)
-+        case 192: for (int i = 0; i < 4; i++) {  // xvmaxsp
-+          float a = getF32(ab, i), b = getF32(bb, i);
-+          setF32(rb_bytes, i, XV_MAX(float, a, b));
-+        } break;
-+        case 224: for (int i = 0; i < 2; i++) {  // xvmaxdp
-+          double a = getF64(ab, i), b = getF64(bb, i);
-+          setF64(rb_bytes, i, XV_MAX(double, a, b));
-+        } break;
-+        case 200: for (int i = 0; i < 4; i++) {  // xvminsp
-+          float a = getF32(ab, i), b = getF32(bb, i);
-+          setF32(rb_bytes, i, XV_MIN(float, a, b));
-+        } break;
-+        case 232: for (int i = 0; i < 2; i++) {  // xvmindp
-+          double a = getF64(ab, i), b = getF64(bb, i);
-+          setF64(rb_bytes, i, XV_MIN(double, a, b));
-+        } break;
-+        #undef XV_MAX
-+        #undef XV_MIN
-+        default:
-+          MOZ_CRASH_UNSAFE_PRINTF(
-+              "xv float dispatch missing 8-bit XO=%u (instr 0x%08x)",
-+              xo3, instr->instructionBits());
-+      }
-+      setVSR128(xt, rb_bytes);
-+      break;
-+    }
-+
-+    // === XX3-form fused multiply-add (3-source: XT is also input) ===
-+    //
-+    //   xvmaddasp XT,XA,XB:  XT = (XA * XB) + XT       (fused madd)
-+    //   xvmaddadp XT,XA,XB:  same for f64
-+    //   xvnmsubasp XT,XA,XB: XT = -((XA * XB) - XT) = XT - (XA * XB)
-+    //   xvnmsubadp XT,XA,XB: same for f64
-+    //
-+    // Encodings (each +AX): XO8 → XO9 pairs
-+    //   xvmaddasp  PPC_xvmaddasp=0xF0000208   XO8=65  → XO9 130/131
-+    //   xvmaddadp  PPC_xvmaddadp=0xF0000308   XO8=97  → XO9 194/195
-+    //   xvnmsubasp PPC_xvnmsubasp=0xF0000688  XO8=209 → XO9 418/419
-+    //   xvnmsubadp PPC_xvnmsubadp=0xF0000788  XO8=241 → XO9 482/483
-+    // std::fma gives IEEE-correct single-rounding behaviour matching the
-+    // Power ISA definition of these fused forms.
-+    case 130: case 131:      // xvmaddasp
-+    case 194: case 195:      // xvmaddadp
-+    case 418: case 419:      // xvnmsubasp
-+    case 482: case 483: {    // xvnmsubadp
-+      int xt = int(rt | (instr->bit(0) << 5));
-+      uint32_t ra = instr->raValue();
-+      int xa = int(ra | ((instr->instructionBits() >> 2) & 1) << 5);
-+      int xb = int(rb | ((instr->instructionBits() >> 1) & 1) << 5);
-+      uint8_t ab[16], bb[16], tb[16];
-+      getVSR128(xa, ab);
-+      getVSR128(xb, bb);
-+      getVSR128(xt, tb);  // XT is also an input (accumulator).
-+      bool isSp = (xo == 130 || xo == 131 || xo == 418 || xo == 419);
-+      bool isNmsub = (xo == 418 || xo == 419 || xo == 482 || xo == 483);
-+      auto rdF32 = [](uint8_t* buf, int i) -> float {
-+        uint32_t b = (uint32_t)buf[i * 4] |
-+                     ((uint32_t)buf[i * 4 + 1] << 8) |
-+                     ((uint32_t)buf[i * 4 + 2] << 16) |
-+                     ((uint32_t)buf[i * 4 + 3] << 24);
-+        float f; memcpy(&f, &b, sizeof(f)); return f;
-+      };
-+      auto wrF32 = [](uint8_t* buf, int i, float f) {
-+        uint32_t b; memcpy(&b, &f, sizeof(b));
-+        buf[i*4]=(uint8_t)b; buf[i*4+1]=(uint8_t)(b>>8);
-+        buf[i*4+2]=(uint8_t)(b>>16); buf[i*4+3]=(uint8_t)(b>>24);
-+      };
-+      auto rdF64 = [](uint8_t* buf, int i) -> double {
-+        uint64_t b = 0;
-+        for (int k=0;k<8;k++) b |= ((uint64_t)buf[i*8+k])<<(k*8);
-+        double d; memcpy(&d, &b, sizeof(d)); return d;
-+      };
-+      auto wrF64 = [](uint8_t* buf, int i, double d) {
-+        uint64_t b; memcpy(&b, &d, sizeof(b));
-+        for (int k=0;k<8;k++) buf[i*8+k]=(uint8_t)(b>>(k*8));
-+      };
-+      uint8_t result[16];
-+      if (isSp) {
-+        for (int i = 0; i < 4; i++) {
-+          float a = rdF32(ab, i), b = rdF32(bb, i), t = rdF32(tb, i);
-+          // madd:  t + a*b ;  nmsub: -(a*b - t) = t - a*b = std::fma(a,b,-t) negated.
-+          float out = isNmsub ? -std::fma(a, b, -t)
-+                              :  std::fma(a, b, t);
-+          wrF32(result, i, out);
-+        }
-+      } else {
-+        for (int i = 0; i < 2; i++) {
-+          double a = rdF64(ab, i), b = rdF64(bb, i), t = rdF64(tb, i);
-+          double out = isNmsub ? -std::fma(a, b, -t)
-+                               :  std::fma(a, b, t);
-+          wrF64(result, i, out);
-+        }
-+      }
-+      setVSR128(xt, result);
-+      break;
-+    }
-+
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF(
-+          "decodeVSX: unimplemented XO=%u (instruction 0x%08x)", xo,
-+          instr->instructionBits());
-+  }
-+}
-+
-+// =============================================================================
-+// Power ISA v3.1 prefixed instructions (POWER10).
-+// =============================================================================
-+//
-+// A prefixed instruction is 8 bytes: a 4-byte prefix word (primary opcode 1)
-+// followed by a 4-byte suffix word. Prefix and suffix must lie in the same
-+// 64-byte aligned block — the JIT must guarantee this when emitting; the sim
-+// asserts.
-+//
-+// Prefix word layout (BE bit numbering):
-+//   [0..5]   primary opcode = 1
-+//   [6..7]   Type (00 = 8LS, 10 = MLS — only forms we implement)
-+//   [8..10]  reserved (must be 0)
-+//   [11]     R (1 = PC-relative; RA must be 0)
-+//   [12..13] reserved (must be 0)
-+//   [14..31] d0 (high 18 bits of the 34-bit signed immediate)
-+//
-+// Suffix word (MLS/8LS form, GPR-target instructions like paddi/pld):
-+//   [0..5]   suffix primary opcode (selects the actual instruction)
-+//   [6..10]  RT (or RS for stores)
-+//   [11..15] RA
-+//   [16..31] d1 (low 16 bits of immediate)
-+//
-+// Suffix word (8LS plxv quirk): the suffix opcode field is only 5 bits
-+// wide and bit [5] holds TX, the high bit of the 6-bit XT VSR number:
-+//   [0..4]   plxv suffix opcode = 11001 (= 25)
-+//   [5]      TX
-+//   [6..10]  T
-+//   [11..15] RA
-+//   [16..31] d1
-+// Combined: XT = (TX << 5) | T. (Equivalent: full 6-bit field at [0..5]
-+// is 0b11001(TX) — values 50 or 51 in our LE bits 31..26.)
-+//
-+// Combined immediate: SI = sign_extend((d0 << 16) | d1, 34).
-+// EA when R=1: address-of-prefix + SI. (RA must be 0.)
-+// EA when R=0: (RA == 0 ? 0 : GPR[RA]) + SI.
-+//
-+// Suffix opcodes implemented here (load / matching store):
-+//   MLS (Type 2) / suffix=14  paddi
-+//   MLS (Type 2) / suffix=48  plfs (FP single, widens to double) / 52 pstfs
-+//   MLS (Type 2) / suffix=50  plfd (FP double) / suffix=54  pstfd
-+//   8LS (Type 0) / suffix=57  pld / suffix=61  pstd
-+//   8LS (Type 0) / 5-bit suffix=25 plxv, 27 pstxv; bit 26 = TX (SX for pstxv)
-+//
-+// Verification recipe when adding more: assemble with `gcc -mcpu=power10
-+// -c` (or clang) and compare the emitted bytes against the encoder; encode
-+// in a small inline-asm program and step through under this sim.
-+
-+void Simulator::decodePrefixed(SimInstruction* prefix) {
-+  // Prefix and suffix must reside in the same 64-byte block.
-+  uint64_t prefixAddr = reinterpret_cast<uint64_t>(prefix);
-+  MOZ_ASSERT((prefixAddr & 63) <= 56,
-+             "POWER10 prefixed instruction crosses 64-byte boundary");
-+
-+  SimInstruction* suffix = reinterpret_cast<SimInstruction*>(
-+      reinterpret_cast<uint8_t*>(prefix) + SimInstruction::kInstrSize);
-+
-+  uint32_t type = prefix->bits(25, 24);
-+  uint32_t R = prefix->bit(20);
-+  uint32_t d0 = prefix->bits(17, 0);  // 18 bits
-+  uint32_t suffixOp6 = suffix->bits(31, 26);  // 6-bit form (paddi, pld)
-+  uint32_t suffixOp5 = suffix->bits(31, 27);  // 5-bit form (plxv)
-+  uint32_t plxvTX = suffix->bit(26);  // TX for plxv, SX for pstxv
-+  uint32_t rt = suffix->rtValue();
-+  uint32_t ra = suffix->raValue();
-+  uint32_t d1 = suffix->uimm16Value();
-+
-+  // Reassemble 34-bit signed displacement.
-+  int64_t imm34 = (static_cast<int64_t>(d0) << 16) | d1;
-+  imm34 = (imm34 << 30) >> 30;  // sign-extend from bit 33
-+
-+  // R=1 forms require RA=0 per the ISA.
-+  MOZ_ASSERT(!R || ra == 0,
-+             "POWER10 prefixed R=1 form requires RA=0");
-+
-+  // Type 2 = MLS, Type 0 = 8LS. Other types are reserved here.
-+  if (type == 2 && suffixOp6 == 14) {
-+    // paddi RT, RA, SI, R (MLS)
-+    int64_t base = R ? static_cast<int64_t>(prefixAddr)
-+                     : (ra == 0 ? 0 : getRegister(ra));
-+    setRegister(rt, base + imm34);
-+  } else if (type == 0 && suffixOp6 == 57) {
-+    // pld RT, D(RA), R (8LS)
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 8)) {
-+      setRegister(rt, readDW(ea, prefix));
-+    }
-+  } else if (type == 2 && suffixOp6 == 50) {
-+    // plfd FRT, D(RA), R (MLS) — load 8-byte double into FPR.
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 8)) {
-+      setFpuRegisterDouble(rt, readD(ea, prefix));
-+    }
-+  } else if (type == 2 && suffixOp6 == 48) {
-+    // plfs FRT, D(RA), R (MLS) — memcpy the 4-byte single, widen NaN-safely.
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 4)) {
-+      float val; memcpy(&val, reinterpret_cast<const void*>(ea), sizeof(val));
-+      setFpuRegisterDouble(rt, promoteFloatPreservingNaN(val));
-+    }
-+  } else if (type == 0 && suffixOp5 == 25) {
-+    // plxv XT, D(RA), R (8LS) — XT = (TX << 5) | T, TX at suffix bit 26.
-+    int xt = static_cast<int>(rt | (plxvTX << 5));
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 16)) {
-+      uint8_t buf[16];
-+      memcpy(buf, reinterpret_cast<const void*>(ea), 16);
-+      setVSR128(xt, buf);
-+    }
-+  } else if (type == 0 && suffixOp6 == 61) {
-+    // pstd RS, D(RA), R (8LS) — store doubleword.
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 8)) {
-+      writeDW(ea, getRegister(rt), prefix);
-+    }
-+  } else if (type == 2 && suffixOp6 == 54) {
-+    // pstfd FRS, D(RA), R (MLS) — store double.
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 8)) {
-+      writeD(ea, getFpuRegisterDouble(rt), prefix);
-+    }
-+  } else if (type == 2 && suffixOp6 == 52) {
-+    // pstfs FRS, D(RA), R (MLS) — narrow the FPR double, memcpy the single.
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 4)) {
-+      float fval = demoteDoublePreservingNaN(getFpuRegisterDouble(rt));
-+      memcpy(reinterpret_cast<void*>(ea), &fval, sizeof(fval));
-+    }
-+  } else if (type == 0 && suffixOp5 == 27) {
-+    // pstxv XS, D(RA), R (8LS) — XS = (SX << 5) | S, SX at suffix bit 26.
-+    int xs = static_cast<int>(rt | (plxvTX << 5));
-+    uint64_t ea = R ? prefixAddr + static_cast<uint64_t>(imm34)
-+                    : (ra == 0 ? 0 : getRegister(ra)) +
-+                          static_cast<uint64_t>(imm34);
-+    if (!handleWasmSegFault(ea, 16)) {
-+      uint8_t buf[16];
-+      getVSR128(xs, buf);
-+      memcpy(reinterpret_cast<void*>(ea), buf, 16);
-+    }
-+  } else {
-+    MOZ_CRASH_UNSAFE_PRINTF(
-+        "decodePrefixed: unimplemented type=%u "
-+        "(prefix 0x%08x, suffix 0x%08x)",
-+        type, prefix->instructionBits(), suffix->instructionBits());
-+  }
-+
-+  // Advance past the full 8-byte prefixed instruction unless a handler
-+  // already redirected the PC. The caller (instructionDecode) returns
-+  // immediately after us, so its 4-byte trailing advance is skipped.
-+  if (!pc_modified_) {
-+    set_pc(static_cast<int64_t>(prefixAddr) + 2 * SimInstruction::kInstrSize);
-+  }
-+}
-+
-+// =============================================================================
-+// Top-level instruction decoder.
-+// =============================================================================
-+
-+void Simulator::instructionDecode(SimInstruction* instr) {
-+  if (!SimulatorProcess::ICacheCheckingDisableCount) {
-+    AutoLockSimulatorCache als;
-+    SimulatorProcess::checkICacheLocked(instr);
-+  }
-+  pc_modified_ = false;  // reset per instruction; checked after dispatch
-+
-+  uint32_t instrBits = instr->instructionBits();
-+
-+  // Check for kCallRedirInstr first (PPC_stop = 0x4C0002E4).
-+  if (instrBits == kCallRedirInstr) {
-+    softwareInterrupt(instr);
-+    if (!pc_modified_) {
-+      set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
-+    }
-+    return;
-+  }
-+
-+  // PPC_trap (0x7FE00008) is routed through softwareInterrupt() as well.
-+  if (instrBits == 0x7FE00008) {
-+    softwareInterrupt(instr);
-+    if (!pc_modified_) {
-+      set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
-+    }
-+    return;
-+  }
-+
-+  uint32_t opcode = instr->opcode();
-+
-+  // Power ISA v3.1 prefixed instructions: primary opcode 1 marks a
-+  // 4-byte prefix word followed by a 4-byte suffix word. decodePrefixed
-+  // advances the PC by the full 8 bytes (or leaves it modified for
-+  // PC-relative side-effects).
-+  if (opcode == 1) {
-+    decodePrefixed(instr);
-+    return;  // skip the 4-byte advance at the end of this function
-+  }
-+
-+  switch (opcode) {
-+    // D-form ALU
-+    case 3:   // twi
-+    case 7:   // mulli
-+    case 8:   // subfic
-+    case 10:  // cmpli
-+    case 11:  // cmpi
-+    case 12:  // addic
-+    case 13:  // addic.
-+    case 14:  // addi
-+    case 15:  // addis
-+    case 24:  // ori
-+    case 25:  // oris
-+    case 26:  // xori
-+    case 27:  // xoris
-+    case 28:  // andi.
-+    case 29:  // andis.
-+      decodeDFormALU(instr);
-+      break;
-+
-+    // D-form loads
-+    case 32:  // lwz
-+    case 33:  // lwzu
-+    case 34:  // lbz
-+    case 35:  // lbzu
-+    case 40:  // lhz
-+    case 41:  // lhzu
-+    case 42:  // lha
-+    case 43:  // lhau
-+    case 48:  // lfs
-+    case 49:  // lfsu
-+    case 50:  // lfd
-+    case 51:  // lfdu
-+      decodeDFormLoad(instr);
-+      break;
-+
-+    // D-form stores
-+    case 36:  // stw
-+    case 38:  // stb
-+    case 39:  // stbu
-+    case 44:  // sth
-+    case 45:  // sthu
-+    case 52:  // stfs
-+    case 53:  // stfsu
-+    case 54:  // stfd
-+    case 55:  // stfdu
-+      decodeDFormStore(instr);
-+      break;
-+
-+    // DS-form
-+    case 58:  // ld, ldu, lwa
-+    case 62:  // std, stdu
-+      decodeDSForm(instr);
-+      break;
-+
-+    // B-form conditional branch
-+    case 16:
-+      decodeBranch(instr);
-+      break;
-+
-+    // SC (system call) - unused in JIT
-+    case 17:
-+      MOZ_CRASH("Simulator: sc instruction not supported");
-+      break;
-+
-+    // I-form unconditional branch
-+    case 18:
-+      decodeBranch(instr);
-+      break;
-+
-+    // XL-form (branch to LR/CTR, CR operations)
-+    case 19:
-+      decodeBranch(instr);
-+      break;
-+
-+    // M-form / MD-form rotate/mask
-+    case 20:  // rlwimi
-+    case 21:  // rlwinm
-+    case 23:  // rlwnm
-+    case 30:  // rldicl, rldicr, rldic, rldimi, rldcl, rldcr
-+      decodeRotateMask(instr);
-+      break;
-+
-+    // VMX (AltiVec) — primary opcode 4. Vector arithmetic / compare / shift /
-+    // splat / merge / pack / unpack on VR0-VR31. The wasm SIMD lowering
-+    // emits these directly (Simd128 lives in the VR namespace).
-+    case 4:
-+      decodeVMX(instr);
-+      break;
-+
-+    // X-form / XO-form
-+    case 31:
-+      decodeXForm(instr);
-+      break;
-+
-+    // FP single (A-form)
-+    case 59:
-+      decodeFP(instr);
-+      break;
-+
-+    // VSX (primary opcode 60: XX2/XX3/XX4-form, e.g. xvmaddasp)
-+    case 60:
-+      decodeVSX(instr);
-+      break;
-+
-+    // FP double (X-form / A-form)
-+    case 63:
-+      decodeFP(instr);
-+      break;
-+
-+    default:
-+      MOZ_CRASH_UNSAFE_PRINTF(
-+          "instructionDecode: unsupported opcode %u (instruction 0x%08x)",
-+          opcode, instrBits);
-+  }
-+
-+  if (!pc_modified_) {  // no handler redirected the PC: step to the next word
-+    set_pc(reinterpret_cast<int64_t>(instr) + SimInstruction::kInstrSize);
-+  }
-+}
-+
-+// =============================================================================
-+// Single-stepping / execute loop.
-+// =============================================================================
-+
-+void Simulator::enable_single_stepping(SingleStepCallback cb, void* arg) {
-+  single_step_callback_arg_ = arg;
-+  single_step_callback_ = cb;
-+  single_stepping_ = true;  // armed before the first notification below
-+  cb(arg, this, reinterpret_cast<void*>(get_pc()));
-+}
-+
-+void Simulator::disable_single_stepping() {
-+  if (single_stepping_) {
-+    // Final notification at the current pc before the hook is torn down.
-+    single_step_callback_(single_step_callback_arg_, this, (void*)get_pc());
-+    single_step_callback_arg_ = nullptr;
-+    single_step_callback_ = nullptr;
-+    single_stepping_ = false;
-+  }
-+}
-+
-+template <bool enableStopSimAt>
-+void Simulator::execute() {
-+  // Notify the single-step hook on entry; nullptr is passed as the pc.
-+  if (single_stepping_) {
-+    if (getenv("PPC64_TRACE_SIM")) {
-+      fprintf(stderr,
-+              "[sim] enter execute pc=0x%lx lr=0x%lx fp=0x%lx sp=0x%lx\n",
-+              (long)get_pc(), (long)getLR(), (long)getRegister(fp),
-+              (long)getRegister(sp));
-+    }
-+    single_step_callback_(single_step_callback_arg_, this, nullptr);
-+  }
-+
-+  // Step until the simulated code returns to the end_sim_pc sentinel.
-+  for (int64_t cur = get_pc(); cur != end_sim_pc; cur = get_pc()) {
-+    // At the requested instruction count, enter the debugger instead.
-+    if (enableStopSimAt && (icount_ == Simulator::StopSimAt)) {
-+      ppc64Debugger dbg(this);
-+      dbg.debug();
-+      continue;
-+    }
-+    if (single_stepping_) {
-+      if (getenv("PPC64_TRACE_SIM")) {
-+        fprintf(stderr,
-+                "[sim] step icount=%llu pc=0x%lx instr=0x%08x lr=0x%lx fp=0x%lx sp=0x%lx\n",
-+                (unsigned long long)icount_, (long)cur,
-+                *(uint32_t*)cur, (long)getLR(),
-+                (long)getRegister(fp), (long)getRegister(sp));
-+      }
-+      single_step_callback_(single_step_callback_arg_, this, (void*)cur);
-+    }
-+    SimInstruction* current = reinterpret_cast<SimInstruction*>(cur);
-+    instructionDecode(current);
-+    icount_++;
-+  }
-+
-+  // Matching exit notification, again with nullptr as the pc.
-+  if (single_stepping_) {
-+    single_step_callback_(single_step_callback_arg_, this, nullptr);
-+  }
-+}
-+
-+// =============================================================================
-+// callInternal / call.
-+// =============================================================================
-+
-+void Simulator::callInternal(uint8_t* entry) {
-+  // Prepare to execute the code at entry.
-+  setRegister(pc, reinterpret_cast<int64_t>(entry));
-+  // The simulation stops when returning to this call point (LR == end_sim_pc).
-+  setLR(end_sim_pc);
-+
-+  // Remember the values of callee-saved registers (r14-r31 in ELFv2).
-+  int64_t r14_val = getRegister(r14);
-+  int64_t r15_val = getRegister(r15);
-+  int64_t r16_val = getRegister(r16);
-+  int64_t r17_val = getRegister(r17);
-+  int64_t r18_val = getRegister(r18);
-+  int64_t r19_val = getRegister(r19);
-+  int64_t r20_val = getRegister(r20);
-+  int64_t r21_val = getRegister(r21);
-+  int64_t r22_val = getRegister(r22);
-+  int64_t r23_val = getRegister(r23);
-+  int64_t r24_val = getRegister(r24);
-+  int64_t r25_val = getRegister(r25);
-+  int64_t r26_val = getRegister(r26);
-+  int64_t r27_val = getRegister(r27);
-+  int64_t r28_val = getRegister(r28);
-+  int64_t r29_val = getRegister(r29);
-+  int64_t r30_val = getRegister(r30);
-+  int64_t r31_val = getRegister(r31);
-+  int64_t sp_val = getRegister(sp);  // restored unconditionally at the end
-+
-+#ifdef DEBUG
-+  // Fill the callee-saved registers with a sentinel (icount_) so the
-+  // MOZ_ASSERTs after execute() can detect a clobber.
-+  //
-+  // This is DEBUG-only on purpose. The JS-jit-entry stub spills r14-r31 to
-+  // its frame early on, so the sentinel ends up in the stub's saved-CSR
-+  // area; a single-step-profiling sample (or any unwind through that area)
-+  // taken later then treats `icount_` as a frame pointer and crashes — see
-+  // e.g. wasm/profiling.js, ion-error-*.js, ion-lazy-tables.js,
-+  // ion-callerfp-tag.js, return-call-profiling.js,
-+  // externref-global-postbarrier.js, builtin-modules/i8vecmul.js and
-+  // asm.js/testBug1357053.js. Debug builds accept that hazard because the
-+  // assertions catching real ABI violations are the whole point.
-+  int64_t callee_saved_value = icount_;
-+  setRegister(r14, callee_saved_value);
-+  setRegister(r15, callee_saved_value);
-+  setRegister(r16, callee_saved_value);
-+  setRegister(r17, callee_saved_value);
-+  setRegister(r18, callee_saved_value);
-+  setRegister(r19, callee_saved_value);
-+  setRegister(r20, callee_saved_value);
-+  setRegister(r21, callee_saved_value);
-+  setRegister(r22, callee_saved_value);
-+  setRegister(r23, callee_saved_value);
-+  setRegister(r24, callee_saved_value);
-+  setRegister(r25, callee_saved_value);
-+  setRegister(r26, callee_saved_value);
-+  setRegister(r27, callee_saved_value);
-+  setRegister(r28, callee_saved_value);
-+  setRegister(r29, callee_saved_value);
-+  setRegister(r30, callee_saved_value);
-+  setRegister(r31, callee_saved_value);
-+#endif
-+
-+  // Start the simulation; the debugger-aware loop is used only if StopSimAt is set.
-+  if (Simulator::StopSimAt != -1) {
-+    execute<true>();
-+  } else {
-+    execute<false>();
-+  }
-+
-+#ifdef DEBUG
-+  // Check that the callee-saved registers have been preserved.
-+  MOZ_ASSERT(callee_saved_value == getRegister(r14));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r15));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r16));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r17));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r18));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r19));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r20));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r21));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r22));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r23));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r24));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r25));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r26));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r27));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r28));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r29));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r30));
-+  MOZ_ASSERT(callee_saved_value == getRegister(r31));
-+#endif
-+
-+  // Restore callee-saved registers (and sp) to their values from before the call.
-+  setRegister(r14, r14_val);
-+  setRegister(r15, r15_val);
-+  setRegister(r16, r16_val);
-+  setRegister(r17, r17_val);
-+  setRegister(r18, r18_val);
-+  setRegister(r19, r19_val);
-+  setRegister(r20, r20_val);
-+  setRegister(r21, r21_val);
-+  setRegister(r22, r22_val);
-+  setRegister(r23, r23_val);
-+  setRegister(r24, r24_val);
-+  setRegister(r25, r25_val);
-+  setRegister(r26, r26_val);
-+  setRegister(r27, r27_val);
-+  setRegister(r28, r28_val);
-+  setRegister(r29, r29_val);
-+  setRegister(r30, r30_val);
-+  setRegister(r31, r31_val);
-+  setRegister(sp, sp_val);
-+}
-+
-+int64_t Simulator::call(uint8_t* entry, int argument_count, ...) {
-+  va_list args;
-+  va_start(args, argument_count);
-+
-+  const int64_t callerSp = getRegister(sp);
-+  // Carve the outgoing argument area out of the stack below the caller.
-+  int64_t entry_stack = callerSp;
-+  if (argument_count <= kCArgSlotCount) {
-+    entry_stack -= kCArgsSlotsSize;
-+  } else {
-+    entry_stack -= argument_count * sizeof(int64_t);
-+  }
-+  entry_stack &= ~U64(ABIStackAlignment - 1);
-+
-+  // PPC64 ELFv2: the first 8 integer args travel in r3-r10; an argument
-+  // GetIntArgReg() cannot place is stored in the stack slot of its index.
-+  intptr_t* argSlots = reinterpret_cast<intptr_t*>(entry_stack);
-+  for (int idx = 0; idx < argument_count; ++idx) {
-+    js::jit::Register target;
-+    if (!GetIntArgReg(idx, &target)) {
-+      argSlots[idx] = va_arg(args, int64_t);
-+    } else {
-+      setRegister(target.code(), va_arg(args, int64_t));
-+    }
-+  }
-+  va_end(args);
-+
-+  // Run the generated code on the prepared stack.
-+  setRegister(sp, entry_stack);
-+  callInternal(entry);
-+
-+  // The callee must hand the stack pointer back exactly as it received it.
-+  MOZ_ASSERT(entry_stack == getRegister(sp));
-+  setRegister(sp, callerSp);
-+
-+  // The integer result is read from r3.
-+  return getRegister(r3);
-+}
-+
-+uintptr_t Simulator::pushAddress(uintptr_t address) {
-+  // Grow the simulated stack down by one pointer-sized slot and fill it.
-+  const int64_t slotAddr = getRegister(sp) - sizeof(uintptr_t);
-+  *reinterpret_cast<uintptr_t*>(slotAddr) = address;
-+  setRegister(sp, slotAddr);
-+  return slotAddr;
-+}
-+
-+uintptr_t Simulator::popAddress() {
-+  // Read the top pointer-sized slot, then release it from the simulated stack.
-+  const int64_t topOfStack = getRegister(sp);
-+  const uintptr_t popped = *reinterpret_cast<uintptr_t*>(topOfStack);
-+  setRegister(sp, topOfStack + sizeof(uintptr_t));
-+  return popped;
-+}
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+js::jit::Simulator* JSContext::simulator() const { return simulator_; }
-diff --git a/js/src/jit/ppc64/Simulator-ppc64.h b/js/src/jit/ppc64/Simulator-ppc64.h
-new file mode 100644
-index 000000000000..c7a3f3767d61
---- /dev/null
-+++ b/js/src/jit/ppc64/Simulator-ppc64.h
-@@ -0,0 +1,556 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#ifndef jit_ppc64_Simulator_ppc64_h
-+#define jit_ppc64_Simulator_ppc64_h
-+
-+#ifdef JS_SIMULATOR_PPC64
-+
-+#  include "mozilla/Atomics.h"
-+
-+#  include "jit/IonTypes.h"
-+#  include "js/ProfilingFrameIterator.h"
-+#  include "threading/Thread.h"
-+#  include "vm/MutexIDs.h"
-+#  include "wasm/WasmSignalHandlers.h"
-+
-+namespace js {
-+namespace jit {
-+
-+class JitActivation;
-+class Simulator;
-+class Redirection;
-+class CachePage;
-+class AutoLockSimulator;
-+
-+typedef void (*SingleStepCallback)(void* arg, Simulator* sim, void* pc);
-+
-+const intptr_t kPointerAlignment = 8;
-+const intptr_t kPointerAlignmentMask = kPointerAlignment - 1;
-+const intptr_t kDoubleAlignment = 8;
-+const intptr_t kDoubleAlignmentMask = kDoubleAlignment - 1;
-+
-+const int kNumGPRegisters = 32;
-+const int kPCRegister = 32;
-+const int kNumFPURegisters = 32;
-+const int kNumVRRegisters = 32;  // VR0-VR31 (Altivec/VMX; = VSR32-63 in VSX)
-+
-+// PPC64 Condition Register: 8 fields of 4 bits each.
-+// Each field: bit3=LT, bit2=GT, bit1=EQ, bit0=SO (in PPC big-endian numbering
-+// within a field, but stored in little-endian nibble order in our uint32_t).
-+const int kNumCRFields = 8;
-+
-+// CR field bit positions (within a 4-bit field).
-+const uint8_t kCRFieldLT = 0x8;
-+const uint8_t kCRFieldGT = 0x4;
-+const uint8_t kCRFieldEQ = 0x2;
-+const uint8_t kCRFieldSO = 0x1;
-+
-+// XER register bit positions.
-+const int kXERSOBit = 31;
-+const int kXEROVBit = 30;
-+const int kXERCABit = 29;
-+const int kXEROV32Bit = 19;
-+const int kXERCA32Bit = 18;
-+
-+// FPSCR rounding mode bits (bits 62:63, stored in low bits of our uint64_t).
-+const uint64_t kFPSCRRNMask = 0x3;
-+
-+// FPU rounding modes matching PPC64 FPSCR RN field.
-+enum FPURoundingMode {
-+  RN = 0,  // Round to Nearest (ties to even)
-+  RZ = 1,  // Round toward Zero
-+  RP = 2,  // Round toward +Infinity
-+  RM = 3,  // Round toward -Infinity
-+};
-+
-+// FPU invalid result constants (largest / smallest signed 32- and 64-bit).
-+const uint32_t kFPUInvalidResult = static_cast<uint32_t>(1u << 31) - 1;
-+const int32_t kFPUInvalidResultNegative = static_cast<int32_t>(1u << 31);
-+const uint64_t kFPU64InvalidResult =
-+    static_cast<uint64_t>(static_cast<uint64_t>(1) << 63) - 1;
-+const int64_t kFPU64InvalidResultNegative =
-+    static_cast<int64_t>(static_cast<uint64_t>(1) << 63);
-+
-+// Breakpoint/stop code ranges.
-+const uint32_t kMaxWatchpointCode = 31;
-+const uint32_t kMaxStopCode = 127;
-+const uint32_t kWasmTrapCode = 6;
-+
-+// Redirection instruction: PPC_stop (0x4C0002E4).
-+// Distinct from PPC_trap (0x7FE00008) used for wasm traps.
-+const uint32_t kCallRedirInstr = 0x4C0002E4;
-+
-+typedef uint32_t Instr;
-+class SimInstruction;
-+
-+class Simulator {
-+  friend class ppc64Debugger;
-+
-+ public:
-+  enum Register {
-+    no_reg = -1,
-+    r0 = 0,
-+    r1,
-+    r2,
-+    r3,
-+    r4,
-+    r5,
-+    r6,
-+    r7,
-+    r8,
-+    r9,
-+    r10,
-+    r11,
-+    r12,
-+    r13,
-+    r14,
-+    r15,
-+    r16,
-+    r17,
-+    r18,
-+    r19,
-+    r20,
-+    r21,
-+    r22,
-+    r23,
-+    r24,
-+    r25,
-+    r26,
-+    r27,
-+    r28,
-+    r29,
-+    r30,
-+    r31,
-+    pc,
-+    kNumSimuRegisters,
-+    // Aliases
-+    sp = r1,
-+    fp = r31,
-+  };
-+
-+  enum FPURegister {
-+    f0 = 0,
-+    f1,
-+    f2,
-+    f3,
-+    f4,
-+    f5,
-+    f6,
-+    f7,
-+    f8,
-+    f9,
-+    f10,
-+    f11,
-+    f12,
-+    f13,
-+    f14,
-+    f15,
-+    f16,
-+    f17,
-+    f18,
-+    f19,
-+    f20,
-+    f21,
-+    f22,
-+    f23,
-+    f24,
-+    f25,
-+    f26,
-+    f27,
-+    f28,
-+    f29,
-+    f30,
-+    f31,
-+    kNumFPURegisters
-+  };
-+
-+  static Simulator* Create();
-+  static void Destroy(Simulator* simulator);
-+
-+  Simulator();
-+  ~Simulator();
-+
-+  static Simulator* Current();
-+
-+  static inline uintptr_t StackLimit() {
-+    return Simulator::Current()->stackLimit();
-+  }
-+
-+  uintptr_t* addressOfStackLimit();
-+
-+  // GPR accessors.
-+  void setRegister(int reg, int64_t value);
-+  int64_t getRegister(int reg) const;
-+
-+  // FPR accessors.
-+  void setFpuRegister(int fpureg, int64_t value);
-+  void setFpuRegisterWord(int fpureg, int32_t value);
-+  void setFpuRegisterFloat(int fpureg, float value);
-+  void setFpuRegisterDouble(int fpureg, double value);
-+  int64_t getFpuRegister(int fpureg) const;
-+  int32_t getFpuRegisterWord(int fpureg) const;
-+  int32_t getFpuRegisterSignedWord(int fpureg) const;
-+  float getFpuRegisterFloat(int fpureg) const;
-+  double getFpuRegisterDouble(int fpureg) const;
-+
-+  // VR accessors (Altivec/VMX registers VR0-VR31). The bytes array is the
-+  // ground truth: bytes[0] is the most-significant-byte on PPC64 big-endian
-+  // numbering, i.e., VSR[MSB..LSB] mapped as bytes[0..15]. Callers that want
-+  // typed views (lane 0 etc.) should extract from the bytes array according
-+  // to the ISA's lane numbering for that instruction.
-+  void setVRBytes(int vreg, const uint8_t bytes[16]);
-+  void getVRBytes(int vreg, uint8_t bytes[16]) const;
-+
-+  // VSR (Vector-Scalar Register) accessors: unified 64-register namespace
-+  // where VSR 0-31 aliases FPR 0-31 (DW0 is the FPR value, DW1 is
-+  // architecturally undefined — we model it as zero on read, ignored on
-+  // write) and VSR 32-63 aliases VR 0-31. Used by VSX instructions
-+  // (xxpermdi, xxlor, xxlxor, mtvsrd, mfvsrd, ...).
-+  void getVSR128(int vsr, uint8_t bytes[16]) const;
-+  void setVSR128(int vsr, const uint8_t bytes[16]);
-+
-+  // SPR accessors.
-+  int64_t getLR() const { return LR_; }
-+  void setLR(int64_t value) { LR_ = value; }
-+  int64_t getCTR() const { return CTR_; }
-+  void setCTR(int64_t value) { CTR_ = value; }
-+  uint32_t getCR() const { return CR_; }
-+  void setCR(uint32_t value) { CR_ = value; }
-+  uint64_t getXER() const { return XER_; }
-+  void setXER(uint64_t value) { XER_ = value; }
-+  uint64_t getFPSCR() const { return FPSCR_; }
-+  void setFPSCR(uint64_t value) { FPSCR_ = value; }
-+
-+  // CR field accessors: field 0 is the most significant nibble (bits 31:28).
-+  uint8_t getCRField(int field) const {
-+    return (CR_ >> (4 * (7 - field))) & 0xF;
-+  }
-+  void setCRField(int field, uint8_t val) {
-+    uint32_t shift = 4 * (7 - field);
-+    CR_ = (CR_ & ~(0xFu << shift)) | ((val & 0xFu) << shift);
-+  }
-+
-+  // XER bit accessors.
-+  bool getXERSO() const { return (XER_ >> kXERSOBit) & 1; }
-+  void setXERSO(bool v) {
-+    XER_ = (XER_ & ~(1ull << kXERSOBit)) | ((uint64_t)v << kXERSOBit);
-+  }
-+  bool getXEROV() const { return (XER_ >> kXEROVBit) & 1; }
-+  void setXEROV(bool v) {
-+    XER_ = (XER_ & ~(1ull << kXEROVBit)) | ((uint64_t)v << kXEROVBit);
-+    // Mirror to OV32. Real POWER9 silicon sets OV32 == OV for both 32-bit
-+    // and 64-bit overflow ops: mulldo(2, 2^62) produces OV=OV32=1;
-+    // mulldo(2^30, 4) produces OV=OV32=0. The JIT's
-+    // POWER9 Overflow path is `mulldo + mcrxrx + bc Overflow`, where
-+    // mcrxrx places OV32 in the GT slot and the Overflow condition tests
-+    // GT — so OV32 must be live or no-overflow is reported even when
-+    // OV=1. Without this mirror, BigInt fast-path mul silently wraps.
-+    XER_ = (XER_ & ~(1ull << kXEROV32Bit)) | ((uint64_t)v << kXEROV32Bit);
-+    if (v) setXERSO(true);
-+  }
-+  bool getXERCA() const { return (XER_ >> kXERCABit) & 1; }
-+  void setXERCA(bool v) {
-+    XER_ = (XER_ & ~(1ull << kXERCABit)) | ((uint64_t)v << kXERCABit);
-+  }
-+
-+  // PC accessors.
-+  void set_pc(int64_t value);
-+  int64_t get_pc() const;
-+
-+  template <typename T>
-+  T get_pc_as() const {
-+    return reinterpret_cast<T>(get_pc());
-+  }
-+
-+  void enable_single_stepping(SingleStepCallback cb, void* arg);
-+  void disable_single_stepping();
-+
-+  uintptr_t stackLimit() const;
-+  bool overRecursed(uintptr_t newsp = 0) const;
-+  bool overRecursedWithExtra(uint32_t extra) const;
-+
-+  template <bool enableStopSimAt>
-+  void execute();
-+
-+  int64_t call(uint8_t* entry, int argument_count, ...);
-+
-+  uintptr_t pushAddress(uintptr_t address);
-+  uintptr_t popAddress();
-+
-+  void setLastDebuggerInput(char* input);
-+  char* lastDebuggerInput() { return lastDebuggerInput_; }
-+
-+  bool has_bad_pc() const;
-+
-+  // Update CR field 0 from a 64-bit result.
-+  void updateCR0(int64_t result) {
-+    uint8_t field = kCRFieldSO * getXERSO();
-+    if (result < 0)
-+      field |= kCRFieldLT;
-+    else if (result > 0)
-+      field |= kCRFieldGT;
-+    else
-+      field |= kCRFieldEQ;
-+    setCRField(0, field);
-+  }
-+
-+  // Update CR field 0 from a 32-bit result (sign-extended comparison).
-+  void updateCR0_32(int32_t result) {
-+    uint8_t field = kCRFieldSO * getXERSO();
-+    if (result < 0)
-+      field |= kCRFieldLT;
-+    else if (result > 0)
-+      field |= kCRFieldGT;
-+    else
-+      field |= kCRFieldEQ;
-+    setCRField(0, field);
-+  }
-+
-+  // Compare and set an arbitrary CR field.
-+  void setCRFieldCmp(int field, int64_t lhs, int64_t rhs) {
-+    uint8_t val = kCRFieldSO * getXERSO();
-+    if (lhs < rhs)
-+      val |= kCRFieldLT;
-+    else if (lhs > rhs)
-+      val |= kCRFieldGT;
-+    else
-+      val |= kCRFieldEQ;
-+    setCRField(field, val);
-+  }
-+
-+  void setCRFieldCmpU(int field, uint64_t lhs, uint64_t rhs) {
-+    uint8_t val = kCRFieldSO * getXERSO();  // unsigned twin of setCRFieldCmp
-+    if (lhs < rhs)  // operands are uint64_t, so this ordering is unsigned
-+      val |= kCRFieldLT;
-+    else if (lhs > rhs)
-+      val |= kCRFieldGT;
-+    else
-+      val |= kCRFieldEQ;
-+    setCRField(field, val);
-+  }
-+
-+ private:
-+  enum SpecialValues {
-+    // PPC64 masks the low 2 bits of branch targets, so these must be
-+    // 4-byte aligned to survive the & ~3 mask in blr/bcctr.
-+    bad_ra = -4,
-+    end_sim_pc = -8,
-+    Unpredictable = 0xbadbeaf
-+  };
-+
-+  bool init();
-+
-+  void format(SimInstruction* instr, const char* format);
-+
-+  // Memory access.
-+  inline uint8_t readBU(uint64_t addr);
-+  inline int8_t readB(uint64_t addr);
-+  inline void writeB(uint64_t addr, uint8_t value);
-+  inline void writeB(uint64_t addr, int8_t value);
-+
-+  inline uint16_t readHU(uint64_t addr, SimInstruction* instr);
-+  inline int16_t readH(uint64_t addr, SimInstruction* instr);
-+  inline void writeH(uint64_t addr, uint16_t value, SimInstruction* instr);
-+  inline void writeH(uint64_t addr, int16_t value, SimInstruction* instr);
-+
-+  inline uint32_t readWU(uint64_t addr, SimInstruction* instr);
-+  inline int32_t readW(uint64_t addr, SimInstruction* instr);
-+  inline void writeW(uint64_t addr, uint32_t value, SimInstruction* instr);
-+  inline void writeW(uint64_t addr, int32_t value, SimInstruction* instr);
-+
-+  inline int64_t readDW(uint64_t addr, SimInstruction* instr);
-+  inline void writeDW(uint64_t addr, int64_t value, SimInstruction* instr);
-+
-+  inline double readD(uint64_t addr, SimInstruction* instr);
-+  inline void writeD(uint64_t addr, double value, SimInstruction* instr);
-+
-+  inline uint8_t loadLinkedB(uint64_t addr, SimInstruction* instr);
-+  inline int storeConditionalB(uint64_t addr, uint8_t value,
-+                               SimInstruction* instr);
-+  inline uint16_t loadLinkedH(uint64_t addr, SimInstruction* instr);
-+  inline int storeConditionalH(uint64_t addr, uint16_t value,
-+                               SimInstruction* instr);
-+  inline int32_t loadLinkedW(uint64_t addr, SimInstruction* instr);
-+  inline int storeConditionalW(uint64_t addr, int32_t value,
-+                               SimInstruction* instr);
-+  inline int64_t loadLinkedD(uint64_t addr, SimInstruction* instr);
-+  inline int storeConditionalD(uint64_t addr, int64_t value,
-+                               SimInstruction* instr);
-+
-+  // Instruction decoders.
-+  void decodeDFormALU(SimInstruction* instr);
-+  void decodeDFormLoad(SimInstruction* instr);
-+  void decodeDFormStore(SimInstruction* instr);
-+  void decodeDSForm(SimInstruction* instr);
-+  void decodeXForm(SimInstruction* instr);
-+  void decodeRotateMask(SimInstruction* instr);
-+  void decodeBranch(SimInstruction* instr);
-+  void decodeFP(SimInstruction* instr);
-+  void decodeVSX(SimInstruction* instr);
-+  void decodeVMX(SimInstruction* instr);
-+  // Power ISA v3.1 prefixed instructions. `prefix` points at the
-+  // 4-byte prefix word; the suffix is read from `prefix + 4`.
-+  void decodePrefixed(SimInstruction* prefix);
-+
-+  void softwareInterrupt(SimInstruction* instr);
-+
-+  // Stop/breakpoint helpers.
-+  bool isWatchpoint(uint32_t code);
-+  void printWatchpoint(uint32_t code);
-+  void handleStop(uint32_t code, SimInstruction* instr);
-+  bool isStopInstruction(SimInstruction* instr);
-+  bool isEnabledStop(uint32_t code);
-+  void enableStop(uint32_t code);
-+  void disableStop(uint32_t code);
-+  void increaseStopCounter(uint32_t code);
-+  void printStopInfo(uint32_t code);
-+
-+  JS::ProfilingFrameIterator::RegisterState registerState();
-+
-+  bool MOZ_ALWAYS_INLINE handleWasmSegFault(uint64_t addr, unsigned numBytes) {
-+    if (MOZ_LIKELY(!js::wasm::CodeExists)) {
-+      return false;  // no wasm code exists, so the access is not handled here
-+    }
-+    uint8_t* newPC;
-+    if (!js::wasm::MemoryAccessTraps(registerState(), (uint8_t*)addr, numBytes,
-+                                     &newPC)) {
-+      return false;  // not reported as a wasm trap; caller must deal with it
-+    }
-+    LLBit_ = false;  // clear the load-linked flag before redirecting
-+    set_pc(int64_t(newPC));  // resume at the PC MemoryAccessTraps supplied
-+    return true;
-+  }
-+
-+  void instructionDecode(SimInstruction* instr);
-+
-+ public:
-+  static int64_t StopSimAt;
-+
-+  static void* RedirectNativeFunction(void* nativeFunction,
-+                                      ABIFunctionType type);
-+
-+ private:
-+  void setCallResultDouble(double result);
-+  void setCallResultFloat(float result);
-+  void setCallResult(int64_t res);
-+#  ifdef XP_DARWIN
-+  void setCallResult(intptr_t res);
-+#  endif
-+  void setCallResult(__int128 res);
-+
-+  void callInternal(uint8_t* entry);
-+
-+  // Architecture state.
-+  int64_t registers_[kNumSimuRegisters];
-+  int64_t FPUregisters_[kNumFPURegisters];
-+  // VR namespace (Altivec/VMX registers VR0-VR31 == VSR32-63). Stored as
-+  // 16 raw bytes per register to preserve exact architectural byte order
-+  // independent of host endianness. Accessors defined below; the bytes
-+  // array is the ground truth.
-+  uint8_t VRregisters_[kNumVRRegisters][16];
-+
-+  // PPC64 Special Purpose Registers.
-+  int64_t LR_;
-+  int64_t CTR_;
-+  uint32_t CR_;
-+  uint64_t XER_;
-+  uint64_t FPSCR_;
-+
-+  // Atomics.
-+  bool LLBit_;
-+  uintptr_t LLAddr_;
-+  int64_t lastLLValue_;
-+
-+  // Simulator support.
-+  char* stack_;
-+  uintptr_t stackLimit_;
-+  bool pc_modified_;
-+  int64_t icount_;
-+  int64_t break_count_;
-+
-+  char* lastDebuggerInput_;
-+
-+  SimInstruction* break_pc_;
-+  Instr break_instr_;
-+
-+  bool single_stepping_;
-+  SingleStepCallback single_step_callback_;
-+  void* single_step_callback_arg_;
-+
-+  static const uint32_t kNumOfWatchedStops = 256;
-+  static const uint32_t kStopDisabledBit = 1U << 31;
-+
-+  struct StopCountAndDesc {
-+    uint32_t count_;
-+    char* desc_;
-+  };
-+  StopCountAndDesc watchedStops_[kNumOfWatchedStops];
-+};
-+
-+// Process-wide simulator state (singleton): icache map and Redirection pointer.
-+class SimulatorProcess {
-+  friend class Redirection;
-+  friend class AutoLockSimulatorCache;
-+
-+ private:
-+  struct ICacheHasher {
-+    typedef void* Key;
-+    typedef void* Lookup;
-+    static HashNumber hash(const Lookup& l);
-+    static bool match(const Key& k, const Lookup& l);
-+  };
-+
-+ public:
-+  typedef HashMap<void*, CachePage*, ICacheHasher, SystemAllocPolicy> ICacheMap;
-+
-+  static mozilla::Atomic<size_t, mozilla::ReleaseAcquire>
-+      ICacheCheckingDisableCount;
-+  static void FlushICache(void* start, size_t size);
-+  static void checkICacheLocked(SimInstruction* instr);
-+
-+  static bool initialize() {
-+    singleton_ = js_new<SimulatorProcess>();
-+    return singleton_;  // pointer-to-bool: true iff singleton_ is non-null
-+  }
-+  static void destroy() {
-+    js_delete(singleton_);
-+    singleton_ = nullptr;  // leave no dangling pointer behind
-+  }
-+
-+  SimulatorProcess();
-+  ~SimulatorProcess();
-+
-+ private:
-+  static SimulatorProcess* singleton_;
-+
-+  Mutex cacheLock_;  // the accessors below assert it is held by the caller
-+  Redirection* redirection_;
-+  ICacheMap icache_;
-+
-+ public:
-+  static ICacheMap& icache() {
-+    singleton_->cacheLock_.assertOwnedByCurrentThread();
-+    return singleton_->icache_;
-+  }
-+
-+  static Redirection* redirection() {
-+    singleton_->cacheLock_.assertOwnedByCurrentThread();
-+    return singleton_->redirection_;
-+  }
-+
-+  static void setRedirection(js::jit::Redirection* redirection) {
-+    singleton_->cacheLock_.assertOwnedByCurrentThread();
-+    singleton_->redirection_ = redirection;
-+  }
-+};
-+
-+}  // namespace jit
-+}  // namespace js
-+
-+#endif /* JS_SIMULATOR_PPC64 */
-+
-+#endif /* jit_ppc64_Simulator_ppc64_h */
-diff --git a/js/src/jit/ppc64/Trampoline-ppc64.cpp b/js/src/jit/ppc64/Trampoline-ppc64.cpp
-new file mode 100644
-index 000000000000..515a931c86b0
---- /dev/null
-+++ b/js/src/jit/ppc64/Trampoline-ppc64.cpp
-@@ -0,0 +1,648 @@
-+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
-+ * vim: set ts=8 sts=2 et sw=2 tw=80:
-+ * This Source Code Form is subject to the terms of the Mozilla Public
-+ * License, v. 2.0. If a copy of the MPL was not distributed with this
-+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-+
-+#include "jit/Bailouts.h"
-+#include "jit/BaselineFrame.h"
-+#include "jit/CalleeToken.h"
-+#include "jit/JitFrames.h"
-+#include "jit/JitRuntime.h"
-+#include "jit/PerfSpewer.h"
-+#include "jit/ppc64/SharedICHelpers-ppc64.h"
-+#include "jit/VMFunctions.h"
-+#include "vm/JitActivation.h"
-+#include "vm/JSContext.h"
-+
-+#include "jit/MacroAssembler-inl.h"
-+
-+using namespace js;
-+using namespace js::jit;
-+
-+// Register set pushed by the invalidator and bailout thunks: all GPRs plus
-+// Float (Single+Double). Simd128 is excluded — Ion compiles JS (no v128
-+// type), so SIMD regs are never live at those entry points; including them
-+// would force the bailout frame's FPUArray to hold v128 slots Ion never writes.
-+static const LiveRegisterSet AllRegs = LiveRegisterSet(
-+    GeneralRegisterSet(Registers::AllMask),
-+    FloatRegisterSet(FloatRegisters::AllSingleMask |
-+                     FloatRegisters::AllDoubleMask));
-+
-+static_assert(sizeof(uintptr_t) == sizeof(uint64_t), "Not 64-bit clean.");
-+
-+// PPC64 ELFv2 callee-saved: GPRs r14-r31, FPRs f14-f31, VRs VR20-VR31, LR.
-+// Also holds r2 (TOC) and reg_vp (r10 / IntArgReg7), reloaded after the JIT call.
-+//
-+// Layout is alignas(16) so that after `reserveStack(sizeof(EnterJITRegs))`
-+// the SP-relative offset of every VR slot is 16-byte aligned. The save and
-+// restore code uses stvx / lvx, which ignore the low four bits of the
-+// effective address rather than faulting, so a misaligned slot would be
-+// silently redirected; we align by construction. Tail padding keeps sizeof
-+// a multiple of 16 so SP stays quadword-aligned per the ELFv2 stack rule.
-+struct alignas(16) EnterJITRegs {
-+  // VR20-VR31 first so their SP-relative offsets are 0, 16, 32, ... — all
-+  // 16-byte aligned regardless of what follows.
-+  uint8_t vr20[16];
-+  uint8_t vr21[16];
-+  uint8_t vr22[16];
-+  uint8_t vr23[16];
-+  uint8_t vr24[16];
-+  uint8_t vr25[16];
-+  uint8_t vr26[16];
-+  uint8_t vr27[16];
-+  uint8_t vr28[16];
-+  uint8_t vr29[16];
-+  uint8_t vr30[16];
-+  uint8_t vr31[16];
-+
-+  double f31;
-+  double f30;
-+  double f29;
-+  double f28;
-+  double f27;
-+  double f26;
-+  double f25;
-+  double f24;
-+  double f23;
-+  double f22;
-+  double f21;
-+  double f20;
-+  double f19;
-+  double f18;
-+  double f17;
-+  double f16;
-+  double f15;
-+  double f14;
-+
-+  uint64_t r31;  // FramePointer
-+  uint64_t r30;
-+  uint64_t r29;
-+  uint64_t r28;
-+  uint64_t r27;
-+  uint64_t r26;
-+  uint64_t r25;
-+  uint64_t r24;
-+  uint64_t r23;
-+  uint64_t r22;
-+  uint64_t r21;
-+  uint64_t r20;
-+  uint64_t r19;
-+  uint64_t r18;
-+  uint64_t r17;
-+  uint64_t r16;
-+  uint64_t r15;
-+  uint64_t r14;
-+  uint64_t r2;  // TOC pointer
-+  uint64_t lr;
-+  // Save reg_vp (r10) on stack so we can use it after the JIT call returns.
-+  uint64_t r10;
-+};
-+// alignas(16) on the struct ensures sizeof is a multiple of 16, which keeps
-+// SP quadword-aligned after `reserveStack(sizeof(EnterJITRegs))`. The 39
-+// 8-byte scalar slots total 312 bytes; with the 192 bytes of VR slots we
-+// are at 504, which alignas(16) bumps to 512.
-+static_assert((sizeof(EnterJITRegs) % 16) == 0,
-+              "EnterJITRegs must be 16-byte aligned to keep SP aligned");
-+
-+static void GenerateReturn(MacroAssembler& masm) {
-+  MOZ_ASSERT(masm.framePushed() == sizeof(EnterJITRegs));
-+
-+  // Restore non-volatile GPRs (and r2) from the EnterJITRegs area at SP.
-+  masm.as_ld(r14, StackPointer, offsetof(EnterJITRegs, r14));
-+  masm.as_ld(r15, StackPointer, offsetof(EnterJITRegs, r15));
-+  masm.as_ld(r16, StackPointer, offsetof(EnterJITRegs, r16));
-+  masm.as_ld(r17, StackPointer, offsetof(EnterJITRegs, r17));
-+  masm.as_ld(r18, StackPointer, offsetof(EnterJITRegs, r18));
-+  masm.as_ld(r19, StackPointer, offsetof(EnterJITRegs, r19));
-+  masm.as_ld(r20, StackPointer, offsetof(EnterJITRegs, r20));
-+  masm.as_ld(r21, StackPointer, offsetof(EnterJITRegs, r21));
-+  masm.as_ld(r22, StackPointer, offsetof(EnterJITRegs, r22));
-+  masm.as_ld(r23, StackPointer, offsetof(EnterJITRegs, r23));
-+  masm.as_ld(r24, StackPointer, offsetof(EnterJITRegs, r24));
-+  masm.as_ld(r25, StackPointer, offsetof(EnterJITRegs, r25));
-+  masm.as_ld(r26, StackPointer, offsetof(EnterJITRegs, r26));
-+  masm.as_ld(r27, StackPointer, offsetof(EnterJITRegs, r27));
-+  masm.as_ld(r28, StackPointer, offsetof(EnterJITRegs, r28));
-+  masm.as_ld(r29, StackPointer, offsetof(EnterJITRegs, r29));
-+  masm.as_ld(r30, StackPointer, offsetof(EnterJITRegs, r30));
-+  masm.as_ld(r31, StackPointer, offsetof(EnterJITRegs, r31));
-+  masm.as_ld(r2, StackPointer, offsetof(EnterJITRegs, r2));
-+
-+  // Restore LR from the in-frame copy, staging it through r0.
-+  masm.as_ld(r0, StackPointer, offsetof(EnterJITRegs, lr));
-+  masm.xs_mtlr(r0);
-+
-+  // Restore non-volatile FPRs.
-+  masm.as_lfd(f14, StackPointer, offsetof(EnterJITRegs, f14));
-+  masm.as_lfd(f15, StackPointer, offsetof(EnterJITRegs, f15));
-+  masm.as_lfd(f16, StackPointer, offsetof(EnterJITRegs, f16));
-+  masm.as_lfd(f17, StackPointer, offsetof(EnterJITRegs, f17));
-+  masm.as_lfd(f18, StackPointer, offsetof(EnterJITRegs, f18));
-+  masm.as_lfd(f19, StackPointer, offsetof(EnterJITRegs, f19));
-+  masm.as_lfd(f20, StackPointer, offsetof(EnterJITRegs, f20));
-+  masm.as_lfd(f21, StackPointer, offsetof(EnterJITRegs, f21));
-+  masm.as_lfd(f22, StackPointer, offsetof(EnterJITRegs, f22));
-+  masm.as_lfd(f23, StackPointer, offsetof(EnterJITRegs, f23));
-+  masm.as_lfd(f24, StackPointer, offsetof(EnterJITRegs, f24));
-+  masm.as_lfd(f25, StackPointer, offsetof(EnterJITRegs, f25));
-+  masm.as_lfd(f26, StackPointer, offsetof(EnterJITRegs, f26));
-+  masm.as_lfd(f27, StackPointer, offsetof(EnterJITRegs, f27));
-+  masm.as_lfd(f28, StackPointer, offsetof(EnterJITRegs, f28));
-+  masm.as_lfd(f29, StackPointer, offsetof(EnterJITRegs, f29));
-+  masm.as_lfd(f30, StackPointer, offsetof(EnterJITRegs, f30));
-+  masm.as_lfd(f31, StackPointer, offsetof(EnterJITRegs, f31));
-+
-+  // Restore callee-saved VR20-VR31 (ELFv2). lvx uses indexed addressing
-+  // (RA|0 + RB): r0 is read normally in the RB slot, and RA = StackPointer
-+  // is not register 0, so SP's value is added. r0 is non-allocatable.
-+#define RESTORE_VR(N)                                                 \
-+  masm.xs_li(r0, offsetof(EnterJITRegs, vr##N));                      \
-+  masm.as_lvx(N, StackPointer, r0)
-+  RESTORE_VR(20); RESTORE_VR(21); RESTORE_VR(22); RESTORE_VR(23);
-+  RESTORE_VR(24); RESTORE_VR(25); RESTORE_VR(26); RESTORE_VR(27);
-+  RESTORE_VR(28); RESTORE_VR(29); RESTORE_VR(30); RESTORE_VR(31);
-+#undef RESTORE_VR
-+
-+  masm.freeStack(sizeof(EnterJITRegs));  // pop the whole save area
-+
-+  masm.as_blr();  // return through the LR restored above
-+}
-+
-+static void GeneratePrologue(MacroAssembler& masm) {
-+  // Copy LR into r0 first (PPC64 LR is an SPR, not a GPR).
-+  masm.xs_mflr(r0);
-+
-+  // ELFv2 prologue convention: save LR at caller's frame [SP+16] BEFORE
-+  // decrementing SP. External unwinders (gdb, perf, libunwind) walk the
-+  // stack by reading LR-save slots at [SP+16] of every frame; without
-+  // this write they'd find junk at our caller's slot. Costs 1 extra
-+  // instruction; we still keep the in-frame save below for clean
-+  // restore symmetry.
-+  masm.as_std(r0, StackPointer, 16);
-+
-+  masm.reserveStack(sizeof(EnterJITRegs));
-+
-+  // Save LR again inside our own frame; GenerateReturn reloads it from
-+  // offsetof(EnterJITRegs, lr) rather than from the caller's [SP+16] slot.
-+  masm.as_std(r0, StackPointer, offsetof(EnterJITRegs, lr));
-+
-+  // Save non-volatile GPRs (plus r2, the TOC pointer).
-+  masm.as_std(r2, StackPointer, offsetof(EnterJITRegs, r2));
-+  masm.as_std(r14, StackPointer, offsetof(EnterJITRegs, r14));
-+  masm.as_std(r15, StackPointer, offsetof(EnterJITRegs, r15));
-+  masm.as_std(r16, StackPointer, offsetof(EnterJITRegs, r16));
-+  masm.as_std(r17, StackPointer, offsetof(EnterJITRegs, r17));
-+  masm.as_std(r18, StackPointer, offsetof(EnterJITRegs, r18));
-+  masm.as_std(r19, StackPointer, offsetof(EnterJITRegs, r19));
-+  masm.as_std(r20, StackPointer, offsetof(EnterJITRegs, r20));
-+  masm.as_std(r21, StackPointer, offsetof(EnterJITRegs, r21));
-+  masm.as_std(r22, StackPointer, offsetof(EnterJITRegs, r22));
-+  masm.as_std(r23, StackPointer, offsetof(EnterJITRegs, r23));
-+  masm.as_std(r24, StackPointer, offsetof(EnterJITRegs, r24));
-+  masm.as_std(r25, StackPointer, offsetof(EnterJITRegs, r25));
-+  masm.as_std(r26, StackPointer, offsetof(EnterJITRegs, r26));
-+  masm.as_std(r27, StackPointer, offsetof(EnterJITRegs, r27));
-+  masm.as_std(r28, StackPointer, offsetof(EnterJITRegs, r28));
-+  masm.as_std(r29, StackPointer, offsetof(EnterJITRegs, r29));
-+  masm.as_std(r30, StackPointer, offsetof(EnterJITRegs, r30));
-+  masm.as_std(r31, StackPointer, offsetof(EnterJITRegs, r31));
-+
-+  // Save reg_vp (r10) so we can retrieve it after the JIT call.
-+  masm.as_std(r10, StackPointer, offsetof(EnterJITRegs, r10));
-+
-+  // Save non-volatile FPRs.
-+  masm.as_stfd(f14, StackPointer, offsetof(EnterJITRegs, f14));
-+  masm.as_stfd(f15, StackPointer, offsetof(EnterJITRegs, f15));
-+  masm.as_stfd(f16, StackPointer, offsetof(EnterJITRegs, f16));
-+  masm.as_stfd(f17, StackPointer, offsetof(EnterJITRegs, f17));
-+  masm.as_stfd(f18, StackPointer, offsetof(EnterJITRegs, f18));
-+  masm.as_stfd(f19, StackPointer, offsetof(EnterJITRegs, f19));
-+  masm.as_stfd(f20, StackPointer, offsetof(EnterJITRegs, f20));
-+  masm.as_stfd(f21, StackPointer, offsetof(EnterJITRegs, f21));
-+  masm.as_stfd(f22, StackPointer, offsetof(EnterJITRegs, f22));
-+  masm.as_stfd(f23, StackPointer, offsetof(EnterJITRegs, f23));
-+  masm.as_stfd(f24, StackPointer, offsetof(EnterJITRegs, f24));
-+  masm.as_stfd(f25, StackPointer, offsetof(EnterJITRegs, f25));
-+  masm.as_stfd(f26, StackPointer, offsetof(EnterJITRegs, f26));
-+  masm.as_stfd(f27, StackPointer, offsetof(EnterJITRegs, f27));
-+  masm.as_stfd(f28, StackPointer, offsetof(EnterJITRegs, f28));
-+  masm.as_stfd(f29, StackPointer, offsetof(EnterJITRegs, f29));
-+  masm.as_stfd(f30, StackPointer, offsetof(EnterJITRegs, f30));
-+  masm.as_stfd(f31, StackPointer, offsetof(EnterJITRegs, f31));
-+
-+  // Save callee-saved VR20-VR31 (ELFv2). The JIT freely uses VMX registers
-+  // via EmitVmxBinary etc.; without this save the C caller's VR20-VR31
-+  // contents would be trashed on return. stvx uses indexed addressing —
-+  // r0 holds the offset (non-allocatable in JIT regalloc; safe to use as
-+  // a free temp here, LR having already been stored from it above).
-+#define SAVE_VR(N)                                                    \
-+  masm.xs_li(r0, offsetof(EnterJITRegs, vr##N));                      \
-+  masm.as_stvx(N, StackPointer, r0)
-+  SAVE_VR(20); SAVE_VR(21); SAVE_VR(22); SAVE_VR(23);
-+  SAVE_VR(24); SAVE_VR(25); SAVE_VR(26); SAVE_VR(27);
-+  SAVE_VR(28); SAVE_VR(29); SAVE_VR(30); SAVE_VR(31);
-+#undef SAVE_VR
-+}
-+
-+void JitRuntime::generateEnterJIT(JSContext* cx, MacroAssembler& masm) {
-+  AutoCreatedBy acb(masm, "JitRuntime::generateEnterJIT");
-+
-+  enterJITOffset_ = startTrampolineCode(masm);
-+
-+  // EnterJitCode signature: (void* code, unsigned argc, Value* argv,
-+  //                          InterpreterFrame* fp, CalleeToken calleeToken,
-+  //                          JSObject* envChain, size_t numStackValues,
-+  //                          Value* vp)
-+  const Register reg_code = IntArgReg0;                       // r3
-+  const Register reg_argc = IntArgReg1;                       // r4
-+  const Register reg_argv = IntArgReg2;                       // r5
-+  const mozilla::DebugOnly<Register> reg_frame = IntArgReg3;  // r6
-+  const Register reg_token = IntArgReg4;                      // r7
-+  const Register reg_chain = IntArgReg5;                      // r8
-+  const Register reg_values = IntArgReg6;                     // r9
-+  const Register reg_vp = IntArgReg7;                         // r10
-+
-+  MOZ_ASSERT(OsrFrameReg == reg_frame);
-+
-+  GeneratePrologue(masm);
-+
-+  // Remember SP (the EnterJITRegs base) in FramePointer for the epilogue.
-+  masm.movePtr(StackPointer, FramePointer);
-+
-+  // Use non-volatile scratch registers for generateEnterJitShared.
-+  // r14, r15, r17 are non-volatile and not special-purpose in JIT.
-+  generateEnterJitShared(masm, reg_argc, reg_argv, reg_token, r14, r15, r17);
-+
-+  // Push the frame descriptor, built from the int32 unboxed out of vp[0].
-+  masm.unboxInt32(Address(reg_vp, 0), r14);
-+  masm.pushFrameDescriptorForJitCall(FrameType::CppToJSJit, r14, r14);
-+
-+  CodeLabel returnLabel;
-+  Label oomReturnLabel;
-+  {
-+    // Handle Interpreter -> Baseline OSR.
-+    AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All());
-+    MOZ_ASSERT(!regs.has(FramePointer));
-+    regs.take(OsrFrameReg);
-+    regs.take(reg_code);
-+    MOZ_ASSERT(!regs.has(ReturnReg), "ReturnReg matches reg_code");
-+
-+    Label notOsr;
-+    masm.branchTestPtr(Assembler::Zero, OsrFrameReg, OsrFrameReg, &notOsr);
-+
-+    Register numStackValues = reg_values;
-+    regs.take(numStackValues);
-+    Register scratch = regs.takeAny();
-+
-+    // Push return address (the address of returnLabel, bound after the call).
-+    masm.subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
-+    masm.mov(&returnLabel, scratch);
-+    masm.storePtr(scratch, Address(StackPointer, 0));
-+
-+    // Push previous frame pointer.
-+    masm.subPtr(Imm32(sizeof(uintptr_t)), StackPointer);
-+    masm.storePtr(FramePointer, Address(StackPointer, 0));
-+
-+    // Reserve frame.
-+    Register framePtr = FramePointer;
-+    masm.movePtr(StackPointer, framePtr);
-+    masm.subPtr(Imm32(BaselineFrame::Size()), StackPointer);
-+
-+    Register framePtrScratch = regs.takeAny();
-+    masm.movePtr(StackPointer, framePtrScratch);
-+
-+    // Reserve space for locals and stack values (numStackValues * 8 bytes).
-+    masm.x_sldi(scratch, numStackValues, 3);
-+    masm.subPtr(scratch, StackPointer);
-+
-+    // Enter exit frame.
-+    masm.reserveStack(3 * sizeof(uintptr_t));
-+    masm.storePtr(ImmWord(MakeFrameDescriptor(FrameType::BaselineJS)),
-+                  Address(StackPointer, 2 * sizeof(uintptr_t)));
-+    masm.storePtr(ImmPtr(nullptr), Address(StackPointer, sizeof(uintptr_t)));
-+    masm.storePtr(FramePointer, Address(StackPointer, 0));
-+
-+    // No GC things to mark, push a bare token.
-+    masm.loadJSContext(scratch);
-+    masm.enterFakeExitFrame(scratch, scratch, ExitFrameType::Bare);
-+
-+    masm.reserveStack(2 * sizeof(uintptr_t));  // spill framePtr and reg_code
-+    masm.storePtr(framePtr, Address(StackPointer, sizeof(uintptr_t)));
-+    masm.storePtr(reg_code, Address(StackPointer, 0));
-+
-+    using Fn = void (*)(BaselineFrame* frame, InterpreterFrame* interpFrame,
-+                        uint32_t numStackValues);
-+    masm.setupUnalignedABICall(scratch);
-+    masm.passABIArg(framePtrScratch);
-+    masm.passABIArg(OsrFrameReg);
-+    masm.passABIArg(numStackValues);
-+    masm.callWithABI<Fn, jit::InitBaselineFrameForOsr>(
-+        ABIType::General, CheckUnsafeCallWithABI::DontCheckHasExitFrame);
-+
-+    regs.add(OsrFrameReg);
-+    Register jitcode = regs.takeAny();
-+    masm.loadPtr(Address(StackPointer, 0), jitcode);  // the spilled reg_code
-+    masm.loadPtr(Address(StackPointer, sizeof(uintptr_t)), framePtr);
-+    masm.freeStack(2 * sizeof(uintptr_t));
-+
-+    masm.freeStack(ExitFrameLayout::SizeWithFooter());
-+
-+    // If OSR-ing, then emit instrumentation for setting lastProfilerFrame
-+    // if profiler instrumentation is enabled.
-+    {
-+      Label skipProfilingInstrumentation;
-+      AbsoluteAddress addressOfEnabled(
-+          cx->runtime()->geckoProfiler().addressOfEnabled());
-+      masm.branch32(Assembler::Equal, addressOfEnabled, Imm32(0),
-+                    &skipProfilingInstrumentation);
-+      masm.profilerEnterFrame(framePtr, scratch);
-+      masm.bind(&skipProfilingInstrumentation);
-+    }
-+
-+    masm.jump(jitcode);
-+
-+    masm.bind(&notOsr);
-+    // Load the environment chain (reg_chain) into R1's scratch register.
-+    MOZ_ASSERT(R1.scratchReg() != reg_code);
-+    masm.movePtr(reg_chain, R1.scratchReg());
-+  }
-+
-+  // The call will push the return address and frame pointer on the stack, thus
-+  // we check that the stack would be aligned once the call is complete.
-+  masm.assertStackAlignment(JitStackAlignment, 2 * sizeof(uintptr_t));
-+
-+  // Call the function with pushing return address to stack.
-+  masm.callJitNoProfiler(reg_code);
-+
-+  {
-+    // Interpreter -> Baseline OSR will return here.
-+    masm.bind(&returnLabel);
-+    masm.addCodeLabel(returnLabel);
-+    masm.bind(&oomReturnLabel);  // NOTE(review): nothing here branches to it
-+  }
-+
-+  // Discard arguments and padding. Set sp to the address of the EnterJITRegs
-+  // on the stack.
-+  masm.movePtr(FramePointer, StackPointer);
-+
-+  // Store the returned value into the vp (reg_vp reloaded from its save slot).
-+  masm.as_ld(reg_vp, StackPointer, offsetof(EnterJITRegs, r10));
-+  masm.storeValue(JSReturnOperand, Address(reg_vp, 0));
-+
-+  // Restore non-volatile registers and return.
-+  GenerateReturn(masm);
-+}
-+
-+// static
-+mozilla::Maybe<::JS::ProfilingFrameIterator::RegisterState>
-+JitRuntime::getCppEntryRegisters(JitFrameLayout* frameStackAddress) {
-+  return mozilla::Nothing{};  // always empty; frameStackAddress is not read
-+}
-+
-+void JitRuntime::generateInvalidator(MacroAssembler& masm, Label* bailoutTail) {
-+  AutoCreatedBy acb(masm, "JitRuntime::generateInvalidator");
-+
-+  invalidatorOffset_ = startTrampolineCode(masm);
-+
-+  masm.checkStackAlignment();
-+
-+  // Push all registers so we can access them from [base + code].
-+  masm.PushRegsInMask(AllRegs);
-+
-+  // Pass pointer to InvalidationBailoutStack structure.
-+  masm.movePtr(StackPointer, IntArgReg0);
-+
-+  // Reserve place for BailoutInfo pointer. Two words to ensure alignment for
-+  // setupAlignedABICall.
-+  masm.subPtr(Imm32(2 * sizeof(uintptr_t)), StackPointer);
-+  masm.movePtr(StackPointer, IntArgReg1);
-+
-+  using Fn = bool (*)(InvalidationBailoutStack* sp, BaselineBailoutInfo** info);
-+  masm.setupAlignedABICall();
-+  masm.passABIArg(IntArgReg0);
-+  masm.passABIArg(IntArgReg1);
-+  masm.callWithABI<Fn, InvalidationBailout>(
-+      ABIType::General, CheckUnsafeCallWithABI::DontCheckOther);
-+
-+  masm.pop(IntArgReg2);  // first reserved word: the `info` outparam slot
-+
-+  // Pop the machine state and the dead frame (SP is reset, not unwound).
-+  masm.moveToStackPtr(FramePointer);
-+
-+  // Jump to shared bailout tail. The BailoutInfo pointer has to be in
-+  // IntArgReg2 (r5).
-+  masm.jump(bailoutTail);
-+}
-+
-+// Used when a bailout is taken via out-of-line code (lazy bailout).
-+// The frame size is stored in LR (see
-+// CodeGeneratorPPC64::generateOutOfLineCode()) and the thunk code must save
-+// it on the stack. On return, spArg holds the address of the pushed state.
-+static void PushBailoutFrame(MacroAssembler& masm, Register spArg) {
-+  // Push the frameSize_ stored in LR.
-+  // See: CodeGeneratorPPC64::generateOutOfLineCode()
-+  masm.pushReturnAddress();
-+
-+  // Push registers such that we can access them from [base + code].
-+  masm.PushRegsInMask(AllRegs);
-+
-+  // Put pointer to BailoutStack as first argument to the Bailout().
-+  masm.movePtr(StackPointer, spArg);
-+}
-+
-+static void GenerateBailoutThunk(MacroAssembler& masm, Label* bailoutTail) {
-+  PushBailoutFrame(masm, IntArgReg0);  // IntArgReg0 = pushed BailoutStack
-+
-+  // Make space for Bailout's bailoutInfo outparam.
-+  masm.reserveStack(sizeof(void*));
-+  masm.movePtr(StackPointer, IntArgReg1);
-+
-+  // Call the bailout function.
-+  using Fn = bool (*)(BailoutStack* sp, BaselineBailoutInfo** info);
-+  masm.setupUnalignedABICall(IntArgReg2);  // IntArgReg2 serves as the scratch
-+  masm.passABIArg(IntArgReg0);
-+  masm.passABIArg(IntArgReg1);
-+  masm.callWithABI<Fn, Bailout>(ABIType::General,
-+                                CheckUnsafeCallWithABI::DontCheckOther);
-+
-+  // Get the bailoutInfo outparam.
-+  masm.pop(IntArgReg2);
-+
-+  // Remove both the bailout frame and the topmost Ion frame's stack.
-+  masm.moveToStackPtr(FramePointer);
-+
-+  // Jump to shared bailout tail. The BailoutInfo pointer has to be in
-+  // IntArgReg2 (r5).
-+  masm.jump(bailoutTail);
-+}
-+
-+void JitRuntime::generateBailoutHandler(MacroAssembler& masm,
-+                                        Label* bailoutTail) {
-+  AutoCreatedBy acb(masm, "JitRuntime::generateBailoutHandler");
-+
-+  bailoutHandlerOffset_ = startTrampolineCode(masm);  // record entry offset
-+
-+  GenerateBailoutThunk(masm, bailoutTail);  // the handler body is the thunk
-+}
-+
-+bool JitRuntime::generateVMWrapper(JSContext* cx, MacroAssembler& masm,
-+                                   VMFunctionId id, const VMFunctionData& f,
-+                                   DynFn nativeFun, uint32_t* wrapperOffset) {
-+  AutoCreatedBy acb(masm, "JitRuntime::generateVMWrapper");
-+
-+  *wrapperOffset = startTrampolineCode(masm);
-+
-+  // Avoid conflicts with argument registers while discarding the result after
-+  // the function call.
-+  AllocatableGeneralRegisterSet regs(Register::Codes::WrapperMask);
-+
-+  static_assert(
-+      (Register::Codes::VolatileMask & ~Register::Codes::WrapperMask) == 0,
-+      "Wrapper register set should be a superset of Volatile register set.");
-+
-+  // The context is the first argument; r3 is the first argument register.
-+  Register cxreg = IntArgReg0;
-+  regs.take(cxreg);
-+
-+  // On link-register platforms, it is the responsibility of the VM *callee* to
-+  // push the return address, while the caller must ensure that the address
-+  // is stored in LR on entry. This allows the VM wrapper to work with both
-+  // direct calls and tail calls.
-+  masm.pushReturnAddress();
-+
-+  // Push the frame pointer to finish the exit frame, then link it up.
-+  masm.Push(FramePointer);
-+  masm.moveStackPtrTo(FramePointer);
-+  masm.loadJSContext(cxreg);
-+  masm.enterExitFrame(cxreg, regs.getAny(), id);
-+
-+  // Reserve space for the outparameter.
-+  masm.reserveVMFunctionOutParamSpace(f);
-+
-+  masm.setupUnalignedABICallDontSaveRestoreSP();
-+  masm.passABIArg(cxreg);
-+
-+  size_t argDisp = ExitFrameLayout::Size();  // FramePointer-relative offset
-+
-+  // Copy any arguments, advancing argDisp one word per explicit argument.
-+  for (uint32_t explicitArg = 0; explicitArg < f.explicitArgs; explicitArg++) {
-+    switch (f.argProperties(explicitArg)) {
-+      case VMFunctionData::WordByValue:
-+        if (f.argPassedInFloatReg(explicitArg)) {
-+          masm.passABIArg(MoveOperand(FramePointer, argDisp), ABIType::Float64);
-+        } else {
-+          masm.passABIArg(MoveOperand(FramePointer, argDisp), ABIType::General);
-+        }
-+        argDisp += sizeof(void*);
-+        break;
-+      case VMFunctionData::WordByRef:
-+        masm.passABIArg(MoveOperand(FramePointer, argDisp,
-+                                    MoveOperand::Kind::EffectiveAddress),
-+                        ABIType::General);
-+        argDisp += sizeof(void*);
-+        break;
-+      case VMFunctionData::DoubleByValue:
-+      case VMFunctionData::DoubleByRef:
-+        MOZ_CRASH("NYI: PPC64 callVM should not be used with 128bits values.");
-+        break;
-+    }
-+  }
-+
-+  // Copy the implicit outparam, if any (passed by address, below the footer).
-+  const int32_t outParamOffset =
-+      -int32_t(ExitFooterFrame::Size()) - f.sizeOfOutParamStackSlot();
-+  if (f.outParam != Type_Void) {
-+    masm.passABIArg(MoveOperand(FramePointer, outParamOffset,
-+                                MoveOperand::Kind::EffectiveAddress),
-+                    ABIType::General);
-+  }
-+
-+  masm.callWithABI(nativeFun, ABIType::General,
-+                   CheckUnsafeCallWithABI::DontCheckHasExitFrame);
-+
-+  // Test for failure; the native result is checked in IntArgReg0.
-+  switch (f.failType()) {
-+    case Type_Cell:
-+      masm.branchTestPtr(Assembler::Zero, IntArgReg0, IntArgReg0,
-+                         masm.failureLabel());
-+      break;
-+    case Type_Bool:
-+      masm.branchIfFalseBool(IntArgReg0, masm.failureLabel());
-+      break;
-+    case Type_Void:
-+      break;
-+    default:
-+      MOZ_CRASH("unknown failure kind");
-+  }
-+
-+  // Load the outparam from FramePointer + outParamOffset.
-+  masm.loadVMFunctionOutParam(f, Address(FramePointer, outParamOffset));
-+
-+  // Pop frame and restore frame pointer.
-+  masm.moveToStackPtr(FramePointer);
-+  masm.pop(FramePointer);
-+
-+  // Return. Subtract sizeof(void*) for the frame pointer.
-+  masm.retn(Imm32(sizeof(ExitFrameLayout) - sizeof(void*) +
-+                  f.explicitStackSlots() * sizeof(void*) +
-+                  f.extraValuesToPop * sizeof(Value)));
-+
-+  return true;
-+}
-+
-+uint32_t JitRuntime::generatePreBarrier(JSContext* cx, MacroAssembler& masm,
-+                                        MIRType type) {
-+  AutoCreatedBy acb(masm, "JitRuntime::generatePreBarrier");
-+
-+  uint32_t offset = startTrampolineCode(masm);
-+
-+  MOZ_ASSERT(PreBarrierReg == IntArgReg1);  // r4
-+  Register temp1 = IntArgReg0;              // r3
-+  Register temp2 = IntArgReg2;              // r5
-+  Register temp3 = IntArgReg3;              // r6
-+  masm.push(temp1);
-+  masm.push(temp2);
-+  masm.push(temp3);
-+
-+  Label noBarrier;
-+  masm.emitPreBarrierFastPath(type, temp1, temp2, temp3, &noBarrier);
-+
-+  // Slow path: restore the temps, then call into C++ to mark this GC thing.
-+  masm.pop(temp3);
-+  masm.pop(temp2);
-+  masm.pop(temp1);
-+
-+  LiveRegisterSet save;
-+  save.set() = RegisterSet(GeneralRegisterSet(Registers::VolatileMask),
-+                           FloatRegisterSet(FloatRegisters::VolatileMask));
-+  // On PPC64, save LR since we'll be making a call.
-+  masm.pushReturnAddress();
-+  masm.PushRegsInMask(save);
-+
-+  masm.movePtr(ImmPtr(cx->runtime()), IntArgReg0);
-+
-+  masm.setupUnalignedABICall(IntArgReg2);
-+  masm.passABIArg(IntArgReg0);
-+  masm.passABIArg(IntArgReg1);  // PreBarrierReg, per the assertion above
-+  masm.callWithABI(JitPreWriteBarrier(type));
-+
-+  masm.PopRegsInMask(save);
-+  masm.ret();
-+
-+  masm.bind(&noBarrier);  // fast path: only the three temps are on the stack
-+  masm.pop(temp3);
-+  masm.pop(temp2);
-+  masm.pop(temp1);
-+  masm.abiret();
-+
-+  return offset;  // value startTrampolineCode returned for this stub
-+}
-+
-+void JitRuntime::generateBailoutTailStub(MacroAssembler& masm,
-+                                         Label* bailoutTail) {
-+  AutoCreatedBy acb(masm, "JitRuntime::generateBailoutTailStub");
-+
-+  masm.bind(bailoutTail);
-+  masm.generateBailoutTail(IntArgReg1, IntArgReg2);
-+}
-diff --git a/js/src/jit/shared/Assembler-shared.h b/js/src/jit/shared/Assembler-shared.h
-index d5fed2fabe31..490a9f5391e0 100644
---- a/js/src/jit/shared/Assembler-shared.h
-+++ b/js/src/jit/shared/Assembler-shared.h
-@@ -30,14 +30,15 @@
- 
- #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) ||      \
-     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_RISCV64) || \
-+    defined(JS_CODEGEN_PPC64)
- // Push return addresses callee-side.
- #  define JS_USE_LINK_REGISTER
- #endif
- 
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_ARM64) ||    \
-     defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
--    defined(JS_CODEGEN_ARM)
-+    defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_PPC64)
- // JS_CODELABEL_LINKMODE gives labels additional metadata
- // describing how Bind() should patch them.
- #  define JS_CODELABEL_LINKMODE
-diff --git a/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h b/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
-index a6909e560bef..d886cba2c7e6 100644
---- a/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
-+++ b/js/src/jit/shared/AtomicOperations-feeling-lucky-gcc.h
-@@ -46,7 +46,8 @@
- // code in this file.
- 
- #if defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_ARM) || \
--    defined(JS_SIMULATOR_MIPS64) || defined(JS_SIMULATOR_LOONG64)
-+    defined(JS_SIMULATOR_MIPS64) || defined(JS_SIMULATOR_LOONG64) || \
-+    defined(JS_SIMULATOR_PPC64)
- // On some x86 (32-bit) systems this will not work because the compiler does not
- // open-code 64-bit atomics.  If so, try linking with -latomic.  If that doesn't
- // work, you're mostly on your own.
-diff --git a/js/src/jit/shared/CodeGenerator-shared.cpp b/js/src/jit/shared/CodeGenerator-shared.cpp
-index ada87f1f11a2..14468356cf31 100644
---- a/js/src/jit/shared/CodeGenerator-shared.cpp
-+++ b/js/src/jit/shared/CodeGenerator-shared.cpp
-@@ -86,8 +86,8 @@ CodeGeneratorShared::CodeGeneratorShared(MIRGenerator* gen, LIRGraph* graph,
- 
- #ifdef ENABLE_WASM_SIMD
- #  if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) || \
--      defined(JS_CODEGEN_ARM64)
--    // On X64/x86 and ARM64, we don't need alignment for Wasm SIMD at this time.
-+      defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
-+    // On X64/x86, ARM64, and PPC64, we don't need alignment for Wasm SIMD at this time.
- #  else
- #    error \
-         "we may need padding so that local slots are SIMD-aligned and the stack must be kept SIMD-aligned too."
-@@ -1075,7 +1075,7 @@ Label* CodeGeneratorShared::getJumpLabelForBranch(MBasicBlock* block) {
- // This function is not used for MIPS64/LOONG64/RISCV64. They have
- // branchToBlock.
- #if !defined(JS_CODEGEN_MIPS64) && !defined(JS_CODEGEN_LOONG64) && \
--    !defined(JS_CODEGEN_RISCV64)
-+    !defined(JS_CODEGEN_RISCV64) && !defined(JS_CODEGEN_PPC64)
- void CodeGeneratorShared::jumpToBlock(MBasicBlock* mir,
-                                       Assembler::Condition cond) {
-   // Skip past trivial blocks.
-diff --git a/js/src/jit/shared/Lowering-shared-inl.h b/js/src/jit/shared/Lowering-shared-inl.h
-index bdcc1da7d41a..b62f8f681df1 100644
---- a/js/src/jit/shared/Lowering-shared-inl.h
-+++ b/js/src/jit/shared/Lowering-shared-inl.h
-@@ -527,7 +527,7 @@ LAllocation LIRGeneratorShared::useRegisterOrNonDoubleConstant(
- 
- #if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) ||      \
-     defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
- LAllocation LIRGeneratorShared::useAnyOrConstant(MDefinition* mir) {
-   return useRegisterOrConstant(mir);
- }
-diff --git a/js/src/js-config.mozbuild b/js/src/js-config.mozbuild
-index 22becaf4ecfb..ff5294825e9d 100644
---- a/js/src/js-config.mozbuild
-+++ b/js/src/js-config.mozbuild
-@@ -8,6 +8,7 @@ if (
-     CONFIG["JS_CODEGEN_X64"]
-     or CONFIG["JS_CODEGEN_ARM64"]
-     or CONFIG["JS_CODEGEN_RISCV64"]
-+    or CONFIG["JS_CODEGEN_PPC64"]
- ):
-     DEFINES["WASM_SUPPORTS_HUGE_MEMORY"] = True
- 
-diff --git a/js/src/jsapi-tests/testJitABIcalls.cpp b/js/src/jsapi-tests/testJitABIcalls.cpp
-index b5c03a47dd83..887ad9e3d959 100644
---- a/js/src/jsapi-tests/testJitABIcalls.cpp
-+++ b/js/src/jsapi-tests/testJitABIcalls.cpp
-@@ -718,6 +718,9 @@ class JitABICall final : public jsapitest::RuntimeTest,
- #elif defined(JS_CODEGEN_RISCV64)
-     Register base = t0;
-     regs.take(base);
-+#elif defined(JS_CODEGEN_PPC64)
-+    Register base = r11;
-+    regs.take(base);
- #else
- #  error "Unknown architecture!"
- #endif
-diff --git a/js/src/jsapi-tests/testWasmReturnCalls.cpp b/js/src/jsapi-tests/testWasmReturnCalls.cpp
-index 4728f2404ae8..a07ddb2f214e 100644
---- a/js/src/jsapi-tests/testWasmReturnCalls.cpp
-+++ b/js/src/jsapi-tests/testWasmReturnCalls.cpp
-@@ -32,7 +32,10 @@ BEGIN_TEST(testWasmCheckSlowCallMarkerHit) {
- 
-   masm.bind(&check);
- #  ifdef JS_USE_LINK_REGISTER
--#    if !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
-+#    if defined(JS_CODEGEN_PPC64)
-+  static constexpr Register ra = ABINonArgReg3;
-+  masm.xs_mflr(ra);
-+#    elif !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
-         !defined(JS_CODEGEN_RISCV64)
-   static constexpr Register ra = lr;
- #    endif
-@@ -70,7 +73,10 @@ BEGIN_TEST(testWasmCheckSlowCallMarkerMiss) {
- 
-   masm.bind(&check);
- #  ifdef JS_USE_LINK_REGISTER
--#    if !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
-+#    if defined(JS_CODEGEN_PPC64)
-+  static constexpr Register ra = ABINonArgReg3;
-+  masm.xs_mflr(ra);
-+#    elif !defined(JS_CODEGEN_LOONG64) && !defined(JS_CODEGEN_MIPS64) && \
-         !defined(JS_CODEGEN_RISCV64)
-   static constexpr Register ra = lr;
- #    endif
-diff --git a/js/src/jsapi-tests/testsJit.cpp b/js/src/jsapi-tests/testsJit.cpp
-index a2dfe5d0196c..7f3dcca895d2 100644
---- a/js/src/jsapi-tests/testsJit.cpp
-+++ b/js/src/jsapi-tests/testsJit.cpp
-@@ -25,6 +25,14 @@ void PrepareJit(js::jit::MacroAssembler& masm) {
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-     defined(JS_CODEGEN_RISCV64)
-   save.add(js::jit::ra);
-+#elif defined(JS_CODEGEN_PPC64)
-+  // LR on PPC64 isn't a GPR; save it to the stack manually.
-+  {
-+    UseScratchRegisterScope temps(masm);
-+    Register scratch = temps.Acquire();
-+    masm.xs_mflr(scratch);
-+    masm.as_stdu(scratch, StackPointer, -8);
-+  }
- #elif defined(JS_USE_LINK_REGISTER)
-   save.add(js::jit::lr);
- #endif
-@@ -44,6 +52,8 @@ bool ExecuteJit(JSContext* cx, js::jit::MacroAssembler& masm) {
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-     defined(JS_CODEGEN_RISCV64)
-   restore.add(js::jit::ra);
-+#elif defined(JS_CODEGEN_PPC64)
-+  // LR will be restored manually after PopRegsInMask.
- #elif defined(JS_USE_LINK_REGISTER)
-   restore.add(js::jit::lr);
- #endif
-@@ -55,6 +65,16 @@ bool ExecuteJit(JSContext* cx, js::jit::MacroAssembler& masm) {
- 
-   // Reset stack pointer.
-   masm.SetStackPointer64(PseudoStackPointer64);
-+#elif defined(JS_CODEGEN_PPC64)
-+  // Restore LR from the stack and return.
-+  {
-+    UseScratchRegisterScope temps(masm);
-+    Register scratch = temps.Acquire();
-+    masm.as_ld(scratch, StackPointer, 0);
-+    masm.xs_mtlr(scratch);
-+    masm.as_addi(StackPointer, StackPointer, 8);
-+  }
-+  masm.as_blr();
- #else
-   // Exit the JIT-ed code using the ABI return style.
-   masm.abiret();
-diff --git a/js/src/shell/js.cpp b/js/src/shell/js.cpp
-index 45bc0796b964..20eb1231bb7f 100644
---- a/js/src/shell/js.cpp
-+++ b/js/src/shell/js.cpp
-@@ -7895,6 +7895,13 @@ static void SingleStepCallback(void* arg, jit::Simulator* sim, void* pc) {
-   state.fp = (void*)sim->getRegister(jit::Simulator::fp);
-   // see WasmTailCallFPScratchReg and CollapseWasmFrameFast
-   state.tempFP = (void*)sim->getRegister(jit::Simulator::t3);
-+#  elif defined(JS_SIMULATOR_PPC64)
-+  state.sp = (void*)sim->getRegister(jit::Simulator::sp);
-+  state.lr = (void*)sim->getLR();
-+  state.fp = (void*)sim->getRegister(jit::Simulator::fp);
-+  // WasmTailCallFPScratchReg = ABINonArgReg3 = r22 holds the unwind FP
-+  // during the wasm tail-call collapse window (RestoreFpRa unwind info).
-+  state.tempFP = (void*)sim->getRegister(jit::Simulator::r22);
- #  else
- #    error "NYI: Single-step profiling support"
- #  endif
-@@ -13144,6 +13151,15 @@ bool InitOptionParser(OptionParser& op) {
-                        "Stop the RISC-V simulator after the given "
-                        "NUMBER of instructions.",
-                        -1) ||
-+#endif
-+#ifdef JS_SIMULATOR_PPC64
-+      !op.addBoolOption('\0', "ppc64-sim-icache-checks",
-+                        "Enable icache flush checks in the PPC64 "
-+                        "simulator.") ||
-+      !op.addIntOption('\0', "ppc64-sim-stop-at", "NUMBER",
-+                       "Stop the PPC64 simulator after the given "
-+                       "NUMBER of instructions.",
-+                       -1) ||
- #endif
-       !op.addIntOption('\0', "nursery-size", "SIZE-MB",
-                        "Set the maximum nursery size in MB",
-@@ -14235,6 +14251,15 @@ bool SetContextJITOptions(JSContext* cx, const OptionParser& op) {
-   if (stopAt >= 0) {
-     jit::Simulator::StopSimAt = stopAt;
-   }
-+#elif defined(JS_SIMULATOR_PPC64)
-+  if (op.getBoolOption("ppc64-sim-icache-checks")) {
-+    jit::SimulatorProcess::ICacheCheckingDisableCount = 0;
-+  }
-+
-+  int32_t stopAt = op.getIntOption("ppc64-sim-stop-at");
-+  if (stopAt >= 0) {
-+    jit::Simulator::StopSimAt = stopAt;
-+  }
- #endif
- 
- #ifdef DEBUG
-diff --git a/js/src/shell/jsshell.h b/js/src/shell/jsshell.h
-index e8d47ba6888c..57e2b15f3cdd 100644
---- a/js/src/shell/jsshell.h
-+++ b/js/src/shell/jsshell.h
-@@ -22,7 +22,8 @@
- 
- // Some platform hooks must be implemented for single-step profiling.
- #if defined(JS_SIMULATOR_ARM) || defined(JS_SIMULATOR_MIPS64) || \
--    defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_LOONG64)
-+    defined(JS_SIMULATOR_ARM64) || defined(JS_SIMULATOR_LOONG64) || \
-+    defined(JS_SIMULATOR_RISCV64) || defined(JS_SIMULATOR_PPC64)
- #  define SINGLESTEP_PROFILING
- #endif
- 
-diff --git a/js/src/tests/shell/os.js b/js/src/tests/shell/os.js
-index 929982756548..f3d2396b17eb 100644
---- a/js/src/tests/shell/os.js
-+++ b/js/src/tests/shell/os.js
-@@ -20,7 +20,13 @@ var info = os.waitpid(kidpid, true);
- assertEq(info.hasOwnProperty("pid"), false);
- assertEq(info.hasOwnProperty("exitStatus"), false);
- 
--os.kill(kidpid);
-+// Use SIGKILL (9) instead of the default SIGINT: under heavy parallel test
-+// load, SIGINT delivery can race with the child's signal-handler setup and
-+// the kernel's reaping path, leading to waitpid below blocking until the
-+// `sleep 60` exits normally. SIGKILL is uncatchable and forces immediate
-+// termination, so the assertion below ("killed process should not have
-+// exitStatus") is reliable.
-+os.kill(kidpid, 9);
- 
- info = os.waitpid(kidpid);
- assertEq(info.hasOwnProperty("pid"), true, "waiting on dead process should return pid");
-diff --git a/js/src/util/Poison.h b/js/src/util/Poison.h
-index 721ecff6149d..de7981aa6f60 100644
---- a/js/src/util/Poison.h
-+++ b/js/src/util/Poison.h
-@@ -92,6 +92,8 @@ const uint8_t JS_SCOPE_DATA_TRAILING_NAMES_PATTERN = 0xCC;
- #elif defined(JS_CODEGEN_RISCV64)
- #  define JS_SWEPT_CODE_PATTERN \
-     0x29  // illegal sb instruction, crashes in user mode.
-+#elif defined(JS_CODEGEN_PPC64)
-+#  define JS_SWEPT_CODE_PATTERN 0x00  // illegal instruction (all zeros)
- #else
- #  error "JS_SWEPT_CODE_PATTERN not defined for this platform"
- #endif
-diff --git a/js/src/wasm/WasmAnyRef.h b/js/src/wasm/WasmAnyRef.h
-index f81d4c6171b6..7200e9ab0e23 100644
---- a/js/src/wasm/WasmAnyRef.h
-+++ b/js/src/wasm/WasmAnyRef.h
-@@ -209,7 +209,7 @@ class AnyRef {
-     // Truncate the value to the 31-bit value size.
-     uintptr_t wideValue = uintptr_t(value & 0x7FFFFFFF);
- #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-     // Sign extend the value to the native pointer size.
-     uintptr_t wideValue = uintptr_t(int64_t((uint64_t(value) << 33)) >> 33);
- #elif !defined(JS_64BIT)
-@@ -234,6 +234,11 @@ class AnyRef {
- #  ifdef DEBUG
- #    if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
-     MOZ_ASSERT(value <= UINT32_MAX);
-+#    elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
-+        defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-+    // On sign-extending platforms, a canonical i32 must be the sign
-+    // extension of its low 32 bits.
-+    MOZ_ASSERT(value == uintptr_t(int64_t(int32_t(value))));
- #    endif
- #  endif
-   }
-diff --git a/js/src/wasm/WasmBCDefs.h b/js/src/wasm/WasmBCDefs.h
-index b44e91e28693..66a8c9afe8c6 100644
---- a/js/src/wasm/WasmBCDefs.h
-+++ b/js/src/wasm/WasmBCDefs.h
-@@ -44,6 +44,9 @@
- #if defined(JS_CODEGEN_RISCV64)
- #  include "jit/riscv64/Assembler-riscv64.h"
- #endif
-+#if defined(JS_CODEGEN_PPC64)
-+#  include "jit/ppc64/Assembler-ppc64.h"
-+#endif
- #include "js/ScalarType.h"
- #include "util/Memory.h"
- #include "wasm/WasmCodegenTypes.h"
-@@ -151,6 +154,10 @@ enum class RhsDestOp { True = true };
- #  define RABALDR_PIN_INSTANCE
- #endif
- 
-+#ifdef JS_CODEGEN_PPC64
-+#  define RABALDR_PIN_INSTANCE
-+#endif
-+
- // Max number of pushes onto the value stack for any opcode or emitter that
- // does not push a variable, unbounded amount (anything with multiple
- // results).  This includes also intermediate pushes such as values pushed as
-diff --git a/js/src/wasm/WasmBCMemory.cpp b/js/src/wasm/WasmBCMemory.cpp
-index 835512b09b8c..9137b09f4684 100644
---- a/js/src/wasm/WasmBCMemory.cpp
-+++ b/js/src/wasm/WasmBCMemory.cpp
-@@ -372,7 +372,7 @@ void BaseCompiler::boundsCheckBelow4GBAccess(uint32_t memoryIndex,
- // Make sure the ptr could be used as an index register.
- static inline void ToValidIndex(MacroAssembler& masm, RegI32 ptr) {
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   // When ptr is used as an index, it will be added to a 64-bit register.
-   // So we should explicitly promote ptr to 64-bit. Since now ptr holds a
-   // unsigned 32-bit value, we zero-extend it to 64-bit here.
-@@ -645,6 +645,13 @@ void BaseCompiler::executeLoad(MemoryAccessDesc* access, AccessCheck* check,
-   } else {
-     masm.wasmLoad(*access, memoryBase, ptr, ptr, dest.any());
-   }
-+#elif defined(JS_CODEGEN_PPC64)
-+  MOZ_ASSERT(temp.isInvalid());
-+  if (dest.tag == AnyReg::I64) {
-+    masm.wasmLoadI64(*access, memoryBase, ptr, ptr, dest.i64());
-+  } else {
-+    masm.wasmLoad(*access, memoryBase, ptr, ptr, dest.any());
-+  }
- #else
-   MOZ_CRASH("BaseCompiler platform hook: load");
- #endif
-@@ -675,10 +682,11 @@ void BaseCompiler::load(MemoryAccessDesc* access, AccessCheck* check,
-   // generated is the same for the 64-bit and the 32-bit case.
-   return executeLoad(access, check, instance, memoryBase, RegI32(ptr.reg), dest,
-                      maybeFromI64(temp));
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
--  // On mips64 and loongarch64, the 'prepareMemoryAccess' function will make
--  // sure that ptr holds a valid 64-bit index value. Thus the code generated in
--  // 'executeLoad' is the same for the 64-bit and the 32-bit case.
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-+    defined(JS_CODEGEN_PPC64)
-+  // On mips64, loongarch64, and ppc64, the 'prepareMemoryAccess' function will
-+  // make sure that ptr holds a valid 64-bit index value. Thus the code
-+  // generated in 'executeLoad' is the same for the 64-bit and the 32-bit case.
-   return executeLoad(access, check, instance, memoryBase, RegI32(ptr.reg), dest,
-                      maybeFromI64(temp));
- #elif defined(JS_CODEGEN_RISCV64)
-@@ -788,6 +796,13 @@ void BaseCompiler::executeStore(MemoryAccessDesc* access, AccessCheck* check,
-   } else {
-     masm.wasmStore(*access, src.any(), memoryBase, ptr, ptr);
-   }
-+#elif defined(JS_CODEGEN_PPC64)
-+  MOZ_ASSERT(temp.isInvalid());
-+  if (access->type() == Scalar::Int64) {
-+    masm.wasmStoreI64(*access, src.i64(), memoryBase, ptr, ptr);
-+  } else {
-+    masm.wasmStore(*access, src.any(), memoryBase, ptr, ptr);
-+  }
- #else
-   MOZ_CRASH("BaseCompiler platform hook: store");
- #endif
-@@ -812,7 +827,7 @@ void BaseCompiler::store(MemoryAccessDesc* access, AccessCheck* check,
-                       maybeFromI64(temp));
- #elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) ||    \
-     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   return executeStore(access, check, instance, memoryBase, RegI32(ptr.reg), src,
-                       maybeFromI64(temp));
- #else
-@@ -1295,7 +1310,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rv, const Temps& temps) {
-   bc->freeI32(temps.t0);
- }
- 
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-+    defined(JS_CODEGEN_PPC64)
- 
- struct Temps {
-   RegI32 t0, t1, t2;
-@@ -1504,7 +1520,7 @@ static void Deallocate(BaseCompiler* bc, AtomicOp op, RegI64 rv, RegI64 temp) {
- }
- 
- #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64)
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
- 
- static void PopAndAllocate(BaseCompiler* bc, AtomicOp op, RegI64* rd,
-                            RegI64* rv, RegI64* temp) {
-@@ -1678,7 +1694,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rv, const Temps&) {
-   bc->freeI32(rv);
- }
- 
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-+    defined(JS_CODEGEN_PPC64)
- 
- struct Temps {
-   RegI32 t0, t1, t2;
-@@ -1844,7 +1861,7 @@ static void Deallocate(BaseCompiler* bc, RegI64 rd, RegI64 rv) {
- }
- 
- #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64)
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
- 
- static void PopAndAllocate(BaseCompiler* bc, RegI64* rd, RegI64* rv) {
-   *rv = bc->popI64();
-@@ -2017,7 +2034,8 @@ static void Deallocate(BaseCompiler* bc, RegI32 rexpect, RegI32 rnew,
-   bc->freeI32(rexpect);
- }
- 
--#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64)
-+#elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-+    defined(JS_CODEGEN_PPC64)
- 
- struct Temps {
-   RegI32 t0, t1, t2;
-@@ -2287,7 +2305,7 @@ static void Deallocate(BaseCompiler* bc, RegI64 rexpect, RegI64 rnew) {
- }
- 
- #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64)
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_PPC64)
- 
- template <typename RegAddressType>
- static void PopAndAllocate(BaseCompiler* bc, RegI64* rexpect, RegI64* rnew,
-@@ -2885,6 +2903,11 @@ void BaseCompiler::loadExtend(MemoryAccessDesc* access, Scalar::Type viewType) {
-   RegI64 rs = popI64();
-   RegV128 rd = needV128();
-   masm.moveGPR64ToDouble(rs, rd);
-+#ifdef JS_CODEGEN_PPC64
-+  // mtvsrd places value in BE dw0 (= LE dw1). widenLow* operates on LE dw0.
-+  // Swap dwords to move loaded data to the correct half.
-+  masm.as_xxpermdi(rd, rd, rd, 2);
-+#endif
-   switch (viewType) {
-     case Scalar::Int8:
-       masm.widenLowInt8x16(rd, rd);
-diff --git a/js/src/wasm/WasmBCRegDefs.h b/js/src/wasm/WasmBCRegDefs.h
-index bb84f0863de2..fd37bd464f39 100644
---- a/js/src/wasm/WasmBCRegDefs.h
-+++ b/js/src/wasm/WasmBCRegDefs.h
-@@ -118,6 +118,13 @@ static constexpr Register RabaldrScratchI32 = CallTempReg2;
- static constexpr Register RabaldrScratchI32 = CallTempReg2;
- #endif
- 
-+#ifdef JS_CODEGEN_PPC64
-+#  define RABALDR_SCRATCH_I32
-+// Use r25 (callee-saved, non-arg, not used by any wasm infrastructure)
-+// instead of CallTempReg2 (r10) which is IntArgReg7.
-+static constexpr Register RabaldrScratchI32 = r25;
-+#endif
-+
- #ifdef RABALDR_SCRATCH_F32_ALIASES_F64
- #  if !defined(RABALDR_SCRATCH_F32) || !defined(RABALDR_SCRATCH_F64)
- #    error "Bad configuration"
-@@ -386,8 +393,9 @@ struct SpecificRegs {
- 
-   SpecificRegs() : abiReturnRegI64(ReturnReg64) {}
- };
--#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||  \
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
-+    defined(JS_CODEGEN_PPC64)
- struct SpecificRegs {
-   // Required by gcc.
-   SpecificRegs() {}
-diff --git a/js/src/wasm/WasmBaselineCompile.cpp b/js/src/wasm/WasmBaselineCompile.cpp
-index 2af7ad7f583b..c57180dd362b 100644
---- a/js/src/wasm/WasmBaselineCompile.cpp
-+++ b/js/src/wasm/WasmBaselineCompile.cpp
-@@ -376,11 +376,15 @@ void BaseCompiler::tableSwitch(Label* theTable, RegI32 switchValue,
-   masm.ma_ldr(DTRAddr(scratch, DtrRegImmShift(switchValue, LSL, 2)), pc, Offset,
-               Assembler::Always);
- #elif defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   ScratchI32 scratch(*this);
-   CodeLabel tableCl;
- 
-+#  if defined(JS_CODEGEN_PPC64)
-+  masm.mov(&tableCl, scratch);
-+#  else
-   masm.ma_li(scratch, &tableCl);
-+#  endif
- 
-   tableCl.target()->bind(theTable->offset());
-   masm.addCodeLabel(tableCl);
-@@ -898,7 +902,7 @@ void BaseCompiler::insertBreakablePoint(CallSiteKind kind) {
-   masm.append(CallSiteDesc(iter_.lastOpcodeOffset(), kind),
-               CodeOffset(masm.currentOffset()));
- #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   ScratchPtr scratch(*this);
-   Label L;
-   masm.loadPtr(Address(InstanceReg, Instance::offsetOfDebugStub()), scratch);
-@@ -972,7 +976,7 @@ void BaseCompiler::insertPerFunctionDebugStub() {
-     masm.ma_bx(lr, Assembler::Zero);
-   }
- #elif defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   {
-     ScratchPtr scratch(*this);
- 
-@@ -1403,7 +1407,7 @@ void BaseCompiler::popStackResults(ABIResultIter& iter, StackHeight stackBase) {
-     switch (v.kind()) {
-       case Stk::ConstI32:
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-         fr.storeImmediatePtrToStack(v.i32val_, resultHeight, temp);
- #else
-         fr.storeImmediatePtrToStack(uint32_t(v.i32val_), resultHeight, temp);
-@@ -1723,6 +1727,13 @@ void BaseCompiler::passArg(ValType type, const Stk& arg, FunctionCall* call) {
-                                       argLoc.offsetFromArgBase()));
-       } else {
-         loadI32(arg, RegI32(argLoc.gpr()));
-+#ifdef JS_CODEGEN_PPC64
-+        // addi can sign-extend, which yields wrong values when the C++
-+        // callee expects a uint32_t. Clear the upper 32 bits.
-+        if (call->abiKind == ABIKind::System) {
-+          masm.as_rldicl(argLoc.gpr(), argLoc.gpr(), 0, 32);
-+        }
-+#endif
-       }
-       break;
-     }
-@@ -2372,9 +2383,10 @@ void BaseCompiler::finishTryNote(size_t tryNoteIndex) {
- RegI32 BaseCompiler::needRotate64Temp() {
- #if defined(JS_CODEGEN_X86)
-   return needI32();
--#elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||    \
--    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#elif defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||       \
-+    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||    \
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
-+    defined(JS_CODEGEN_PPC64)
-   return RegI32::Invalid();
- #else
-   MOZ_CRASH("BaseCompiler platform hook: needRotate64Temp");
-@@ -2433,6 +2445,8 @@ void BaseCompiler::popAndAllocateForMulI64(RegI64* r0, RegI64* r1,
-   pop2xI64(r0, r1);
- #elif defined(JS_CODEGEN_RISCV64)
-   pop2xI64(r0, r1);
-+#elif defined(JS_CODEGEN_PPC64)
-+  pop2xI64(r0, r1);
- #else
-   MOZ_CRASH("BaseCompiler porting interface: popAndAllocateForMulI64");
- #endif
-@@ -2866,6 +2880,9 @@ static RegI32 PopcntTemp(BaseCompiler& bc) {
-     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
-     defined(JS_CODEGEN_RISCV64)
-   return bc.needI32();
-+#elif defined(JS_CODEGEN_PPC64)
-+  // PPC64 has native popcntd/popcntw; no temp register needed.
-+  return RegI32::Invalid();
- #else
-   MOZ_CRASH("BaseCompiler platform hook: PopcntTemp");
- #endif
-@@ -9362,6 +9379,11 @@ static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
-                      RegV128 temp1, RegV128 temp2) {
-   masm.mulInt64x2(rsd, rs, rsd, temp1, temp2);
- }
-+#  elif defined(JS_CODEGEN_PPC64)
-+static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
-+                     RegV128 temp1, RegV128 temp2) {
-+  masm.mulInt64x2(rsd, rs, rsd, temp1, temp2);
-+}
- #  endif
- 
- static void MulF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-@@ -9376,7 +9398,8 @@ static void DivF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-   masm.divFloat64x2(rsd, rs, rsd);
- }
- 
--#  if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
-+#  if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || \
-+      defined(JS_CODEGEN_PPC64)
- static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
-                      RegV128 temp1, RegV128 temp2) {
-   masm.minFloat32x4(rsd, rs, rsd, temp1, temp2);
-@@ -9397,6 +9420,22 @@ static void MaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
-   masm.maxFloat64x2(rsd, rs, rsd, temp1, temp2);
- }
- 
-+#  if defined(JS_CODEGEN_PPC64)
-+// PPC64: use non-RhsDestOp convention (first=rhs, second=lhsDest),
-+// matching the pseudoMin/Max function signature.
-+static void PMinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-+  masm.pseudoMinFloat32x4(rs, rsd);
-+}
-+static void PMinF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-+  masm.pseudoMinFloat64x2(rs, rsd);
-+}
-+static void PMaxF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-+  masm.pseudoMaxFloat32x4(rs, rsd);
-+}
-+static void PMaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-+  masm.pseudoMaxFloat64x2(rs, rsd);
-+}
-+#  else
- static void PMinF32x4(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
-                       RhsDestOp) {
-   masm.pseudoMinFloat32x4(rsd, rs);
-@@ -9416,6 +9455,7 @@ static void PMaxF64x2(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
-                       RhsDestOp) {
-   masm.pseudoMaxFloat64x2(rsd, rs);
- }
-+#  endif
- #  elif defined(JS_CODEGEN_ARM64)
- static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
-   masm.minFloat32x4(rs, rsd);
-@@ -9806,6 +9846,68 @@ static void ShiftRightI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-   masm.rightShiftInt64x2(rsd, temp, rsd);
- }
- 
-+static void ShiftRightUI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                             RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I64x2ShrU, rs, temp);
-+  masm.unsignedRightShiftInt64x2(rsd, temp, rsd);
-+}
-+#  elif defined(JS_CODEGEN_PPC64)
-+// PPC64: same as ARM64 pattern (shift amount in GPR, result in vector reg)
-+static void ShiftLeftI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                           RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I8x16Shl, rs, temp);
-+  masm.leftShiftInt8x16(rsd, temp, rsd);
-+}
-+static void ShiftLeftI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                           RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I16x8Shl, rs, temp);
-+  masm.leftShiftInt16x8(rsd, temp, rsd);
-+}
-+static void ShiftLeftI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                           RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I32x4Shl, rs, temp);
-+  masm.leftShiftInt32x4(rsd, temp, rsd);
-+}
-+static void ShiftLeftI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                           RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I64x2Shl, rs, temp);
-+  masm.leftShiftInt64x2(rsd, temp, rsd);
-+}
-+static void ShiftRightI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                            RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I8x16ShrS, rs, temp);
-+  masm.rightShiftInt8x16(rsd, temp, rsd);
-+}
-+static void ShiftRightUI8x16(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                             RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I8x16ShrU, rs, temp);
-+  masm.unsignedRightShiftInt8x16(rsd, temp, rsd);
-+}
-+static void ShiftRightI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                            RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I16x8ShrS, rs, temp);
-+  masm.rightShiftInt16x8(rsd, temp, rsd);
-+}
-+static void ShiftRightUI16x8(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                             RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I16x8ShrU, rs, temp);
-+  masm.unsignedRightShiftInt16x8(rsd, temp, rsd);
-+}
-+static void ShiftRightI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                            RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I32x4ShrS, rs, temp);
-+  masm.rightShiftInt32x4(rsd, temp, rsd);
-+}
-+static void ShiftRightUI32x4(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                             RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I32x4ShrU, rs, temp);
-+  masm.unsignedRightShiftInt32x4(rsd, temp, rsd);
-+}
-+static void ShiftRightI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-+                            RegI32 temp) {
-+  ShiftOpMask(masm, SimdOp::I64x2ShrS, rs, temp);
-+  masm.rightShiftInt64x2(rsd, temp, rsd);
-+}
- static void ShiftRightUI64x2(MacroAssembler& masm, RegI32 rs, RegV128 rsd,
-                              RegI32 temp) {
-   ShiftOpMask(masm, SimdOp::I64x2ShrU, rs, temp);
-@@ -10107,6 +10209,23 @@ static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd,
-   masm.bitmaskInt32x4(rs, rd, temp);
- }
- 
-+static void BitmaskI64x2(MacroAssembler& masm, RegV128 rs, RegI32 rd,
-+                         RegV128 temp) {
-+  masm.bitmaskInt64x2(rs, rd, temp);
-+}
-+#  elif defined(JS_CODEGEN_PPC64)
-+static void BitmaskI8x16(MacroAssembler& masm, RegV128 rs, RegI32 rd,
-+                         RegV128 temp) {
-+  masm.bitmaskInt8x16(rs, rd, temp);
-+}
-+static void BitmaskI16x8(MacroAssembler& masm, RegV128 rs, RegI32 rd,
-+                         RegV128 temp) {
-+  masm.bitmaskInt16x8(rs, rd, temp);
-+}
-+static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd,
-+                         RegV128 temp) {
-+  masm.bitmaskInt32x4(rs, rd, temp);
-+}
- static void BitmaskI64x2(MacroAssembler& masm, RegV128 rs, RegI32 rd,
-                          RegV128 temp) {
-   masm.bitmaskInt64x2(rs, rd, temp);
-@@ -10182,6 +10301,13 @@ static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
-   masm.bitwiseSelectSimd128(lhsDest, rhs, temp);
-   masm.moveSimd128(temp, lhsDest);
- }
-+#  elif defined(JS_CODEGEN_PPC64)
-+static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
-+                          RegV128 lhsDest, RegV128 temp) {
-+  masm.moveSimd128(control, temp);
-+  masm.bitwiseSelectSimd128(lhsDest, rhs, temp);
-+  masm.moveSimd128(temp, lhsDest);
-+}
- #  endif
- 
- #  ifdef ENABLE_WASM_RELAXED_SIMD
-@@ -10257,7 +10383,7 @@ void BaseCompiler::emitDotI8x16I7x16AddS() {
-   RegV128 rsd = popV128();
-   RegV128 rs0, rs1;
-   pop2xV128(&rs0, &rs1);
--#    if defined(JS_CODEGEN_ARM64)
-+#    if defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
-   RegV128 temp = needV128();
-   masm.dotInt8x16Int7x16ThenAdd(rs0, rs1, rsd, temp);
-   freeV128(temp);
-@@ -10463,7 +10589,7 @@ bool BaseCompiler::emitVectorLaneSelect() {
-   freeV128(lhs);
-   freeV128(mask);
-   pushV128(rhsDest);
--#    elif defined(JS_CODEGEN_ARM64)
-+#    elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_PPC64)
-   RegV128 maskDest = popV128();
-   RegV128 rhs = popV128();
-   RegV128 lhs = popV128();
-@@ -12628,7 +12754,7 @@ bool js::wasm::BaselinePlatformSupport() {
- #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||        \
-     defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) ||      \
-     defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   return true;
- #else
-   return false;
-diff --git a/js/src/wasm/WasmCodegenConstants.h b/js/src/wasm/WasmCodegenConstants.h
-index 9c10d307ae6f..e25332b5464e 100644
---- a/js/src/wasm/WasmCodegenConstants.h
-+++ b/js/src/wasm/WasmCodegenConstants.h
-@@ -43,7 +43,8 @@ static const unsigned InterpFailInstanceReg = 0xbad;
- // The following thresholds were derived from a microbenchmark. If we begin to
- // ship this optimization for more platforms, we will need to extend this list.
- 
--#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64)
-+#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
-+    defined(JS_CODEGEN_PPC64)
- static const uint32_t MaxInlineMemoryCopyLength = 64;
- static const uint32_t MaxInlineMemoryFillLength = 64;
- #elif defined(JS_CODEGEN_X86)
-diff --git a/js/src/wasm/WasmCodegenTypes.cpp b/js/src/wasm/WasmCodegenTypes.cpp
-index 8b9f32639ea3..e906c4afecc3 100644
---- a/js/src/wasm/WasmCodegenTypes.cpp
-+++ b/js/src/wasm/WasmCodegenTypes.cpp
-@@ -144,14 +144,15 @@ void TrapSitesForKind::checkInvariants(const uint8_t* codeBase) const {
-     last = pcOffset;
-   }
- 
--#  if (defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||   \
--       defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM) || \
--       defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64))
-+#  if (defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||        \
-+       defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM) ||      \
-+       defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_MIPS64) || \
-+       defined(JS_CODEGEN_PPC64))
-   // Check that each trapsite is associated with a plausible instruction.  The
-   // required instruction kind depends on the trapsite kind.
-   //
--  // NOTE: currently enabled on x86_{32,64}, arm{32,64}, loongson64 and mips64.
--  // Ideally it should be extended to riscv64 too.
-+  // NOTE: currently enabled on x86_{32,64}, arm{32,64}, loongson64, mips64,
-+  // and ppc64. Ideally it should be extended to riscv64 too.
-   //
-   for (uint32_t i = 0; i < length(); i++) {
-     uint32_t pcOffset = pcOffsets_[i];
-diff --git a/js/src/wasm/WasmCompile.cpp b/js/src/wasm/WasmCompile.cpp
-index 051c60ebaa55..89447aa668ff 100644
---- a/js/src/wasm/WasmCompile.cpp
-+++ b/js/src/wasm/WasmCompile.cpp
-@@ -71,8 +71,9 @@ uint32_t wasm::ObservedCPUFeatures() {
-     ARM64 = 0x6,
-     LOONG64 = 0x7,
-     RISCV64 = 0x8,
-+    PPC64 = 0x9,
- 
--    LAST = RISCV64,
-+    LAST = PPC64,
-     ARCH_BITS = 4
-   };
- 
-@@ -101,6 +102,9 @@ uint32_t wasm::ObservedCPUFeatures() {
- #elif defined(JS_CODEGEN_RISCV64)
-   MOZ_ASSERT(jit::GetRISCV64Flags() <= (UINT32_MAX >> ARCH_BITS));
-   return RISCV64 | (jit::GetRISCV64Flags() << ARCH_BITS);
-+#elif defined(JS_CODEGEN_PPC64)
-+  MOZ_ASSERT(jit::GetPPC64Flags() <= (UINT32_MAX >> ARCH_BITS));
-+  return PPC64 | (jit::GetPPC64Flags() << ARCH_BITS);
- #elif defined(JS_CODEGEN_NONE) || defined(JS_CODEGEN_WASM32)
-   return 0;
- #else
-diff --git a/js/src/wasm/WasmFrameIter.cpp b/js/src/wasm/WasmFrameIter.cpp
-index b3b264bc625a..b540acf9a05d 100644
---- a/js/src/wasm/WasmFrameIter.cpp
-+++ b/js/src/wasm/WasmFrameIter.cpp
-@@ -622,6 +622,19 @@ static const unsigned PushedFP = 16;
- static const unsigned SetFP = 20;
- static const unsigned PoppedFP = 4;
- static const unsigned PoppedFPJitEntry = 8;
-+#elif defined(JS_CODEGEN_PPC64)
-+// pushReturnAddress = mflr(4) + stdu(4) = 8 bytes.
-+// push(FP) = stdu(4) = 4 bytes (PPC64 stdu is a single DS-form instruction).
-+// moveStackPtrTo = mr(4) = 4 bytes.
-+static const unsigned PushedRetAddr = 8;
-+static const unsigned PushedFP = 12;
-+static const unsigned SetFP = 16;
-+// Callable + jit-entry epilogues between poppedFP and *ret are:
-+//   mtlr r0; addi sp, sp, 16  (two 4-byte instructions — 8 bytes).
-+// mtlr must come before addi so LR holds the caller's RA throughout the
-+// post-poppedFP window (single-step profiling fires every instruction).
-+static const unsigned PoppedFP = 8;
-+static const unsigned PoppedFPJitEntry = 8;
- #elif defined(JS_CODEGEN_NONE) || defined(JS_CODEGEN_WASM32)
- // Synthetic values to satisfy asserts and avoid compiler warnings.
- static const unsigned PushedRetAddr = 0;
-@@ -710,6 +723,17 @@ static void GenerateCallablePrologue(MacroAssembler& masm, uint32_t* entry) {
-     masm.moveStackPtrTo(FramePointer);
-     MOZ_ASSERT_IF(!masm.oom(), SetFP == masm.currentOffset() - *entry);
-   }
-+#elif defined(JS_CODEGEN_PPC64)
-+  {
-+    *entry = masm.currentOffset();
-+
-+    masm.pushReturnAddress();
-+    MOZ_ASSERT_IF(!masm.oom(), PushedRetAddr == masm.currentOffset() - *entry);
-+    masm.push(FramePointer);
-+    MOZ_ASSERT_IF(!masm.oom(), PushedFP == masm.currentOffset() - *entry);
-+    masm.moveStackPtrTo(FramePointer);
-+    MOZ_ASSERT_IF(!masm.oom(), SetFP == masm.currentOffset() - *entry);
-+  }
- #elif defined(JS_CODEGEN_ARM64)
-   {
-     // We do not use the PseudoStackPointer.  However, we may be called in a
-@@ -803,6 +827,38 @@ static void GenerateCallableEpilogue(MacroAssembler& masm, unsigned framePushed,
-     masm.jalr(zero, ra, 0);
-     masm.nop();
-   }
-+#elif defined(JS_CODEGEN_PPC64)
-+  // Load RA and FP from the Frame while it's still on the stack.
-+  // Using r0 (js::jit::r0) for RA is safe: it's volatile, used as
-+  // RT (not base), and we're in an epilogue where it's not live.
-+  masm.loadPtr(Address(StackPointer, Frame::returnAddressOffset()),
-+               js::jit::r0);
-+  masm.loadPtr(Address(StackPointer, Frame::callerFPOffset()), FramePointer);
-+
-+  // Fence the pool BEFORE capturing poppedFP. PoppedFP is a static 8
-+  // (mtlr + addi); enterNoPool itself can emit insertNopFill() and a
-+  // preemptive finishPool() at its top edge, so any leading insertions
-+  // must land before poppedFP — not between poppedFP and *ret. A pool
-+  // flush inside the 2-insn window would otherwise extend *ret - poppedFP
-+  // and trip the post-condition assertion below. P9 routes FP constants
-+  // through the pool so flushes are more frequent than on P8 (the
-+  // assertion was historically silent on P8 but reproducible on P9 dbgopt).
-+  masm.enterNoPool(2);
-+  poppedFP = masm.currentOffset();
-+
-+  // Move RA into LR BEFORE popping the Frame. If the order were addi/mtlr,
-+  // single-step profiling firing at the mtlr instruction would see: sp
-+  // already moved (so saved RA at sp[8] is gone), addi already executed,
-+  // and LR still holding the address right after the function's last `bl`
-+  // (i.e. inside this function, not the caller's RA). With mtlr first,
-+  // the caller's RA is available throughout the post-poppedFP window,
-+  // either via sp[8] (pre-addi) or registers.lr (post-mtlr).
-+  masm.xs_mtlr(js::jit::r0);
-+  masm.addToStackPtr(Imm32(sizeof(Frame)));
-+  *ret = masm.currentOffset();
-+  masm.leaveNoPool();
-+  masm.as_blr();
-+
- #elif defined(JS_CODEGEN_ARM64)
- 
-   // See comment at equivalent place in |GenerateCallablePrologue| above.
-@@ -1483,6 +1539,9 @@ void wasm::GenerateJitEntryPrologue(MacroAssembler& masm,
-     BlockTrampolinePoolScope block_trampoline_pool(&masm, 10);
-     offsets->begin = masm.currentOffset();
-     masm.push(ra);
-+#elif defined(JS_CODEGEN_PPC64)
-+    offsets->begin = masm.currentOffset();
-+    masm.pushReturnAddress();
- #elif defined(JS_CODEGEN_ARM64)
-     {
-       AutoForbidPoolsAndNops afp(&masm,
-@@ -1536,6 +1595,20 @@ void wasm::GenerateJitEntryEpilogue(MacroAssembler& masm,
-     masm.Ret(ARMRegister(lr, 64));
-     masm.setFramePushed(0);
-   }
-+#elif defined(JS_CODEGEN_PPC64)
-+  // Load RA and FP from the frame while it's still on the stack, then
-+  // restore LR, pop the frame, and return. mtlr must precede addi so LR
-+  // holds the caller's RA across the whole post-poppedFP window (see
-+  // GenerateCallableEpilogue for the matching rationale).
-+  masm.loadPtr(Address(StackPointer, Frame::returnAddressOffset()),
-+               js::jit::r0);
-+  masm.loadPtr(Address(StackPointer, Frame::callerFPOffset()), FramePointer);
-+  poppedFP = masm.currentOffset();
-+
-+  masm.xs_mtlr(js::jit::r0);
-+  masm.addToStackPtr(Imm32(sizeof(Frame)));
-+  offsets->ret = masm.currentOffset();
-+  masm.as_blr();
- #else
-   // Forbid pools for the same reason as described in GenerateCallablePrologue.
- #  if defined(JS_CODEGEN_ARM)
-@@ -1905,6 +1978,22 @@ bool js::wasm::StartUnwinding(const RegisterState& registers,
-         fixedFP = fp;
-         AssertMatchesCallSite(fixedPC, fixedFP);
-       } else
-+#elif defined(JS_CODEGEN_PPC64)
-+      if (codeRange->isThunk()) {
-+        // The FarJumpIsland sequence temporarily scrambles the link register.
-+        fixedPC = pc;
-+        fixedFP = fp;
-+        *unwoundCaller = false;
-+        AssertMatchesCallSite(
-+            Frame::fromUntaggedWasmExitFP(fp)->returnAddress(),
-+            Frame::fromUntaggedWasmExitFP(fp)->rawCaller());
-+      } else if (offsetFromEntry < PushedFP) {
-+        // On PPC64 the return address is in LR (registers.lr) until
-+        // pushReturnAddress() saves it to the stack.
-+        fixedPC = (uint8_t*)registers.lr;
-+        fixedFP = fp;
-+        AssertMatchesCallSite(fixedPC, fixedFP);
-+      } else
- #elif defined(JS_CODEGEN_ARM64)
-       if (offsetFromEntry < SetFP || codeRange->isThunk()) {
-         // On ARM64 we rely on register state instead of state saved on
-@@ -1956,6 +2045,35 @@ bool js::wasm::StartUnwinding(const RegisterState& registers,
-         fixedPC = Frame::fromUntaggedWasmExitFP(sp)->returnAddress();
-         fixedFP = fp;
-         AssertMatchesCallSite(fixedPC, fixedFP);
-+#elif defined(JS_CODEGEN_PPC64)
-+      } else if (offsetInCode >= codeRange->ret() - PoppedFP &&
-+                 offsetInCode < codeRange->ret()) {
-+        // PPC64 epilogue (RA loaded into r0, FP restored, SP not yet
-+        // adjusted; RA may or may not have been moved to LR yet):
-+        //   ld r0, 8(sp)      ; restore caller's RA into r0
-+        //   ld FP, 0(sp)      ; restore caller's FP
-+        //   <-- poppedFP -->
-+        //   mtlr r0           ; LR := caller's RA
-+        //   addi sp, sp, 16   ; pop the Frame
-+        //   <-- ret -->
-+        //   blr
-+        // In the [poppedFP, ret) window the addi has not run, so *sp
-+        // is still the saved Frame and sp[8] is the caller's RA.
-+        // (registers.lr would also be correct after mtlr executes, but
-+        // sp[8] is valid throughout this window — including before mtlr —
-+        // so we read it consistently.)
-+        MOZ_ASSERT(*sp == fp);
-+        fixedPC = Frame::fromUntaggedWasmExitFP(sp)->returnAddress();
-+        fixedFP = fp;
-+        AssertMatchesCallSite(fixedPC, fixedFP);
-+      } else if (offsetInCode == codeRange->ret()) {
-+        // PPC64 epilogue, at the blr: addi has run, so SP is the
-+        // caller's and *sp is unrelated memory. mtlr ran earlier in
-+        // the [poppedFP, ret) window, so LR holds the caller's RA.
-+        // fp holds the restored caller's FP.
-+        fixedPC = (uint8_t*)registers.lr;
-+        fixedFP = fp;
-+        AssertMatchesCallSite(fixedPC, fixedFP);
- #elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_LOONG64)
-         // The stack pointer does not move until all values have
-         // been restored so several cases can be coalesced here.
-diff --git a/js/src/wasm/WasmGC.cpp b/js/src/wasm/WasmGC.cpp
-index e59cd4f5aba0..21cd01fd1c5e 100644
---- a/js/src/wasm/WasmGC.cpp
-+++ b/js/src/wasm/WasmGC.cpp
-@@ -444,6 +444,14 @@ bool wasm::IsPlausibleStackMapKey(const uint8_t* nextPC) {
-             insn[-1] == 0x00000013 /* addi zero, zero, 0 */) ||  // jal; nop
-            (insn[-1] == 0x00100073 &&
-             (insn[-2] & kITypeMask) == RO_CSRRWI)));  // wasm trap
-+#  elif defined(JS_CODEGEN_PPC64)
-+  const uint32_t* insn = reinterpret_cast<const uint32_t*>(nextPC);
-+  MOZ_ASSERT((uintptr_t(insn) & 3) == 0);
-+  // xs_trap() = tw 31,r0,r0 (PPC_trap); bctrl = PPC_bctr|1; bl = I-form
-+  // opcode 18 (PPC_b) with LK=1, AA=0, checked via 0xFC000003 mask.
-+  return insn[-1] == uint32_t(PPC_trap) ||
-+         insn[-1] == (uint32_t(PPC_bctr) | 1u) ||
-+         (insn[-1] & 0xFC000003u) == (uint32_t(PPC_b) | 1u);
- #  else
-   MOZ_CRASH("IsValidStackMapKey: requires implementation on this platform");
- #  endif
-diff --git a/js/src/wasm/WasmGenerator.cpp b/js/src/wasm/WasmGenerator.cpp
-index 2dafac73e96a..07ffe150fc57 100644
---- a/js/src/wasm/WasmGenerator.cpp
-+++ b/js/src/wasm/WasmGenerator.cpp
-@@ -930,7 +930,23 @@ bool ModuleGenerator::finishCodeBlock(CodeBlockResult* result) {
-   callSiteTargets_.clear();
-   callFarJumps_.clear();
- 
--  // None of the linking or far-jump operations should emit masm metadata.
-+  // None of the linking or far-jump operations should emit masm metadata,
-+  // except on PPC64 where patchFarJump uses addLongJump to create CodeLabels
-+  // for absolute-address far jumps. Drain those into linkData_ here.
-+#ifdef JS_CODEGEN_PPC64
-+  for (const jit::CodeLabel& codeLabel : masm_->codeLabels()) {
-+    LinkData::InternalLink link;
-+    link.patchAtOffset = codeLabel.patchAt().offset();
-+    link.targetOffset = codeLabel.target().offset();
-+#  ifdef JS_CODELABEL_LINKMODE
-+    link.mode = codeLabel.linkMode();
-+#  endif
-+    if (!linkData_->internalLinks.append(link)) {
-+      return false;
-+    }
-+  }
-+  masm_->codeLabels().clear();
-+#endif
- 
-   MOZ_ASSERT(masm_->inliningContext().empty());
-   MOZ_ASSERT(masm_->callSites().empty());
-diff --git a/js/src/wasm/WasmIonCompile.cpp b/js/src/wasm/WasmIonCompile.cpp
-index 9c79b9cf0704..0d0e661770af 100644
---- a/js/src/wasm/WasmIonCompile.cpp
-+++ b/js/src/wasm/WasmIonCompile.cpp
-@@ -11602,7 +11602,7 @@ bool js::wasm::IonPlatformSupport() {
- #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86) ||       \
-     defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) ||    \
-     defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-   return true;
- #else
-   return false;
-diff --git a/js/src/wasm/WasmMemory.cpp b/js/src/wasm/WasmMemory.cpp
-index 0e3e6d3509ad..feee9f6ea1c9 100644
---- a/js/src/wasm/WasmMemory.cpp
-+++ b/js/src/wasm/WasmMemory.cpp
-@@ -288,9 +288,9 @@ static_assert(MaxMemoryAccessSize <= HugeUnalignedGuardPage,
- static_assert(HugeOffsetGuardLimit < UINT32_MAX,
-               "checking for overflow against OffsetGuardLimit is enough.");
- 
--// We have only tested huge memory on x64, arm64 and riscv64.
-+// We have only tested huge memory on x64, arm64, riscv64 and ppc64.
- #  if !(defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM64) || \
--        defined(JS_CODEGEN_RISCV64))
-+        defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64))
- #    error "Not an expected configuration"
- #  endif
- 
-diff --git a/js/src/wasm/WasmSignalHandlers.cpp b/js/src/wasm/WasmSignalHandlers.cpp
-index cc8bc2755745..84d3c4ec164d 100644
---- a/js/src/wasm/WasmSignalHandlers.cpp
-+++ b/js/src/wasm/WasmSignalHandlers.cpp
-@@ -111,7 +111,9 @@ using namespace js::wasm;
- #    if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \
-         defined(__PPC64LE__)
- #      define R01_sig(p) ((p)->sc_frame.fixreg[1])
-+#      define R31_sig(p) ((p)->sc_frame.fixreg[31])
- #      define R32_sig(p) ((p)->sc_frame.srr0)
-+#      define R36_sig(p) ((p)->sc_frame.lr)
- #    endif
- #  elif defined(__linux__) || defined(__sun)
- #    if defined(__linux__)
-@@ -157,7 +159,9 @@ using namespace js::wasm;
- #    if defined(__linux__) && (defined(__ppc64__) || defined(__PPC64__) || \
-                                defined(__ppc64le__) || defined(__PPC64LE__))
- #      define R01_sig(p) ((p)->uc_mcontext.gp_regs[1])
-+#      define R31_sig(p) ((p)->uc_mcontext.gp_regs[31])
- #      define R32_sig(p) ((p)->uc_mcontext.gp_regs[32])
-+#      define R36_sig(p) ((p)->uc_mcontext.gp_regs[36])
- #    endif
- #    if defined(__linux__) && defined(__loongarch__)
- #      define EPC_sig(p) ((p)->uc_mcontext.__pc)
-@@ -200,7 +204,9 @@ using namespace js::wasm;
- #    if defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \
-         defined(__PPC64LE__)
- #      define R01_sig(p) ((p)->uc_mcontext.__gregs[_REG_R1])
-+#      define R31_sig(p) ((p)->uc_mcontext.__gregs[_REG_R31])
- #      define R32_sig(p) ((p)->uc_mcontext.__gregs[_REG_PC])
-+#      define R36_sig(p) ((p)->uc_mcontext.__gregs[_REG_LR])
- #    endif
- #  elif defined(__DragonFly__) || defined(__FreeBSD__) || \
-       defined(__FreeBSD_kernel__)
-@@ -234,7 +240,9 @@ using namespace js::wasm;
- #    if defined(__FreeBSD__) && (defined(__ppc64__) || defined(__PPC64__) || \
-                                  defined(__ppc64le__) || defined(__PPC64LE__))
- #      define R01_sig(p) ((p)->uc_mcontext.mc_gpr[1])
-+#      define R31_sig(p) ((p)->uc_mcontext.mc_gpr[31])
- #      define R32_sig(p) ((p)->uc_mcontext.mc_srr0)
-+#      define R36_sig(p) ((p)->uc_mcontext.mc_lr)
- #    endif
- #  elif defined(XP_DARWIN)
- #    define EIP_sig(p) ((p)->thread.uts.ts32.__eip)
-@@ -412,7 +420,8 @@ struct macos_aarch64_context {
-       defined(__PPC64LE__)
- #    define PC_sig(p) R32_sig(p)
- #    define SP_sig(p) R01_sig(p)
--#    define FP_sig(p) R01_sig(p)
-+#    define FP_sig(p) R31_sig(p)
-+#    define LR_sig(p) R36_sig(p)
- #  elif defined(__loongarch__)
- #    define PC_sig(p) EPC_sig(p)
- #    define FP_sig(p) RFP_sig(p)
-@@ -458,7 +467,8 @@ static uint8_t* ContextToSP(CONTEXT* context) {
- }
- 
- #  if defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
--      defined(__loongarch__) || defined(__riscv)
-+      defined(__loongarch__) || defined(__riscv) || \
-+      defined(__ppc64__) || defined(__PPC64__)
- static uint8_t* ContextToLR(CONTEXT* context) {
- #    ifdef LR_sig
-   return mozilla::BitwiseCast<uint8_t*>(LR_sig(context));
-@@ -475,7 +485,8 @@ static JS::ProfilingFrameIterator::RegisterState ToRegisterState(
-   state.pc = ContextToPC(context);
-   state.sp = ContextToSP(context);
- #  if defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
--      defined(__loongarch__) || defined(__riscv)
-+      defined(__loongarch__) || defined(__riscv) || \
-+      defined(__ppc64__) || defined(__PPC64__)
-   state.lr = ContextToLR(context);
- #  else
-   state.lr = (void*)UINTPTR_MAX;
-@@ -776,6 +787,9 @@ static void MachExceptionHandlerThread() {
- 
- #    if defined(__mips__) || defined(__loongarch__)
- static const uint32_t kWasmTrapSignal = SIGFPE;
-+#    elif defined(__ppc64__) || defined(__PPC64__) || \ 
-+          defined(__ppc64le__) || defined(__PPC64LE__)
-+static const uint32_t kWasmTrapSignal = SIGTRAP;
- #    else
- static const uint32_t kWasmTrapSignal = SIGILL;
- #    endif
-diff --git a/js/src/wasm/WasmStacks.cpp b/js/src/wasm/WasmStacks.cpp
-index 71497353c5c1..6514d8b0e2e4 100644
---- a/js/src/wasm/WasmStacks.cpp
-+++ b/js/src/wasm/WasmStacks.cpp
-@@ -426,6 +426,30 @@ static constexpr size_t ContStackMaxJitStackSize = 10 * 1024 * 1024;
- // or stack snapshots utilities.
- static constexpr size_t ContStackRedZoneSize = 0x8000;
- 
-+// Effective red-zone size used when laying out a continuation stack.
-+//
-+// The jit stack (and therefore the bottom guard page) must start on a page
-+// boundary; otherwise gc::ProtectPages trips MOZ_RELEASE_ASSERT(length %
-+// pageSize == 0). The red zone sits between the top guard page and the jit
-+// stack, so its size has to be a page multiple to keep that start aligned.
-+//
-+// Rounding the red zone up to a page is correct on every platform and would
-+// also cover any configuration whose page size exceeds ContStackRedZoneSize
-+// (32K) -- e.g. a 64K-page AArch64 kernel -- but ContStackRedZoneSize is
-+// already a multiple of the 4K/16K pages used on the tier-1 platforms, so the
-+// round-up is a no-op there today. We deliberately gate it to PPC64 (64K
-+// pages, where the round-up is load-bearing) so this patch cannot alter
-+// continuation stack layout on any tier-1 platform. Drop the gate if the
-+// general case is ever wanted.
-+static inline size_t ContStackEffectiveRedZoneSize(
-+    [[maybe_unused]] size_t pageSize) {
-+#ifdef JS_CODEGEN_PPC64
-+  return RoundUp(ContStackRedZoneSize, pageSize);
-+#else
-+  return ContStackRedZoneSize;
-+#endif
-+}
-+
- // Number of guard pages at the top and bottom of each continuation stack slot.
- static constexpr size_t ContStackTopGuardPages = 1;
- static constexpr size_t ContStackBottomGuardPages = 1;
-@@ -444,8 +468,8 @@ void ContStackSize::compute() {
-                          ContStackMinJitStackSize, ContStackMaxJitStackSize),
-               pageSize);
-   headerSize = RoundUp(sizeof(ContStack), pageSize);
--  totalSize = topGuardSize + ContStackRedZoneSize + jitStackSize +
--              bottomGuardSize + headerSize;
-+  totalSize = topGuardSize + ContStackEffectiveRedZoneSize(pageSize) +
-+              jitStackSize + bottomGuardSize + headerSize;
- 
-   // Assert we can't overflow when multiplying our size by capacity. Assume
-   // 32-bit integers to be conservative.
-@@ -467,7 +491,8 @@ void ContStack::init(ContStackArena* arena, uintptr_t allocationBase,
-   uintptr_t topGuardPagePhysicalStart = allocationBase;
-   uintptr_t topGuardPagePhysicalEnd = allocationBase + topGuardPageSize;
-   uintptr_t redZonePhysicalStart = topGuardPagePhysicalEnd;
--  uintptr_t jitStackPhysicalStart = redZonePhysicalStart + ContStackRedZoneSize;
-+  uintptr_t jitStackPhysicalStart =
-+      redZonePhysicalStart + ContStackEffectiveRedZoneSize(pageSize);
-   uintptr_t jitStackPhysicalEnd = jitStackPhysicalStart + jitStackSize;
-   uintptr_t bottomGuardPagePhysicalStart = jitStackPhysicalEnd;
-   uintptr_t headerPhysicalStart =
-diff --git a/js/src/wasm/WasmStubs.cpp b/js/src/wasm/WasmStubs.cpp
-index 8a98e201a452..8497814fcd37 100644
---- a/js/src/wasm/WasmStubs.cpp
-+++ b/js/src/wasm/WasmStubs.cpp
-@@ -646,8 +646,9 @@ static bool GenerateInterpEntry(MacroAssembler& masm, const FuncExport& fe,
- 
-   // Save the return address if it wasn't already saved by the call insn.
- #ifdef JS_USE_LINK_REGISTER
--#  if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) || \
--      defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#  if defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_MIPS64) ||      \
-+      defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
-+      defined(JS_CODEGEN_PPC64)
-   masm.pushReturnAddress();
- #  elif defined(JS_CODEGEN_ARM64)
-   // WasmPush updates framePushed() unlike pushReturnAddress(), but that's
-@@ -2123,9 +2124,10 @@ static bool GenerateImportInterpExit(MacroAssembler& masm, const FuncImport& fi,
-   // The native ABI preserves the instance, heap and global registers since they
-   // are non-volatile.
-   MOZ_ASSERT(NonVolatileRegs.has(InstanceReg));
--#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||      \
--    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) || \
--    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64)
-+#if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM) ||         \
-+    defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_MIPS64) ||    \
-+    defined(JS_CODEGEN_LOONG64) || defined(JS_CODEGEN_RISCV64) || \
-+    defined(JS_CODEGEN_PPC64)
-   MOZ_ASSERT(NonVolatileRegs.has(HeapReg));
- #endif
- 
-@@ -2571,6 +2573,15 @@ bool wasm::GenerateBuiltinThunk(MacroAssembler& masm, ABIFunctionType abiType,
-                         Register::FromCode(regId + 1));
-         }
-       }
-+#endif
-+#ifdef JS_CODEGEN_PPC64
-+      // PPC64 32-bit operations do not zero-extend to 64 bits (unlike
-+      // x86-64/ARM64/LA64). The ELFv2 ABI requires callers to zero/sign-extend
-+      // narrow args. Wasm i32 values may have garbage upper bits in 64-bit
-+      // registers, so zero-extend them before calling C++ builtins.
-+      if (selfArgs.mirType() == MIRType::Int32) {
-+        masm.move32ZeroExtendToPtr(selfArgs->gpr(), selfArgs->gpr());
-+      }
- #endif
-       continue;
-     }
-@@ -2659,6 +2670,28 @@ static const LiveRegisterSet RegsToPreserve(
- #  ifdef ENABLE_WASM_SIMD
- #    error "high lanes of SIMD registers need to be saved too."
- #  endif
-+#elif defined(JS_CODEGEN_PPC64)
-+// Exclude r0 (ScratchRegister, not allocatable, special addressing semantics),
-+// r1 (SP), r2 (TOC pointer, reserved), and r13 (TLS pointer, reserved).
-+static const LiveRegisterSet RegsToPreserve(
-+    GeneralRegisterSet(Registers::AllMask & ~((uint32_t(1) << Registers::r0) |
-+                                              (uint32_t(1) << Registers::r1) |
-+                                              (uint32_t(1) << Registers::r2) |
-+                                              (uint32_t(1) << Registers::r13))),
-+#  ifdef ENABLE_WASM_SIMD
-+    // Unlike ARM64, where the vector registers alias the doubles, PPC64
-+    // doubles live in the FPRs (VSR0-31) while wasm v128 values live in the
-+    // VRs (VSR32-63) -- two disjoint physical pools, so both must be
-+    // preserved. Saving only the doubles loses the entire live v128 state: a
-+    // trap firing while a v128 is live (notably the interrupt-check trap,
-+    // which fires constantly in hot loops) resumes with whatever the C++
-+    // handler's libc left in the VRs (e.g. glibc's vector memcpy leaves lvsl
-+    // alignment-control patterns in low VRs).
-+    FloatRegisterSet(FloatRegisters::AllDoubleMask |
-+                     FloatRegisters::AllSimd128Mask));
-+#  else
-+    FloatRegisterSet(FloatRegisters::AllDoubleMask));
-+#  endif
- #elif defined(JS_CODEGEN_ARM64)
- // We assume that traps do not happen while lr is live. This both ensures that
- // the size of RegsToPreserve is a multiple of 2 (preserving WasmStackAlignment)
-diff --git a/js/src/wasm/WasmSummarizeInsn.cpp b/js/src/wasm/WasmSummarizeInsn.cpp
-index 7bb4f4b7a725..2ae55a1b1b9e 100644
---- a/js/src/wasm/WasmSummarizeInsn.cpp
-+++ b/js/src/wasm/WasmSummarizeInsn.cpp
-@@ -1731,6 +1731,169 @@ Maybe<TrapMachineInsn> SummarizeTrapInstruction(const uint8_t* insnAddr) {
-   return Nothing();
- }
- 
-+// ================================================================== ppc64 ====
-+
-+#  elif defined(JS_CODEGEN_PPC64)
-+
-+Maybe<TrapMachineInsn> SummarizeTrapInstruction(const uint8_t* insnAddr) {
-+  MOZ_ASSERT(0 == (uintptr_t(insnAddr) & 3));
-+
-+  const uint32_t insn = *(uint32_t*)insnAddr;
-+  const uint32_t majorOp = insn >> 26;
-+  // X-form secondary opcode: bits 10..1.
-+  const uint32_t xo = (insn >> 1) & 0x3FF;
-+
-+  // PPC_trap = 0x7FE00008 = tw 31,0,0.
-+  if (insn == 0x7FE00008) {
-+    return Some(TrapMachineInsn::OfficialUD);
-+  }
-+
-+  // D-form / DS-form loads.
-+  switch (majorOp) {
-+    case 34:  // lbz
-+      return Some(TrapMachineInsn::Load8);
-+    case 40:  // lhz
-+    case 42:  // lha
-+      return Some(TrapMachineInsn::Load16);
-+    case 32:  // lwz
-+      return Some(TrapMachineInsn::Load32);
-+    case 58:  // ld (DS=0) / lwa (DS=2)
-+      if ((insn & 3) == 2) {
-+        return Some(TrapMachineInsn::Load32);  // lwa
-+      }
-+      return Some(TrapMachineInsn::Load64);  // ld
-+    case 48:                                 // lfs
-+      return Some(TrapMachineInsn::Load32);
-+    case 50:  // lfd
-+      return Some(TrapMachineInsn::Load64);
-+    default:
-+      break;
-+  }
-+
-+  // D-form / DS-form stores.
-+  switch (majorOp) {
-+    case 38:  // stb
-+      return Some(TrapMachineInsn::Store8);
-+    case 44:  // sth
-+      return Some(TrapMachineInsn::Store16);
-+    case 36:  // stw
-+    case 37:  // stwu
-+      return Some(TrapMachineInsn::Store32);
-+    case 52:  // stfs
-+      return Some(TrapMachineInsn::Store32);
-+    case 62:  // std (DS=0) / stdu (DS=1)
-+      return Some(TrapMachineInsn::Store64);
-+    case 54:  // stfd
-+    case 55:  // stfdu
-+      return Some(TrapMachineInsn::Store64);
-+    default:
-+      break;
-+  }
-+
-+  // X-form instructions (major opcode 31).
-+  if (majorOp == 31) {
-+    switch (xo) {
-+      // Indexed loads.
-+      case 87:  // lbzx
-+        return Some(TrapMachineInsn::Load8);
-+      case 279:  // lhzx
-+      case 343:  // lhax
-+        return Some(TrapMachineInsn::Load16);
-+      case 23:  // lwzx
-+        return Some(TrapMachineInsn::Load32);
-+      case 21:  // ldx
-+        return Some(TrapMachineInsn::Load64);
-+      case 535:  // lfsx
-+      case 855:  // lfiwax
-+      case 887:  // lfiwzx
-+        return Some(TrapMachineInsn::Load32);
-+      case 599:  // lfdx
-+        return Some(TrapMachineInsn::Load64);
-+      case 790:  // lhbrx (byte-reverse halfword)
-+        return Some(TrapMachineInsn::Load16);
-+      case 534:  // lwbrx (byte-reverse word)
-+        return Some(TrapMachineInsn::Load32);
-+
-+      // Indexed stores.
-+      case 215:  // stbx
-+        return Some(TrapMachineInsn::Store8);
-+      case 407:  // sthx
-+        return Some(TrapMachineInsn::Store16);
-+      case 151:  // stwx
-+        return Some(TrapMachineInsn::Store32);
-+      case 149:  // stdx
-+        return Some(TrapMachineInsn::Store64);
-+      case 663:  // stfsx
-+        return Some(TrapMachineInsn::Store32);
-+      case 727:  // stfdx
-+        return Some(TrapMachineInsn::Store64);
-+      case 918:  // sthbrx (byte-reverse halfword store)
-+        return Some(TrapMachineInsn::Store16);
-+      case 662:  // stwbrx (byte-reverse word store)
-+        return Some(TrapMachineInsn::Store32);
-+
-+      // VSX SIMD indexed load/store (XX1-form, same major opcode 31).
-+      case 268:  // lxvx (POWER9)
-+      case 844:  // lxvd2x (POWER8)
-+        return Some(TrapMachineInsn::Load128);
-+      case 396:  // stxvx (POWER9)
-+      case 972:  // stxvd2x (POWER8)
-+        return Some(TrapMachineInsn::Store128);
-+
-+      // Atomic (load-reserve / store-conditional).
-+      case 20:   // lwarx
-+      case 52:   // lbarx (POWER7+)
-+      case 84:   // ldarx
-+      case 116:  // lharx (POWER7+)
-+        return Some(TrapMachineInsn::Atomic);
-+      default:
-+        break;
-+    }
-+    // stwcx. (XO=150, Rc=1), stdcx. (XO=214, Rc=1), stbcx. (XO=694, Rc=1)
-+    // and sthcx. (XO=726, Rc=1) have bit 0 set. Note xo above already
-+    // discards bit 0, so we need a separate low-11-bit match.
-+    const uint32_t xoRc = insn & 0x7FF;  // bits 10..0
-+    if (xoRc == ((150 << 1) | 1) || xoRc == ((214 << 1) | 1) ||
-+        xoRc == ((694 << 1) | 1) || xoRc == ((726 << 1) | 1)) {
-+      return Some(TrapMachineInsn::Atomic);
-+    }
-+  }
-+
-+  // POWER10 prefixed loads/stores (major opcode 1). The trap-site PC
-+  // points at the prefix word; the actual load/store kind is encoded in
-+  // the suffix word at insnAddr + 4. The 64-byte-boundary rule
-+  // (ensurePrefixedAlignment) guarantees the suffix is in the same block.
-+  if (majorOp == 1) {
-+    const uint32_t suffix = *(uint32_t*)(insnAddr + 4);
-+    const uint32_t suffixOp6 = suffix >> 26;          // 6-bit suffix op
-+    const uint32_t suffixOp5 = suffix >> 27;          // 5-bit suffix op (plxv/pstxv)
-+    switch (suffixOp6) {
-+      case 57:  // pld
-+        return Some(TrapMachineInsn::Load64);
-+      case 50:  // plfd
-+        return Some(TrapMachineInsn::Load64);
-+      case 48:  // plfs
-+        return Some(TrapMachineInsn::Load32);
-+      case 61:  // pstd
-+        return Some(TrapMachineInsn::Store64);
-+      case 54:  // pstfd
-+        return Some(TrapMachineInsn::Store64);
-+      case 52:  // pstfs
-+        return Some(TrapMachineInsn::Store32);
-+      default:
-+        break;
-+    }
-+    if (suffixOp5 == 25) {  // plxv
-+      return Some(TrapMachineInsn::Load128);
-+    }
-+    if (suffixOp5 == 27) {  // pstxv
-+      return Some(TrapMachineInsn::Store128);
-+    }
-+  }
-+
-+  return Nothing();
-+}
-+
- // ================================================================== none ====
- 
- #  elif defined(JS_CODEGEN_NONE)
-diff --git a/js/src/wasm/WasmValue.cpp b/js/src/wasm/WasmValue.cpp
-index fda0996851e1..45fff24fa582 100644
---- a/js/src/wasm/WasmValue.cpp
-+++ b/js/src/wasm/WasmValue.cpp
-@@ -430,7 +430,7 @@ bool ToWebAssemblyValue_i32(JSContext* cx, HandleValue val, int32_t* loc,
-   bool ok = ToInt32(cx, val, loc);
-   if (ok && mustWrite64) {
- #if defined(JS_CODEGEN_MIPS64) || defined(JS_CODEGEN_LOONG64) || \
--    defined(JS_CODEGEN_RISCV64)
-+    defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_PPC64)
-     loc[1] = loc[0] >> 31;
- #else
-     loc[1] = 0;
-diff --git a/mfbt/Assertions.h b/mfbt/Assertions.h
-index a436d019a197..4887af7e7676 100644
---- a/mfbt/Assertions.h
-+++ b/mfbt/Assertions.h
-@@ -282,6 +282,11 @@ static inline void MOZ_CrashSequence(void* aAddress, intptr_t aLine) {
-       "st.d %1,%0,0;\n"  // Write the line number to the crashing address
-       :                  // no output registers
-       : "r"(aAddress), "r"(aLine));
-+#  elif defined(__powerpc64__)
-+  asm volatile(
-+      "std %1,0(%0);\n"  // Write the line number to the crashing address
-+      :                  // no output registers
-+      : "r"(aAddress), "r"(aLine));
- #  else
- #    warning \
-         "Unsupported architecture, replace the code below with assembly suitable to crash the process"
--- 
-2.52.0
-

diff --git a/firefox.spec b/firefox.spec
index be8abaf..06a6900 100644
--- a/firefox.spec
+++ b/firefox.spec
@@ -281,11 +281,6 @@ Patch600:        pgo.patch
 Patch602:        mozilla-1516803.patch
 Patch603:        firefox-gcc-always-inline.patch
 
-# ppc64le JIT
-Patch900:        0001-Add-VSX-instructions-for-SKIA.patch
-Patch901:        0002-Add-VSX-instructions-for-libwebp.patch
-Patch902:        0003-Add-PPC64LE-JIT-backend.patch
-
 
 %if %{?system_nss}
 BuildRequires:  pkgconfig(nspr) >= %{nspr_version}
@@ -606,11 +601,6 @@ cat %{SOURCE49} | sed -e "s|LIBCLANG_RT_PLACEHOLDER|`pwd`/wasi-sdk-30/build/sysr
 %endif
 %patch -P603 -p1 -b .inline
 
-# ppc64le JIT
-%patch -P900 -p1
-%patch -P901 -p1
-%patch -P902 -p1
-
 rm -f .mozconfig
 cp %{SOURCE10} .mozconfig
 echo "ac_add_options --enable-default-toolkit=cairo-gtk3-wayland" >> .mozconfig

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2026-06-16 13:11 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-16 13:11 [rpms/firefox] rawhide: Revert "add ppc64le JIT" 

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox